Fix service status

This commit is contained in:
2025-12-20 02:28:41 +07:00
parent c645171671
commit 243abe55b5
5 changed files with 210 additions and 33 deletions

View File

@@ -804,6 +804,14 @@ export function LobbyPage() {
</select> </select>
</div> </div>
</div> </div>
<div>
<label className="text-xs text-gray-400 mb-1 block">Подсказка для пруфа</label>
<Input
placeholder="Что именно должно быть на скриншоте/видео"
value={editChallenge.proof_hint}
onChange={(e) => setEditChallenge(prev => ({ ...prev, proof_hint: e.target.value }))}
/>
</div>
<div className="flex gap-2"> <div className="flex gap-2">
<NeonButton <NeonButton
size="sm" size="sm"
@@ -852,6 +860,9 @@ export function LobbyPage() {
</div> </div>
<h5 className="text-sm font-medium text-white">{challenge.title}</h5> <h5 className="text-sm font-medium text-white">{challenge.title}</h5>
<p className="text-xs text-gray-400 mt-1">{challenge.description}</p> <p className="text-xs text-gray-400 mt-1">{challenge.description}</p>
{challenge.proof_hint && (
<p className="text-xs text-gray-500 mt-1">Пруф: {challenge.proof_hint}</p>
)}
</div> </div>
{isOrganizer && ( {isOrganizer && (
<div className="flex gap-1 shrink-0"> <div className="flex gap-1 shrink-0">
@@ -1187,6 +1198,14 @@ export function LobbyPage() {
</select> </select>
</div> </div>
</div> </div>
<div>
<label className="text-xs text-gray-400 mb-1 block">Подсказка для пруфа</label>
<Input
placeholder="Что именно должно быть на скриншоте/видео"
value={editChallenge.proof_hint}
onChange={(e) => setEditChallenge(prev => ({ ...prev, proof_hint: e.target.value }))}
/>
</div>
<div className="flex gap-2"> <div className="flex gap-2">
<NeonButton <NeonButton
size="sm" size="sm"
@@ -1232,6 +1251,9 @@ export function LobbyPage() {
</div> </div>
<h4 className="font-medium text-white mb-1">{challenge.title}</h4> <h4 className="font-medium text-white mb-1">{challenge.title}</h4>
<p className="text-sm text-gray-400 mb-2">{challenge.description}</p> <p className="text-sm text-gray-400 mb-2">{challenge.description}</p>
{challenge.proof_hint && (
<p className="text-xs text-gray-500 mb-2">Пруф: {challenge.proof_hint}</p>
)}
{challenge.proposed_by && ( {challenge.proposed_by && (
<p className="text-xs text-gray-500 flex items-center gap-1"> <p className="text-xs text-gray-500 flex items-center gap-1">
<User className="w-3 h-3" /> Предложил: {challenge.proposed_by.nickname} <User className="w-3 h-3" /> Предложил: {challenge.proposed_by.nickname}
@@ -1310,6 +1332,9 @@ export function LobbyPage() {
</div> </div>
<h4 className="font-medium text-white mb-1">{challenge.title}</h4> <h4 className="font-medium text-white mb-1">{challenge.title}</h4>
<p className="text-sm text-gray-400">{challenge.description}</p> <p className="text-sm text-gray-400">{challenge.description}</p>
{challenge.proof_hint && (
<p className="text-xs text-gray-500 mt-1">Пруф: {challenge.proof_hint}</p>
)}
</div> </div>
{challenge.status === 'pending' && ( {challenge.status === 'pending' && (
<button <button

View File

@@ -90,7 +90,7 @@ def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
conn = get_connection() conn = get_connection()
cursor = conn.cursor() cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours) since = datetime.utcnow() - timedelta(hours=hours)
cursor.execute(""" cursor.execute("""
SELECT latency_ms, status, checked_at SELECT latency_ms, status, checked_at
FROM metrics FROM metrics
@@ -116,7 +116,7 @@ def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
conn = get_connection() conn = get_connection()
cursor = conn.cursor() cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours) since = datetime.utcnow() - timedelta(hours=hours)
cursor.execute(""" cursor.execute("""
SELECT COUNT(*) as total, SELECT COUNT(*) as total,
@@ -143,7 +143,7 @@ def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
conn = get_connection() conn = get_connection()
cursor = conn.cursor() cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours) since = datetime.utcnow() - timedelta(hours=hours)
cursor.execute(""" cursor.execute("""
SELECT AVG(latency_ms) as avg_latency SELECT AVG(latency_ms) as avg_latency
FROM metrics FROM metrics
@@ -249,11 +249,11 @@ def get_ssl_info(domain: str) -> Optional[dict]:
return None return None
def cleanup_old_metrics(days: int = 1): def cleanup_old_metrics(hours: int = 24):
"""Delete metrics older than specified days (default: 24 hours).""" """Delete metrics older than specified hours (default: 24 hours)."""
conn = get_connection() conn = get_connection()
cursor = conn.cursor() cursor = conn.cursor()
cutoff = datetime.now() - timedelta(days=days) cutoff = datetime.utcnow() - timedelta(hours=hours)
cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),)) cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),))
deleted = cursor.rowcount deleted = cursor.rowcount
conn.commit() conn.commit()

View File

@@ -9,7 +9,7 @@ from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor from monitors import ServiceMonitor, Status
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
@@ -19,52 +19,91 @@ FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080") BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "600")) # 10 minutes CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "60")) # Normal interval (1 minute)
FAST_CHECK_INTERVAL = int(os.getenv("FAST_CHECK_INTERVAL", "5")) # Fast interval when issues detected
STARTUP_GRACE_PERIOD = int(os.getenv("STARTUP_GRACE_PERIOD", "60")) # Wait before alerting after startup
# Initialize monitor # Initialize monitor
monitor = ServiceMonitor() monitor = ServiceMonitor()
startup_time: Optional[datetime] = None # Track when service started
# Background task reference # Background task reference
background_task: Optional[asyncio.Task] = None background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None cleanup_task: Optional[asyncio.Task] = None
def has_issues() -> bool:
    """Report whether any monitored service is currently DOWN or DEGRADED.

    Every other status, including ``Status.UNKNOWN``, is not counted as an
    issue. Because UNKNOWN is never one of the two problem states, no
    per-service special case (e.g. for an "external" entry that is still
    UNKNOWN) is needed.

    Returns:
        True if at least one entry in ``monitor.services`` has a status of
        ``Status.DOWN`` or ``Status.DEGRADED``; False otherwise (including
        when there are no services).
    """
    problem_states = (Status.DOWN, Status.DEGRADED)
    return any(svc.status in problem_states for svc in monitor.services.values())
async def periodic_health_check():
    """Background task that checks all services forever with adaptive polling.

    Each iteration calls ``monitor.check_all_services`` with the configured
    URLs, then sleeps ``FAST_CHECK_INTERVAL`` seconds when ``has_issues()``
    reports a problem and ``CHECK_INTERVAL`` seconds otherwise. Alerts are
    suppressed while ``is_in_grace_period()`` is true. Any exception raised
    during a check is printed and the loop keeps running.
    """
    while True:
        try:
            # Suppress alerts during startup grace period
            suppress_alerts = is_in_grace_period()
            # is_in_grace_period() is also True while startup_time is still
            # None; guard the arithmetic so the log line cannot raise
            # TypeError and cause the health check itself to be skipped.
            if suppress_alerts and startup_time is not None:
                elapsed = (datetime.now() - startup_time).total_seconds()
                remaining = STARTUP_GRACE_PERIOD - elapsed
                print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)")

            await monitor.check_all_services(
                backend_url=BACKEND_URL,
                frontend_url=FRONTEND_URL,
                bot_url=BOT_URL,
                external_url=EXTERNAL_URL,
                public_url=PUBLIC_URL,
                suppress_alerts=suppress_alerts,
            )
        except Exception as e:
            print(f"Health check error: {e}")

        # Adaptive polling: check more frequently when issues detected
        interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL
        await asyncio.sleep(interval)
async def periodic_cleanup():
    """Background task that prunes old metrics: once right away, then hourly.

    Failures are printed and do not stop the loop.
    """
    pause_seconds = 3600  # wait 1 hour between cleanup passes
    while True:
        try:
            removed = cleanup_old_metrics(hours=24)  # keep only the last 24 hours
            if removed > 0:
                print(f"Cleaned up {removed} old metrics")
        except Exception as err:
            print(f"Cleanup error: {err}")
        await asyncio.sleep(pause_seconds)
def is_in_grace_period() -> bool:
    """Tell whether the startup grace period is still running.

    Returns:
        True when ``startup_time`` has not been set yet, or when fewer than
        ``STARTUP_GRACE_PERIOD`` seconds have passed since it; False otherwise.
    """
    if startup_time is not None:
        seconds_up = (datetime.now() - startup_time).total_seconds()
        return seconds_up < STARTUP_GRACE_PERIOD
    # Startup time not recorded yet: treat as still starting up.
    return True
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
"""Startup and shutdown events.""" """Startup and shutdown events."""
global background_task, cleanup_task global background_task, cleanup_task, startup_time
# Initialize database # Initialize database
init_db() init_db()
print("Database initialized") print("Database initialized")
# Mark startup time
startup_time = datetime.now()
print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)")
# Start background health checks # Start background health checks
background_task = asyncio.create_task(periodic_health_check()) background_task = asyncio.create_task(periodic_health_check())
cleanup_task = asyncio.create_task(periodic_cleanup()) cleanup_task = asyncio.create_task(periodic_cleanup())
@@ -91,12 +130,20 @@ templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse) @app.get("/", response_class=HTMLResponse)
async def status_page(request: Request): async def status_page(request: Request, period: int = 24):
"""Main status page.""" """Main status page."""
services = monitor.get_all_statuses() # Validate period (1, 12, or 24 hours)
if period not in (1, 12, 24):
period = 24
services = monitor.get_all_statuses(period_hours=period)
overall_status = monitor.get_overall_status() overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status() ssl_status = monitor.get_ssl_status()
incidents = get_recent_incidents(limit=5) incidents = get_recent_incidents(limit=5)
fast_mode = has_issues()
current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL
grace_period_active = is_in_grace_period()
grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds()) if startup_time else 0
return templates.TemplateResponse( return templates.TemplateResponse(
"index.html", "index.html",
@@ -107,7 +154,11 @@ async def status_page(request: Request):
"ssl_status": ssl_status, "ssl_status": ssl_status,
"incidents": incidents, "incidents": incidents,
"last_check": monitor.last_check, "last_check": monitor.last_check,
"check_interval": CHECK_INTERVAL "check_interval": current_interval,
"fast_mode": fast_mode,
"grace_period_active": grace_period_active,
"grace_period_remaining": int(grace_period_remaining),
"period": period
} }
) )
@@ -118,13 +169,15 @@ async def api_status():
services = monitor.get_all_statuses() services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status() overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status() ssl_status = monitor.get_ssl_status()
current_interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL
return { return {
"overall_status": overall_status.value, "overall_status": overall_status.value,
"services": {name: status.to_dict() for name, status in services.items()}, "services": {name: status.to_dict() for name, status in services.items()},
"ssl": ssl_status, "ssl": ssl_status,
"last_check": monitor.last_check.isoformat() if monitor.last_check else None, "last_check": monitor.last_check.isoformat() if monitor.last_check else None,
"check_interval_seconds": CHECK_INTERVAL "check_interval_seconds": current_interval,
"fast_mode": has_issues()
} }

View File

@@ -184,7 +184,8 @@ class ServiceMonitor:
self, self,
service_name: str, service_name: str,
result: tuple, result: tuple,
now: datetime now: datetime,
suppress_alerts: bool = False
): ):
"""Process check result with DB persistence and alerting.""" """Process check result with DB persistence and alerting."""
if isinstance(result, Exception): if isinstance(result, Exception):
@@ -221,11 +222,12 @@ class ServiceMonitor:
if stats["total_checks"] > 0: if stats["total_checks"] > 0:
svc.uptime_percent = stats["uptime_percent"] svc.uptime_percent = stats["uptime_percent"]
# Handle incident tracking and alerting # Handle incident tracking and alerting (skip alerts during grace period)
if is_down and not was_down: if is_down and not was_down:
# Service just went down # Service just went down
svc.last_incident = now svc.last_incident = now
incident_id = create_incident(service_name, status.value, message) incident_id = create_incident(service_name, status.value, message)
if not suppress_alerts:
await alert_service_down(service_name, svc.display_name, message) await alert_service_down(service_name, svc.display_name, message)
mark_incident_notified(incident_id) mark_incident_notified(incident_id)
@@ -236,6 +238,7 @@ class ServiceMonitor:
started_at = datetime.fromisoformat(open_incident["started_at"]) started_at = datetime.fromisoformat(open_incident["started_at"])
downtime_minutes = int((now - started_at).total_seconds() / 60) downtime_minutes = int((now - started_at).total_seconds() / 60)
resolve_incident(service_name) resolve_incident(service_name)
if not suppress_alerts:
await alert_service_recovered(service_name, svc.display_name, downtime_minutes) await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
async def check_all_services( async def check_all_services(
@@ -244,7 +247,8 @@ class ServiceMonitor:
frontend_url: str, frontend_url: str,
bot_url: str, bot_url: str,
external_url: str = "", external_url: str = "",
public_url: str = "" public_url: str = "",
suppress_alerts: bool = False
): ):
"""Check all services concurrently.""" """Check all services concurrently."""
now = datetime.now() now = datetime.now()
@@ -262,7 +266,7 @@ class ServiceMonitor:
# Process results # Process results
service_names = ["backend", "database", "frontend", "bot", "external"] service_names = ["backend", "database", "frontend", "bot", "external"]
for i, service_name in enumerate(service_names): for i, service_name in enumerate(service_names):
await self._process_check_result(service_name, results[i], now) await self._process_check_result(service_name, results[i], now, suppress_alerts)
# Check SSL certificate (if public URL is HTTPS) # Check SSL certificate (if public URL is HTTPS)
if public_url and public_url.startswith("https://"): if public_url and public_url.startswith("https://"):
@@ -270,7 +274,15 @@ class ServiceMonitor:
self.last_check = now self.last_check = now
def get_all_statuses(self, period_hours: int = 24) -> dict[str, ServiceStatus]:
    """Refresh each service's historical figures for a time window and return them.

    Args:
        period_hours: Size of the look-back window, in hours, passed to the
            latency-history, average-latency and uptime queries.

    Returns:
        ``self.services`` itself (not a copy), with every entry's
        ``latency_history`` and ``avg_latency_24h`` overwritten in place and
        ``uptime_percent`` overwritten when the window contains checks.
    """
    for service_name, service in self.services.items():
        service.latency_history = get_latency_history(service_name, hours=period_hours)
        # Despite the attribute name, this holds the average for the requested window.
        service.avg_latency_24h = get_avg_latency(service_name, hours=period_hours)

        uptime = get_uptime_stats(service_name, hours=period_hours)
        # NOTE(review): when the window has no checks, the previously stored
        # uptime_percent is left untouched - confirm that is intended.
        if uptime["total_checks"] > 0:
            service.uptime_percent = uptime["uptime_percent"]

    return self.services
def get_overall_status(self) -> Status: def get_overall_status(self) -> Status:

View File

@@ -107,6 +107,32 @@
font-size: 0.9rem; font-size: 0.9rem;
} }
.fast-mode-badge {
display: inline-block;
margin-left: 8px;
padding: 2px 8px;
background: rgba(250, 204, 21, 0.15);
border: 1px solid rgba(250, 204, 21, 0.3);
border-radius: 12px;
color: #facc15;
font-size: 0.8rem;
font-weight: 500;
animation: pulse 2s infinite;
}
.grace-period-badge {
display: inline-block;
margin-left: 8px;
padding: 2px 8px;
background: rgba(59, 130, 246, 0.15);
border: 1px solid rgba(59, 130, 246, 0.3);
border-radius: 12px;
color: #3b82f6;
font-size: 0.8rem;
font-weight: 500;
animation: pulse 2s infinite;
}
.services-grid { .services-grid {
display: grid; display: grid;
gap: 16px; gap: 16px;
@@ -347,6 +373,37 @@
color: #64748b; color: #64748b;
} }
.period-selector {
display: flex;
justify-content: center;
gap: 8px;
margin-bottom: 24px;
}
.period-btn {
padding: 8px 16px;
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.3);
border-radius: 8px;
color: #94a3b8;
font-size: 0.9rem;
font-weight: 500;
cursor: pointer;
transition: all 0.2s ease;
text-decoration: none;
}
.period-btn:hover {
border-color: rgba(0, 212, 255, 0.3);
color: #e2e8f0;
}
.period-btn.active {
background: linear-gradient(135deg, rgba(0, 212, 255, 0.2), rgba(168, 85, 247, 0.2));
border-color: rgba(0, 212, 255, 0.5);
color: #00d4ff;
}
.refresh-btn { .refresh-btn {
display: inline-flex; display: inline-flex;
align-items: center; align-items: center;
@@ -424,9 +481,21 @@
Checking services... Checking services...
{% endif %} {% endif %}
&bull; Auto-refresh every {{ check_interval }}s &bull; Auto-refresh every {{ check_interval }}s
{% if grace_period_active %}
<span class="grace-period-badge">🚀 Startup ({{ grace_period_remaining }}s)</span>
{% elif fast_mode %}
<span class="fast-mode-badge">⚡ Fast Mode</span>
{% endif %}
</p> </p>
</header> </header>
<!-- Period Selector -->
<div class="period-selector">
<a href="?period=1" class="period-btn {% if period == 1 %}active{% endif %}">1 час</a>
<a href="?period=12" class="period-btn {% if period == 12 %}active{% endif %}">12 часов</a>
<a href="?period=24" class="period-btn {% if period == 24 %}active{% endif %}">24 часа</a>
</div>
{% if ssl_status %} {% if ssl_status %}
<div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}"> <div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
<div class="ssl-header"> <div class="ssl-header">
@@ -491,7 +560,7 @@
</div> </div>
</div> </div>
<div class="metric"> <div class="metric">
<div class="metric-label">Avg 24h</div> <div class="metric-label">Avg {{ period }}h</div>
<div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}"> <div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
{% if service.avg_latency_24h %} {% if service.avg_latency_24h %}
{{ "%.0f"|format(service.avg_latency_24h) }} ms {{ "%.0f"|format(service.avg_latency_24h) }} ms
@@ -501,7 +570,7 @@
</div> </div>
</div> </div>
<div class="metric"> <div class="metric">
<div class="metric-label">Uptime 24h</div> <div class="metric-label">Uptime {{ period }}h</div>
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}"> <div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
{{ "%.1f"|format(service.uptime_percent) }}% {{ "%.1f"|format(service.uptime_percent) }}%
</div> </div>
@@ -620,13 +689,29 @@
{% endif %} {% endif %}
{% endfor %} {% endfor %}
// Save scroll position before unload
window.addEventListener('beforeunload', () => {
sessionStorage.setItem('scrollPos', window.scrollY.toString());
});
// Restore scroll position on load
window.addEventListener('load', () => {
const scrollPos = sessionStorage.getItem('scrollPos');
if (scrollPos) {
window.scrollTo(0, parseInt(scrollPos));
}
});
async function refreshStatus(btn) { async function refreshStatus(btn) {
btn.classList.add('loading'); btn.classList.add('loading');
btn.disabled = true; btn.disabled = true;
try { try {
await fetch('/api/refresh', { method: 'POST' }); await fetch('/api/refresh', { method: 'POST' });
window.location.reload(); // Preserve period parameter on reload
const url = new URL(window.location);
url.searchParams.set('period', '{{ period }}');
window.location.href = url.toString();
} catch (e) { } catch (e) {
console.error('Refresh failed:', e); console.error('Refresh failed:', e);
btn.classList.remove('loading'); btn.classList.remove('loading');
@@ -634,9 +719,11 @@
} }
} }
// Auto-refresh page // Auto-refresh page (preserve period parameter)
setTimeout(() => { setTimeout(() => {
window.location.reload(); const url = new URL(window.location);
url.searchParams.set('period', '{{ period }}');
window.location.href = url.toString();
}, {{ check_interval }} * 1000); }, {{ check_interval }} * 1000);
</script> </script>
</body> </body>