Fix service status

2025-12-20 02:28:41 +07:00
parent c645171671
commit 243abe55b5
5 changed files with 210 additions and 33 deletions

View File

@@ -804,6 +804,14 @@ export function LobbyPage() {
                 </select>
               </div>
             </div>
+            <div>
+              <label className="text-xs text-gray-400 mb-1 block">Подсказка для пруфа</label>
+              <Input
+                placeholder="Что именно должно быть на скриншоте/видео"
+                value={editChallenge.proof_hint}
+                onChange={(e) => setEditChallenge(prev => ({ ...prev, proof_hint: e.target.value }))}
+              />
+            </div>
             <div className="flex gap-2">
               <NeonButton
                 size="sm"
@@ -852,6 +860,9 @@ export function LobbyPage() {
               </div>
               <h5 className="text-sm font-medium text-white">{challenge.title}</h5>
               <p className="text-xs text-gray-400 mt-1">{challenge.description}</p>
+              {challenge.proof_hint && (
+                <p className="text-xs text-gray-500 mt-1">Пруф: {challenge.proof_hint}</p>
+              )}
             </div>
             {isOrganizer && (
               <div className="flex gap-1 shrink-0">
@@ -1187,6 +1198,14 @@ export function LobbyPage() {
                 </select>
               </div>
             </div>
+            <div>
+              <label className="text-xs text-gray-400 mb-1 block">Подсказка для пруфа</label>
+              <Input
+                placeholder="Что именно должно быть на скриншоте/видео"
+                value={editChallenge.proof_hint}
+                onChange={(e) => setEditChallenge(prev => ({ ...prev, proof_hint: e.target.value }))}
+              />
+            </div>
             <div className="flex gap-2">
               <NeonButton
                 size="sm"
@@ -1232,6 +1251,9 @@ export function LobbyPage() {
               </div>
               <h4 className="font-medium text-white mb-1">{challenge.title}</h4>
               <p className="text-sm text-gray-400 mb-2">{challenge.description}</p>
+              {challenge.proof_hint && (
+                <p className="text-xs text-gray-500 mb-2">Пруф: {challenge.proof_hint}</p>
+              )}
               {challenge.proposed_by && (
                 <p className="text-xs text-gray-500 flex items-center gap-1">
                   <User className="w-3 h-3" /> Предложил: {challenge.proposed_by.nickname}
@@ -1310,6 +1332,9 @@ export function LobbyPage() {
               </div>
               <h4 className="font-medium text-white mb-1">{challenge.title}</h4>
               <p className="text-sm text-gray-400">{challenge.description}</p>
+              {challenge.proof_hint && (
+                <p className="text-xs text-gray-500 mt-1">Пруф: {challenge.proof_hint}</p>
+              )}
             </div>
             {challenge.status === 'pending' && (
               <button

View File

@@ -90,7 +90,7 @@ def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
     conn = get_connection()
     cursor = conn.cursor()
-    since = datetime.now() - timedelta(hours=hours)
+    since = datetime.utcnow() - timedelta(hours=hours)
     cursor.execute("""
         SELECT latency_ms, status, checked_at
         FROM metrics
@@ -116,7 +116,7 @@ def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
     conn = get_connection()
     cursor = conn.cursor()
-    since = datetime.now() - timedelta(hours=hours)
+    since = datetime.utcnow() - timedelta(hours=hours)
     cursor.execute("""
         SELECT COUNT(*) as total,
@@ -143,7 +143,7 @@ def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
     conn = get_connection()
     cursor = conn.cursor()
-    since = datetime.now() - timedelta(hours=hours)
+    since = datetime.utcnow() - timedelta(hours=hours)
     cursor.execute("""
         SELECT AVG(latency_ms) as avg_latency
         FROM metrics
@@ -249,11 +249,11 @@ def get_ssl_info(domain: str) -> Optional[dict]:
     return None

-def cleanup_old_metrics(days: int = 1):
-    """Delete metrics older than specified days (default: 24 hours)."""
+def cleanup_old_metrics(hours: int = 24):
+    """Delete metrics older than specified hours (default: 24 hours)."""
     conn = get_connection()
     cursor = conn.cursor()
-    cutoff = datetime.now() - timedelta(days=days)
+    cutoff = datetime.utcnow() - timedelta(hours=hours)
     cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),))
     deleted = cursor.rowcount
     conn.commit()
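
Note on the now() → utcnow() change: checked_at values are compared as ISO-8601 strings, so the cutoff must come from the same clock as the stored timestamps. A minimal sketch of the skew this fixes, assuming the write path also stamps rows with datetime.utcnow() (this commit was made from a UTC+7 host):

from datetime import datetime, timedelta

# Rows are assumed to be stored with naive UTC timestamps:
row_checked_at = datetime.utcnow().isoformat()

# The old cutoff used the local wall clock; on a UTC+7 host it sorts
# ~7 hours ahead of the equivalent UTC instant, so a
# "WHERE checked_at >= ?" query silently dropped the oldest ~7 hours.
old_cutoff = (datetime.now() - timedelta(hours=24)).isoformat()

# The new cutoff is in the same clock as the stored values.
new_cutoff = (datetime.utcnow() - timedelta(hours=24)).isoformat()

print(old_cutoff > new_cutoff)  # True on any host east of UTC

(datetime.utcnow() is deprecated since Python 3.12; datetime.now(timezone.utc) is the current spelling, though it returns aware timestamps that must match the stored format.)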

View File

@@ -9,7 +9,7 @@ from fastapi import FastAPI, Request
 from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
-from monitors import ServiceMonitor
+from monitors import ServiceMonitor, Status
 from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
@@ -19,52 +19,91 @@ FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
 BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
 EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
 PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
-CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "600")) # 10 minutes
+CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "60")) # Normal interval (1 minute)
+FAST_CHECK_INTERVAL = int(os.getenv("FAST_CHECK_INTERVAL", "5")) # Fast interval when issues detected
+STARTUP_GRACE_PERIOD = int(os.getenv("STARTUP_GRACE_PERIOD", "60")) # Wait before alerting after startup

 # Initialize monitor
 monitor = ServiceMonitor()
+startup_time: Optional[datetime] = None # Track when service started

 # Background task reference
 background_task: Optional[asyncio.Task] = None
 cleanup_task: Optional[asyncio.Task] = None

+def has_issues() -> bool:
+    """Check if any monitored service has issues."""
+    for name, svc in monitor.services.items():
+        # Skip external if not configured
+        if name == "external" and svc.status == Status.UNKNOWN:
+            continue
+        if svc.status in (Status.DOWN, Status.DEGRADED):
+            return True
+    return False

 async def periodic_health_check():
-    """Background task to check services periodically."""
+    """Background task to check services periodically with adaptive polling."""
     while True:
         try:
+            # Suppress alerts during startup grace period
+            suppress_alerts = is_in_grace_period()
+            if suppress_alerts:
+                remaining = STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds()
+                print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)")
             await monitor.check_all_services(
                 backend_url=BACKEND_URL,
                 frontend_url=FRONTEND_URL,
                 bot_url=BOT_URL,
                 external_url=EXTERNAL_URL,
-                public_url=PUBLIC_URL
+                public_url=PUBLIC_URL,
+                suppress_alerts=suppress_alerts
             )
         except Exception as e:
             print(f"Health check error: {e}")
-        await asyncio.sleep(CHECK_INTERVAL)
+        # Adaptive polling: check more frequently when issues detected
+        if has_issues():
+            await asyncio.sleep(FAST_CHECK_INTERVAL)
+        else:
+            await asyncio.sleep(CHECK_INTERVAL)

 async def periodic_cleanup():
-    """Background task to cleanup old metrics (hourly)."""
+    """Background task to cleanup old metrics (runs immediately, then hourly)."""
     while True:
-        await asyncio.sleep(3600) # 1 hour
         try:
-            deleted = cleanup_old_metrics(days=1) # Keep only last 24 hours
+            deleted = cleanup_old_metrics(hours=24) # Keep only last 24 hours
             if deleted > 0:
                 print(f"Cleaned up {deleted} old metrics")
         except Exception as e:
             print(f"Cleanup error: {e}")
+        await asyncio.sleep(3600) # Wait 1 hour before next cleanup

+def is_in_grace_period() -> bool:
+    """Check if we're still in startup grace period."""
+    if startup_time is None:
+        return True
+    elapsed = (datetime.now() - startup_time).total_seconds()
+    return elapsed < STARTUP_GRACE_PERIOD

 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Startup and shutdown events."""
-    global background_task, cleanup_task
+    global background_task, cleanup_task, startup_time

     # Initialize database
     init_db()
     print("Database initialized")

+    # Mark startup time
+    startup_time = datetime.now()
+    print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)")

     # Start background health checks
     background_task = asyncio.create_task(periodic_health_check())
     cleanup_task = asyncio.create_task(periodic_cleanup())
@@ -91,12 +130,20 @@ templates = Jinja2Templates(directory="templates")
 @app.get("/", response_class=HTMLResponse)
-async def status_page(request: Request):
+async def status_page(request: Request, period: int = 24):
     """Main status page."""
-    services = monitor.get_all_statuses()
+    # Validate period (1, 12, or 24 hours)
+    if period not in (1, 12, 24):
+        period = 24
+    services = monitor.get_all_statuses(period_hours=period)
     overall_status = monitor.get_overall_status()
     ssl_status = monitor.get_ssl_status()
     incidents = get_recent_incidents(limit=5)
+    fast_mode = has_issues()
+    current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL
+    grace_period_active = is_in_grace_period()
+    grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds()) if startup_time else 0

     return templates.TemplateResponse(
         "index.html",
@@ -107,7 +154,11 @@ async def status_page(request: Request):
             "ssl_status": ssl_status,
             "incidents": incidents,
             "last_check": monitor.last_check,
-            "check_interval": CHECK_INTERVAL
+            "check_interval": current_interval,
+            "fast_mode": fast_mode,
+            "grace_period_active": grace_period_active,
+            "grace_period_remaining": int(grace_period_remaining),
+            "period": period
         }
     )
@@ -118,13 +169,15 @@ async def api_status():
     services = monitor.get_all_statuses()
     overall_status = monitor.get_overall_status()
     ssl_status = monitor.get_ssl_status()
+    current_interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL

     return {
         "overall_status": overall_status.value,
         "services": {name: status.to_dict() for name, status in services.items()},
         "ssl": ssl_status,
         "last_check": monitor.last_check.isoformat() if monitor.last_check else None,
-        "check_interval_seconds": CHECK_INTERVAL
+        "check_interval_seconds": current_interval,
+        "fast_mode": has_issues()
     }
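
The adaptive polling added here reduces to a small pattern — a minimal sketch, not the full service; check and has_issues stand in for monitor.check_all_services and the has_issues() helper above:

import asyncio
from typing import Awaitable, Callable

CHECK_INTERVAL = 60      # normal cadence, seconds
FAST_CHECK_INTERVAL = 5  # cadence while any service is DOWN/DEGRADED

async def poll_loop(check: Callable[[], Awaitable[None]],
                    has_issues: Callable[[], bool]) -> None:
    """Run checks forever, tightening the interval while issues persist."""
    while True:
        try:
            await check()
        except Exception as e:
            # Never let one failed pass kill the loop
            print(f"Health check error: {e}")
        # Re-evaluated on every pass, so the loop relaxes back to the
        # normal cadence as soon as all services recover.
        delay = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL
        await asyncio.sleep(delay)

Sleeping after the work rather than before is the same reordering periodic_cleanup received: the first pass runs immediately at startup instead of one interval later.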

View File

@@ -184,7 +184,8 @@ class ServiceMonitor:
         self,
         service_name: str,
         result: tuple,
-        now: datetime
+        now: datetime,
+        suppress_alerts: bool = False
     ):
         """Process check result with DB persistence and alerting."""
         if isinstance(result, Exception):
@@ -221,11 +222,12 @@ class ServiceMonitor:
         if stats["total_checks"] > 0:
             svc.uptime_percent = stats["uptime_percent"]

-        # Handle incident tracking and alerting
+        # Handle incident tracking and alerting (skip alerts during grace period)
         if is_down and not was_down:
             # Service just went down
             svc.last_incident = now
             incident_id = create_incident(service_name, status.value, message)
+            if not suppress_alerts:
                 await alert_service_down(service_name, svc.display_name, message)
                 mark_incident_notified(incident_id)
@@ -236,6 +238,7 @@ class ServiceMonitor:
             started_at = datetime.fromisoformat(open_incident["started_at"])
             downtime_minutes = int((now - started_at).total_seconds() / 60)
             resolve_incident(service_name)
+            if not suppress_alerts:
                 await alert_service_recovered(service_name, svc.display_name, downtime_minutes)

     async def check_all_services(
@@ -244,7 +247,8 @@ class ServiceMonitor:
         frontend_url: str,
         bot_url: str,
         external_url: str = "",
-        public_url: str = ""
+        public_url: str = "",
+        suppress_alerts: bool = False
     ):
         """Check all services concurrently."""
         now = datetime.now()
@@ -262,7 +266,7 @@ class ServiceMonitor:
         # Process results
         service_names = ["backend", "database", "frontend", "bot", "external"]
         for i, service_name in enumerate(service_names):
-            await self._process_check_result(service_name, results[i], now)
+            await self._process_check_result(service_name, results[i], now, suppress_alerts)

         # Check SSL certificate (if public URL is HTTPS)
         if public_url and public_url.startswith("https://"):
@@ -270,7 +274,15 @@ class ServiceMonitor:
         self.last_check = now

-    def get_all_statuses(self) -> dict[str, ServiceStatus]:
+    def get_all_statuses(self, period_hours: int = 24) -> dict[str, ServiceStatus]:
+        """Get all service statuses with data for specified period."""
+        # Update historical data for requested period
+        for name, svc in self.services.items():
+            svc.latency_history = get_latency_history(name, hours=period_hours)
+            svc.avg_latency_24h = get_avg_latency(name, hours=period_hours)
+            stats = get_uptime_stats(name, hours=period_hours)
+            if stats["total_checks"] > 0:
+                svc.uptime_percent = stats["uptime_percent"]
         return self.services

     def get_overall_status(self) -> Status:
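
How the grace period threads through: periodic_health_check computes one suppress_alerts flag per pass, and check_all_services forwards it to every _process_check_result call, so incidents are still recorded in the DB during startup but no notifications fire. A minimal usage sketch (the backend URL here is hypothetical; only the frontend and bot defaults appear in this diff):

# Inside an async context; `monitor` is the module-level ServiceMonitor.
await monitor.check_all_services(
    backend_url="http://backend:8000",     # hypothetical default
    frontend_url="http://frontend:80",
    bot_url="http://bot:8080",
    suppress_alerts=is_in_grace_period(),  # True for the first STARTUP_GRACE_PERIOD seconds
)

One side effect worth noting: incidents created while suppressed never reach mark_incident_notified, so they stay flagged as un-notified in the database.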

View File

@@ -107,6 +107,32 @@
         font-size: 0.9rem;
     }
+
+    .fast-mode-badge {
+        display: inline-block;
+        margin-left: 8px;
+        padding: 2px 8px;
+        background: rgba(250, 204, 21, 0.15);
+        border: 1px solid rgba(250, 204, 21, 0.3);
+        border-radius: 12px;
+        color: #facc15;
+        font-size: 0.8rem;
+        font-weight: 500;
+        animation: pulse 2s infinite;
+    }
+
+    .grace-period-badge {
+        display: inline-block;
+        margin-left: 8px;
+        padding: 2px 8px;
+        background: rgba(59, 130, 246, 0.15);
+        border: 1px solid rgba(59, 130, 246, 0.3);
+        border-radius: 12px;
+        color: #3b82f6;
+        font-size: 0.8rem;
+        font-weight: 500;
+        animation: pulse 2s infinite;
+    }

     .services-grid {
         display: grid;
         gap: 16px;
@@ -347,6 +373,37 @@
         color: #64748b;
     }
+
+    .period-selector {
+        display: flex;
+        justify-content: center;
+        gap: 8px;
+        margin-bottom: 24px;
+    }
+
+    .period-btn {
+        padding: 8px 16px;
+        background: rgba(30, 41, 59, 0.5);
+        border: 1px solid rgba(100, 116, 139, 0.3);
+        border-radius: 8px;
+        color: #94a3b8;
+        font-size: 0.9rem;
+        font-weight: 500;
+        cursor: pointer;
+        transition: all 0.2s ease;
+        text-decoration: none;
+    }
+
+    .period-btn:hover {
+        border-color: rgba(0, 212, 255, 0.3);
+        color: #e2e8f0;
+    }
+
+    .period-btn.active {
+        background: linear-gradient(135deg, rgba(0, 212, 255, 0.2), rgba(168, 85, 247, 0.2));
+        border-color: rgba(0, 212, 255, 0.5);
+        color: #00d4ff;
+    }

     .refresh-btn {
         display: inline-flex;
         align-items: center;
@@ -424,9 +481,21 @@
                     Checking services...
                 {% endif %}
                 &bull; Auto-refresh every {{ check_interval }}s
+                {% if grace_period_active %}
+                    <span class="grace-period-badge">🚀 Startup ({{ grace_period_remaining }}s)</span>
+                {% elif fast_mode %}
+                    <span class="fast-mode-badge">⚡ Fast Mode</span>
+                {% endif %}
             </p>
         </header>

+        <!-- Period Selector -->
+        <div class="period-selector">
+            <a href="?period=1" class="period-btn {% if period == 1 %}active{% endif %}">1 час</a>
+            <a href="?period=12" class="period-btn {% if period == 12 %}active{% endif %}">12 часов</a>
+            <a href="?period=24" class="period-btn {% if period == 24 %}active{% endif %}">24 часа</a>
+        </div>

         {% if ssl_status %}
         <div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
             <div class="ssl-header">
@@ -491,7 +560,7 @@
                 </div>
             </div>
             <div class="metric">
-                <div class="metric-label">Avg 24h</div>
+                <div class="metric-label">Avg {{ period }}h</div>
                 <div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
                     {% if service.avg_latency_24h %}
                         {{ "%.0f"|format(service.avg_latency_24h) }} ms
@@ -501,7 +570,7 @@
                 </div>
             </div>
             <div class="metric">
-                <div class="metric-label">Uptime 24h</div>
+                <div class="metric-label">Uptime {{ period }}h</div>
                 <div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
                     {{ "%.1f"|format(service.uptime_percent) }}%
                 </div>
@@ -620,13 +689,29 @@
         {% endif %}
         {% endfor %}

+        // Save scroll position before unload
+        window.addEventListener('beforeunload', () => {
+            sessionStorage.setItem('scrollPos', window.scrollY.toString());
+        });
+
+        // Restore scroll position on load
+        window.addEventListener('load', () => {
+            const scrollPos = sessionStorage.getItem('scrollPos');
+            if (scrollPos) {
+                window.scrollTo(0, parseInt(scrollPos));
+            }
+        });

         async function refreshStatus(btn) {
             btn.classList.add('loading');
             btn.disabled = true;
             try {
                 await fetch('/api/refresh', { method: 'POST' });
-                window.location.reload();
+                // Preserve period parameter on reload
+                const url = new URL(window.location);
+                url.searchParams.set('period', '{{ period }}');
+                window.location.href = url.toString();
             } catch (e) {
                 console.error('Refresh failed:', e);
                 btn.classList.remove('loading');
@@ -634,9 +719,11 @@
             }
         }

-        // Auto-refresh page
+        // Auto-refresh page (preserve period parameter)
         setTimeout(() => {
-            window.location.reload();
+            const url = new URL(window.location);
+            url.searchParams.set('period', '{{ period }}');
+            window.location.href = url.toString();
         }, {{ check_interval }} * 1000);
     </script>
</body>