228 lines
8.5 KiB
Python
228 lines
8.5 KiB
Python
|
|
import asyncio
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Optional

import httpx
|
||
|
|
|
||
|
|
|
||
|
|
class Status(str, Enum):
    """Health state of a monitored service.

    Members are also ``str`` instances, so a member compares equal to its
    lowercase string value.
    """

    # The service answered its health probe successfully.
    OPERATIONAL = "operational"
    # The service answered, but not with a healthy response.
    DEGRADED = "degraded"
    # The service could not be reached or timed out.
    DOWN = "down"
    # No check result is available.
    UNKNOWN = "unknown"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class ServiceStatus:
    """Latest health snapshot and running uptime counters for one service."""

    name: str
    display_name: str
    status: Status = Status.UNKNOWN
    latency_ms: Optional[float] = None
    last_check: Optional[datetime] = None
    last_incident: Optional[datetime] = None
    uptime_percent: float = 100.0
    message: Optional[str] = None
    version: Optional[str] = None

    # For uptime calculation
    total_checks: int = 0
    successful_checks: int = 0

    def to_dict(self) -> dict:
        """Return a plain-``dict`` view of the snapshot.

        The status is emitted as its string value, timestamps as ISO-8601
        strings, and ``latency_ms`` / ``uptime_percent`` rounded to two
        decimals. Unset optional fields are emitted as ``None``. The uptime
        counters are not included.
        """
        # Compare against None explicitly: a measured latency of 0.0 is a real
        # value and must not be reported as "no measurement".
        latency = None if self.latency_ms is None else round(self.latency_ms, 2)
        last_check = None if self.last_check is None else self.last_check.isoformat()
        last_incident = (
            None if self.last_incident is None else self.last_incident.isoformat()
        )
        return {
            "name": self.name,
            "display_name": self.display_name,
            "status": self.status.value,
            "latency_ms": latency,
            "last_check": last_check,
            "last_incident": last_incident,
            "uptime_percent": round(self.uptime_percent, 2),
            "message": self.message,
            "version": self.version,
        }

    def update_uptime(self, is_success: bool) -> None:
        """Record one check outcome and recompute ``uptime_percent``.

        Args:
            is_success: Whether the check counts as a successful one.
        """
        self.total_checks += 1
        if is_success:
            self.successful_checks += 1
        # True after the increment unless the counter was set negative from
        # outside; kept as a cheap guard against dividing by zero.
        if self.total_checks > 0:
            self.uptime_percent = (self.successful_checks / self.total_checks) * 100
|
||
|
|
|
||
|
|
|
||
|
|
class ServiceMonitor:
    """Track the health of the backend, database, frontend and Telegram bot.

    Each service has a ``ServiceStatus`` entry in ``self.services``.
    ``check_all_services`` probes all four over HTTP and folds the results
    into those entries.
    """

    # Per-request timeout, in seconds, applied to every health probe.
    REQUEST_TIMEOUT = 10.0

    def __init__(self):
        """Create one unchecked ``ServiceStatus`` per monitored service."""
        self.services: dict[str, ServiceStatus] = {
            name: ServiceStatus(name=name, display_name=display_name)
            for name, display_name in (
                ("backend", "Backend API"),
                ("database", "Database"),
                ("frontend", "Frontend"),
                ("bot", "Telegram Bot"),
            )
        }
        # Timestamp of the most recent check_all_services() run.
        self.last_check: Optional[datetime] = None

    @staticmethod
    def _health_url(base_url: str) -> str:
        """Return ``<base_url>/health`` without doubling a trailing slash."""
        return f"{base_url.rstrip('/')}/health"

    async def _timed_get(self, url: str) -> "tuple[httpx.Response, float]":
        """GET ``url`` with a fresh client; return ``(response, latency_ms)``.

        Any exception raised by httpx propagates to the caller.
        """
        async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT) as client:
            # perf_counter is monotonic, so the measurement cannot be skewed
            # (or go negative) when the wall clock is adjusted mid-request.
            started = time.perf_counter()
            response = await client.get(url)
            latency_ms = (time.perf_counter() - started) * 1000
        return response, latency_ms

    async def _check_http(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Probe ``url`` and classify the outcome.

        Returns ``(status, latency_ms, message)``: HTTP 200 is OPERATIONAL,
        any other status code is DEGRADED, and a timeout or any other
        exception is DOWN with no latency.
        """
        try:
            response, latency_ms = await self._timed_get(url)
        except httpx.TimeoutException:
            return Status.DOWN, None, "Timeout"
        except Exception as exc:
            # Message is capped at 100 characters.
            return Status.DOWN, None, str(exc)[:100]

        if response.status_code == 200:
            return Status.OPERATIONAL, latency_ms, None
        return Status.DEGRADED, latency_ms, f"HTTP {response.status_code}"

    async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
        """Check backend API health.

        Args:
            url: Base URL of the backend; ``/health`` is appended.

        Returns:
            ``(status, latency_ms, message, version)``. ``version`` is the
            ``"version"`` field of the JSON body of a 200 response, or
            ``None`` when it is unavailable.
        """
        try:
            response, latency_ms = await self._timed_get(self._health_url(url))
        except httpx.TimeoutException:
            return Status.DOWN, None, "Timeout", None
        except Exception as exc:
            return Status.DOWN, None, str(exc)[:100], None

        if response.status_code != 200:
            return Status.DEGRADED, latency_ms, f"HTTP {response.status_code}", None

        # A 200 whose body is not a JSON object still means the API answered;
        # only the version is unknown. Treating it as DOWN would contradict
        # check_database(), which accepts the same 200 as healthy.
        try:
            payload = response.json()
        except ValueError:
            payload = None
        version = payload.get("version") if isinstance(payload, dict) else None
        return Status.OPERATIONAL, latency_ms, None, version

    async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check database through backend.

        Args:
            backend_url: Base URL of the backend; ``/health`` is appended.

        Returns:
            ``(status, latency_ms, message)``. Anything other than an HTTP 200
            from the backend is reported as DOWN.
        """
        # We check database indirectly - if backend is up, DB is likely up
        # Could add a specific /health/db endpoint to backend later
        try:
            response, latency_ms = await self._timed_get(self._health_url(backend_url))
        except Exception:
            return Status.DOWN, None, "Cannot reach backend"

        if response.status_code == 200:
            return Status.OPERATIONAL, latency_ms, None
        return Status.DOWN, latency_ms, "Backend reports unhealthy"

    async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check frontend availability by requesting ``url`` itself.

        Returns:
            ``(status, latency_ms, message)``.
        """
        return await self._check_http(url)

    async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check Telegram bot health via ``<url>/health``.

        Returns:
            ``(status, latency_ms, message)``.
        """
        return await self._check_http(self._health_url(url))

    def _apply_result(self, svc: ServiceStatus, result, checked_at: datetime) -> None:
        """Fold one check result (or the exception it raised) into ``svc``."""
        if isinstance(result, BaseException):
            # gather(return_exceptions=True) also hands back BaseException
            # subclasses such as CancelledError. Record the failed check
            # instead of leaving a stale status; it is inconclusive, so it
            # does not count toward uptime.
            svc.status = Status.UNKNOWN
            svc.latency_ms = None
            svc.message = f"Check failed: {type(result).__name__}"
            svc.last_check = checked_at
            return

        # check_backend() returns a 4-tuple ending in the version; the other
        # checks return a 3-tuple.
        status, latency_ms, message, *extra = result
        previous = svc.status

        svc.status = status
        svc.latency_ms = latency_ms
        svc.message = message
        if extra:
            svc.version = extra[0]
        svc.last_check = checked_at

        is_healthy = status == Status.OPERATIONAL
        svc.update_uptime(is_healthy)

        # Stamp an incident only when the service enters a new unhealthy
        # state. Staying DOWN or staying DEGRADED is the same ongoing
        # incident, and DOWN -> DEGRADED is a partial recovery.
        if not is_healthy and previous != Status.DOWN and status != previous:
            svc.last_incident = checked_at

    async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
        """Check all services concurrently and update ``self.services``.

        Args:
            backend_url: Base URL used for the backend and database checks.
            frontend_url: URL requested as-is for the frontend check.
            bot_url: Base URL of the bot; ``/health`` is appended.
        """
        now = datetime.now()

        # Run all checks concurrently; results come back in argument order.
        results = await asyncio.gather(
            self.check_backend(backend_url),
            self.check_database(backend_url),
            self.check_frontend(frontend_url),
            self.check_bot(bot_url),
            return_exceptions=True,
        )

        for name, result in zip(("backend", "database", "frontend", "bot"), results):
            self._apply_result(self.services[name], result, now)

        self.last_check = now

    def get_all_statuses(self) -> dict[str, ServiceStatus]:
        """Return the live name -> ``ServiceStatus`` mapping (not a copy)."""
        return self.services

    def get_overall_status(self) -> Status:
        """Get overall system status based on all services.

        OPERATIONAL only when every service is operational; otherwise DOWN if
        any service is down, else DEGRADED if any is degraded, else UNKNOWN.
        An empty service map yields UNKNOWN.
        """
        seen = {svc.status for svc in self.services.values()}
        if not seen:
            # Nothing is monitored, so nothing can be claimed operational.
            return Status.UNKNOWN
        if seen == {Status.OPERATIONAL}:
            return Status.OPERATIONAL
        for severity in (Status.DOWN, Status.DEGRADED):
            if severity in seen:
                return severity
        return Status.UNKNOWN
|