From 57bad3b4a8ae68d99dd5139c8218875451d7a901 Mon Sep 17 00:00:00 2001 From: Oronemu Date: Thu, 18 Dec 2025 03:35:13 +0700 Subject: [PATCH] Redesign health service + create backup service --- .env.example | 9 + Makefile | 23 +++ backup-service/Dockerfile | 30 +++ backup-service/backup.py | 217 ++++++++++++++++++++++ backup-service/config.py | 33 ++++ backup-service/crontab | 4 + backup-service/requirements.txt | 2 + backup-service/restore.py | 158 ++++++++++++++++ docker-compose.yml | 32 ++++ status-service/Dockerfile | 3 + status-service/alerts.py | 85 +++++++++ status-service/database.py | 261 ++++++++++++++++++++++++++ status-service/main.py | 104 ++++++++--- status-service/monitors.py | 208 ++++++++++++++------- status-service/ssl_monitor.py | 140 ++++++++++++++ status-service/templates/index.html | 275 +++++++++++++++++++++++++++- 16 files changed, 1486 insertions(+), 98 deletions(-) create mode 100644 backup-service/Dockerfile create mode 100644 backup-service/backup.py create mode 100644 backup-service/config.py create mode 100644 backup-service/crontab create mode 100644 backup-service/requirements.txt create mode 100644 backup-service/restore.py create mode 100644 status-service/alerts.py create mode 100644 status-service/database.py create mode 100644 status-service/ssl_monitor.py diff --git a/.env.example b/.env.example index 2371084..0d9095b 100644 --- a/.env.example +++ b/.env.example @@ -20,5 +20,14 @@ S3_SECRET_ACCESS_KEY=your-secret-access-key S3_ENDPOINT_URL=https://s3.firstvds.ru S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru +# Backup Service +TELEGRAM_ADMIN_ID=947392854 +S3_BACKUP_PREFIX=backups/ +BACKUP_RETENTION_DAYS=14 + +# Status Service (optional - for external monitoring) +EXTERNAL_URL=https://your-domain.com +PUBLIC_URL=https://your-domain.com + # Frontend (for build) VITE_API_URL=/api/v1 diff --git a/Makefile b/Makefile index 12ef5e7..8cc670d 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,12 @@ help: @echo " make shell - Open backend shell" @echo " make frontend-sh - Open frontend shell" @echo "" + @echo " Backup:" + @echo " make backup-now - Run backup immediately" + @echo " make backup-list - List available backups in S3" + @echo " make backup-restore - Restore from backup (interactive)" + @echo " make backup-logs - Show backup service logs" + @echo "" @echo " Cleanup:" @echo " make clean - Stop and remove containers, volumes" @echo " make prune - Remove unused Docker resources" @@ -137,3 +143,20 @@ test-backend: # Production prod: $(DC) -f docker-compose.yml up -d --build + +# Backup +backup-now: + $(DC) exec backup python /app/backup.py + +backup-list: + $(DC) exec backup python /app/restore.py + +backup-restore: + @read -p "Backup filename: " file; \ + $(DC) exec -it backup python /app/restore.py "$$file" + +backup-logs: + $(DC) logs -f backup + +backup-shell: + $(DC) exec backup bash diff --git a/backup-service/Dockerfile b/backup-service/Dockerfile new file mode 100644 index 0000000..09ab093 --- /dev/null +++ b/backup-service/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install PostgreSQL client (for pg_dump and psql) and cron +RUN apt-get update && apt-get install -y \ + postgresql-client \ + cron \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
+ +# Make scripts executable +RUN chmod +x backup.py restore.py + +# Setup cron +COPY crontab /etc/cron.d/backup-cron +RUN chmod 0644 /etc/cron.d/backup-cron +RUN crontab /etc/cron.d/backup-cron + +# Create log file +RUN touch /var/log/cron.log + +# Start cron in foreground and tail logs +CMD ["sh", "-c", "printenv > /etc/environment && cron && tail -f /var/log/cron.log"] diff --git a/backup-service/backup.py b/backup-service/backup.py new file mode 100644 index 0000000..075a4ca --- /dev/null +++ b/backup-service/backup.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +PostgreSQL Backup Service for WebApp. + +- Creates pg_dump backup +- Compresses with gzip +- Uploads to S3 FirstVDS +- Rotates old backups (configurable retention) +- Sends Telegram notifications +""" +import gzip +import os +import subprocess +import sys +from datetime import datetime, timedelta, timezone + +import boto3 +import httpx +from botocore.config import Config as BotoConfig +from botocore.exceptions import ClientError + +from config import config + + +def create_s3_client(): + """Initialize S3 client (same pattern as backend storage.py).""" + return boto3.client( + "s3", + endpoint_url=config.S3_ENDPOINT_URL, + aws_access_key_id=config.S3_ACCESS_KEY_ID, + aws_secret_access_key=config.S3_SECRET_ACCESS_KEY, + region_name=config.S3_REGION or "us-east-1", + config=BotoConfig(signature_version="s3v4"), + ) + + +def send_telegram_notification(message: str, is_error: bool = False) -> None: + """Send notification to Telegram admin.""" + if not config.TELEGRAM_BOT_TOKEN or not config.TELEGRAM_ADMIN_ID: + print("Telegram not configured, skipping notification") + return + + emoji = "\u274c" if is_error else "\u2705" + text = f"{emoji} *Database Backup*\n\n{message}" + + url = f"https://api.telegram.org/bot{config.TELEGRAM_BOT_TOKEN}/sendMessage" + data = { + "chat_id": config.TELEGRAM_ADMIN_ID, + "text": text, + "parse_mode": "Markdown", + } + + try: + response = httpx.post(url, json=data, timeout=30) + response.raise_for_status() + print("Telegram notification sent") + except Exception as e: + print(f"Failed to send Telegram notification: {e}") + + +def create_backup() -> tuple[str, bytes]: + """Create pg_dump backup and compress it.""" + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + filename = f"marathon_backup_{timestamp}.sql.gz" + + # Build pg_dump command + env = os.environ.copy() + env["PGPASSWORD"] = config.DB_PASSWORD + + cmd = [ + "pg_dump", + "-h", + config.DB_HOST, + "-p", + config.DB_PORT, + "-U", + config.DB_USER, + "-d", + config.DB_NAME, + "--no-owner", + "--no-acl", + "-F", + "p", # plain SQL format + ] + + print(f"Running pg_dump for database {config.DB_NAME}...") + result = subprocess.run( + cmd, + env=env, + capture_output=True, + ) + + if result.returncode != 0: + raise Exception(f"pg_dump failed: {result.stderr.decode()}") + + # Compress the output + print("Compressing backup...") + compressed = gzip.compress(result.stdout, compresslevel=9) + + return filename, compressed + + +def upload_to_s3(s3_client, filename: str, data: bytes) -> str: + """Upload backup to S3.""" + key = f"{config.S3_BACKUP_PREFIX}{filename}" + + print(f"Uploading to S3: {key}...") + s3_client.put_object( + Bucket=config.S3_BUCKET_NAME, + Key=key, + Body=data, + ContentType="application/gzip", + ) + + return key + + +def rotate_old_backups(s3_client) -> int: + """Delete backups older than BACKUP_RETENTION_DAYS.""" + cutoff_date = datetime.now(timezone.utc) - timedelta( + days=config.BACKUP_RETENTION_DAYS + ) + 
deleted_count = 0 + + print(f"Rotating backups older than {config.BACKUP_RETENTION_DAYS} days...") + + # List all objects with backup prefix + try: + paginator = s3_client.get_paginator("list_objects_v2") + pages = paginator.paginate( + Bucket=config.S3_BUCKET_NAME, + Prefix=config.S3_BACKUP_PREFIX, + ) + + for page in pages: + for obj in page.get("Contents", []): + last_modified = obj["LastModified"] + if last_modified.tzinfo is None: + last_modified = last_modified.replace(tzinfo=timezone.utc) + + if last_modified < cutoff_date: + s3_client.delete_object( + Bucket=config.S3_BUCKET_NAME, + Key=obj["Key"], + ) + deleted_count += 1 + print(f"Deleted old backup: {obj['Key']}") + except ClientError as e: + print(f"Error during rotation: {e}") + + return deleted_count + + +def main() -> int: + """Main backup routine.""" + start_time = datetime.now() + + print(f"{'=' * 50}") + print(f"Backup started at {start_time}") + print(f"{'=' * 50}") + + try: + # Validate configuration + if not config.S3_BUCKET_NAME: + raise Exception("S3_BUCKET_NAME is not configured") + if not config.S3_ACCESS_KEY_ID: + raise Exception("S3_ACCESS_KEY_ID is not configured") + if not config.S3_SECRET_ACCESS_KEY: + raise Exception("S3_SECRET_ACCESS_KEY is not configured") + if not config.S3_ENDPOINT_URL: + raise Exception("S3_ENDPOINT_URL is not configured") + + # Create S3 client + s3_client = create_s3_client() + + # Create backup + filename, data = create_backup() + size_mb = len(data) / (1024 * 1024) + print(f"Backup created: {filename} ({size_mb:.2f} MB)") + + # Upload to S3 + s3_key = upload_to_s3(s3_client, filename, data) + print(f"Uploaded to S3: {s3_key}") + + # Rotate old backups + deleted_count = rotate_old_backups(s3_client) + print(f"Deleted {deleted_count} old backups") + + # Calculate duration + duration = datetime.now() - start_time + + # Send success notification + message = ( + f"Backup completed successfully!\n\n" + f"*File:* `{filename}`\n" + f"*Size:* {size_mb:.2f} MB\n" + f"*Duration:* {duration.seconds}s\n" + f"*Deleted old:* {deleted_count} files" + ) + send_telegram_notification(message, is_error=False) + + print(f"{'=' * 50}") + print("Backup completed successfully!") + print(f"{'=' * 50}") + return 0 + + except Exception as e: + error_msg = f"Backup failed!\n\n*Error:* `{str(e)}`" + send_telegram_notification(error_msg, is_error=True) + print(f"{'=' * 50}") + print(f"Backup failed: {e}") + print(f"{'=' * 50}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/backup-service/config.py b/backup-service/config.py new file mode 100644 index 0000000..594e465 --- /dev/null +++ b/backup-service/config.py @@ -0,0 +1,33 @@ +"""Configuration for backup service.""" +import os +from dataclasses import dataclass + + +@dataclass +class Config: + """Backup service configuration from environment variables.""" + + # Database + DB_HOST: str = os.getenv("DB_HOST", "db") + DB_PORT: str = os.getenv("DB_PORT", "5432") + DB_NAME: str = os.getenv("DB_NAME", "marathon") + DB_USER: str = os.getenv("DB_USER", "marathon") + DB_PASSWORD: str = os.getenv("DB_PASSWORD", "123") + + # S3 + S3_BUCKET_NAME: str = os.getenv("S3_BUCKET_NAME", "") + S3_REGION: str = os.getenv("S3_REGION", "ru-1") + S3_ACCESS_KEY_ID: str = os.getenv("S3_ACCESS_KEY_ID", "") + S3_SECRET_ACCESS_KEY: str = os.getenv("S3_SECRET_ACCESS_KEY", "") + S3_ENDPOINT_URL: str = os.getenv("S3_ENDPOINT_URL", "") + S3_BACKUP_PREFIX: str = os.getenv("S3_BACKUP_PREFIX", "backups/") + + # Telegram + TELEGRAM_BOT_TOKEN: str = 
os.getenv("TELEGRAM_BOT_TOKEN", "") + TELEGRAM_ADMIN_ID: str = os.getenv("TELEGRAM_ADMIN_ID", "947392854") + + # Backup settings + BACKUP_RETENTION_DAYS: int = int(os.getenv("BACKUP_RETENTION_DAYS", "14")) + + +config = Config() diff --git a/backup-service/crontab b/backup-service/crontab new file mode 100644 index 0000000..de00c64 --- /dev/null +++ b/backup-service/crontab @@ -0,0 +1,4 @@ +# Backup cron job +# Run backup daily at 3:00 AM UTC +0 3 * * * /usr/local/bin/python /app/backup.py >> /var/log/cron.log 2>&1 +# Empty line required at end of crontab diff --git a/backup-service/requirements.txt b/backup-service/requirements.txt new file mode 100644 index 0000000..644e4ae --- /dev/null +++ b/backup-service/requirements.txt @@ -0,0 +1,2 @@ +boto3==1.34.0 +httpx==0.26.0 diff --git a/backup-service/restore.py b/backup-service/restore.py new file mode 100644 index 0000000..23f48fb --- /dev/null +++ b/backup-service/restore.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +Restore PostgreSQL database from S3 backup. + +Usage: + python restore.py - List available backups + python restore.py - Restore from specific backup +""" +import gzip +import os +import subprocess +import sys + +import boto3 +from botocore.config import Config as BotoConfig +from botocore.exceptions import ClientError + +from config import config + + +def create_s3_client(): + """Initialize S3 client.""" + return boto3.client( + "s3", + endpoint_url=config.S3_ENDPOINT_URL, + aws_access_key_id=config.S3_ACCESS_KEY_ID, + aws_secret_access_key=config.S3_SECRET_ACCESS_KEY, + region_name=config.S3_REGION or "us-east-1", + config=BotoConfig(signature_version="s3v4"), + ) + + +def list_backups(s3_client) -> list[tuple[str, float, str]]: + """List all available backups.""" + print("Available backups:\n") + + try: + paginator = s3_client.get_paginator("list_objects_v2") + pages = paginator.paginate( + Bucket=config.S3_BUCKET_NAME, + Prefix=config.S3_BACKUP_PREFIX, + ) + + backups = [] + for page in pages: + for obj in page.get("Contents", []): + filename = obj["Key"].replace(config.S3_BACKUP_PREFIX, "") + size_mb = obj["Size"] / (1024 * 1024) + modified = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S") + backups.append((filename, size_mb, modified)) + + # Sort by date descending (newest first) + backups.sort(key=lambda x: x[2], reverse=True) + + for filename, size_mb, modified in backups: + print(f" {filename} ({size_mb:.2f} MB) - {modified}") + + return backups + + except ClientError as e: + print(f"Error listing backups: {e}") + return [] + + +def restore_backup(s3_client, filename: str) -> None: + """Download and restore backup.""" + key = f"{config.S3_BACKUP_PREFIX}{filename}" + + print(f"Downloading {filename} from S3...") + try: + response = s3_client.get_object( + Bucket=config.S3_BUCKET_NAME, + Key=key, + ) + compressed_data = response["Body"].read() + except ClientError as e: + raise Exception(f"Failed to download backup: {e}") + + print("Decompressing...") + sql_data = gzip.decompress(compressed_data) + + print(f"Restoring to database {config.DB_NAME}...") + + # Build psql command + env = os.environ.copy() + env["PGPASSWORD"] = config.DB_PASSWORD + + cmd = [ + "psql", + "-h", + config.DB_HOST, + "-p", + config.DB_PORT, + "-U", + config.DB_USER, + "-d", + config.DB_NAME, + ] + + result = subprocess.run( + cmd, + env=env, + input=sql_data, + capture_output=True, + ) + + if result.returncode != 0: + stderr = result.stderr.decode() + # psql may return warnings that aren't fatal errors + if "ERROR" in stderr: + raise 
Exception(f"psql restore failed: {stderr}") + else: + print(f"Warnings: {stderr}") + + print("Restore completed successfully!") + + +def main() -> int: + """Main restore routine.""" + # Validate configuration + if not config.S3_BUCKET_NAME: + print("Error: S3_BUCKET_NAME is not configured") + return 1 + + s3_client = create_s3_client() + + if len(sys.argv) < 2: + # List available backups + backups = list_backups(s3_client) + if backups: + print(f"\nTo restore, run: python restore.py ") + else: + print("No backups found.") + return 0 + + filename = sys.argv[1] + + # Confirm restore + print(f"WARNING: This will restore database from {filename}") + print("This may overwrite existing data!") + print() + + confirm = input("Type 'yes' to continue: ") + + if confirm.lower() != "yes": + print("Restore cancelled.") + return 0 + + try: + restore_backup(s3_client, filename) + return 0 + except Exception as e: + print(f"Restore failed: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docker-compose.yml b/docker-compose.yml index c733bd4..e5d98f7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -94,7 +94,13 @@ services: BACKEND_URL: http://backend:8000 FRONTEND_URL: http://frontend:80 BOT_URL: http://bot:8080 + EXTERNAL_URL: ${EXTERNAL_URL:-} + PUBLIC_URL: ${PUBLIC_URL:-} CHECK_INTERVAL: "30" + TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN} + TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854} + volumes: + - status_data:/app/data ports: - "8001:8001" depends_on: @@ -103,5 +109,31 @@ services: - bot restart: unless-stopped + backup: + build: + context: ./backup-service + dockerfile: Dockerfile + container_name: marathon-backup + environment: + DB_HOST: db + DB_PORT: "5432" + DB_NAME: marathon + DB_USER: marathon + DB_PASSWORD: ${DB_PASSWORD:-marathon} + S3_BUCKET_NAME: ${S3_BUCKET_NAME:-} + S3_REGION: ${S3_REGION:-ru-1} + S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID:-} + S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY:-} + S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-} + S3_BACKUP_PREFIX: ${S3_BACKUP_PREFIX:-backups/} + TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN} + TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854} + BACKUP_RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14} + depends_on: + db: + condition: service_healthy + restart: unless-stopped + volumes: postgres_data: + status_data: diff --git a/status-service/Dockerfile b/status-service/Dockerfile index 943ecff..8b78fb0 100644 --- a/status-service/Dockerfile +++ b/status-service/Dockerfile @@ -6,6 +6,9 @@ WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +# Create data directory for SQLite +RUN mkdir -p /app/data + # Copy application COPY . . 
diff --git a/status-service/alerts.py b/status-service/alerts.py new file mode 100644 index 0000000..a795276 --- /dev/null +++ b/status-service/alerts.py @@ -0,0 +1,85 @@ +"""Telegram alerting for status changes.""" +import os +from datetime import datetime +from typing import Optional + +import httpx + + +TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "") +TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "") + + +async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool: + """Send alert to Telegram.""" + if not TELEGRAM_BOT_TOKEN or not TELEGRAM_ADMIN_ID: + print("Telegram alerting not configured") + return False + + emoji = "\u2705" if is_recovery else "\u26a0\ufe0f" + text = f"{emoji} *Status Alert*\n\n{message}" + + url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage" + data = { + "chat_id": TELEGRAM_ADMIN_ID, + "text": text, + "parse_mode": "Markdown", + } + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + response = await client.post(url, json=data) + response.raise_for_status() + print(f"Telegram alert sent: {message[:50]}...") + return True + except Exception as e: + print(f"Failed to send Telegram alert: {e}") + return False + + +async def alert_service_down(service_name: str, display_name: str, message: Optional[str]): + """Alert when service goes down.""" + now = datetime.now().strftime("%d.%m.%Y %H:%M:%S") + text = ( + f"*{display_name}* is DOWN\n\n" + f"Time: `{now}`\n" + ) + if message: + text += f"Error: `{message}`" + + await send_telegram_alert(text, is_recovery=False) + + +async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int): + """Alert when service recovers.""" + now = datetime.now().strftime("%d.%m.%Y %H:%M:%S") + text = ( + f"*{display_name}* is back ONLINE\n\n" + f"Time: `{now}`\n" + f"Downtime: `{downtime_minutes} min`" + ) + + await send_telegram_alert(text, is_recovery=True) + + +async def alert_ssl_expiring(domain: str, days_left: int): + """Alert when SSL certificate is expiring soon.""" + text = ( + f"*SSL Certificate Expiring*\n\n" + f"Domain: `{domain}`\n" + f"Days left: `{days_left}`\n\n" + f"Please renew the certificate!" + ) + + await send_telegram_alert(text, is_recovery=False) + + +async def alert_ssl_expired(domain: str): + """Alert when SSL certificate has expired.""" + text = ( + f"*SSL Certificate EXPIRED*\n\n" + f"Domain: `{domain}`\n\n" + f"Certificate has expired! Site may show security warnings." 
+ ) + + await send_telegram_alert(text, is_recovery=False) diff --git a/status-service/database.py b/status-service/database.py new file mode 100644 index 0000000..ed1013c --- /dev/null +++ b/status-service/database.py @@ -0,0 +1,261 @@ +"""SQLite database for storing metrics history.""" +import sqlite3 +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional +import json + + +DB_PATH = Path("/app/data/metrics.db") + + +def get_connection() -> sqlite3.Connection: + """Get database connection.""" + DB_PATH.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(DB_PATH)) + conn.row_factory = sqlite3.Row + return conn + + +def init_db(): + """Initialize database tables.""" + conn = get_connection() + cursor = conn.cursor() + + # Metrics history table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS metrics ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_name TEXT NOT NULL, + status TEXT NOT NULL, + latency_ms REAL, + message TEXT, + checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Incidents table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS incidents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_name TEXT NOT NULL, + status TEXT NOT NULL, + message TEXT, + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + resolved_at TIMESTAMP, + notified BOOLEAN DEFAULT FALSE + ) + """) + + # SSL certificates table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS ssl_certificates ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + domain TEXT NOT NULL UNIQUE, + issuer TEXT, + expires_at TIMESTAMP, + days_until_expiry INTEGER, + checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Create indexes + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_metrics_service_time + ON metrics(service_name, checked_at DESC) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_incidents_service + ON incidents(service_name, started_at DESC) + """) + + conn.commit() + conn.close() + + +def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]): + """Save a metric record.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute( + "INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)", + (service_name, status, latency_ms, message) + ) + conn.commit() + conn.close() + + +def get_latency_history(service_name: str, hours: int = 24) -> list[dict]: + """Get latency history for a service.""" + conn = get_connection() + cursor = conn.cursor() + + since = datetime.now() - timedelta(hours=hours) + cursor.execute(""" + SELECT latency_ms, status, checked_at + FROM metrics + WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL + ORDER BY checked_at ASC + """, (service_name, since.isoformat())) + + rows = cursor.fetchall() + conn.close() + + return [ + { + "latency_ms": row["latency_ms"], + "status": row["status"], + "checked_at": row["checked_at"] + } + for row in rows + ] + + +def get_uptime_stats(service_name: str, hours: int = 24) -> dict: + """Calculate uptime statistics for a service.""" + conn = get_connection() + cursor = conn.cursor() + + since = datetime.now() - timedelta(hours=hours) + + cursor.execute(""" + SELECT COUNT(*) as total, + SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful + FROM metrics + WHERE service_name = ? AND checked_at > ? 
+ """, (service_name, since.isoformat())) + + row = cursor.fetchone() + conn.close() + + total = row["total"] or 0 + successful = row["successful"] or 0 + + return { + "total_checks": total, + "successful_checks": successful, + "uptime_percent": (successful / total * 100) if total > 0 else 100.0 + } + + +def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]: + """Get average latency for a service.""" + conn = get_connection() + cursor = conn.cursor() + + since = datetime.now() - timedelta(hours=hours) + cursor.execute(""" + SELECT AVG(latency_ms) as avg_latency + FROM metrics + WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL + """, (service_name, since.isoformat())) + + row = cursor.fetchone() + conn.close() + + return row["avg_latency"] + + +def create_incident(service_name: str, status: str, message: Optional[str]) -> int: + """Create a new incident.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute( + "INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)", + (service_name, status, message) + ) + incident_id = cursor.lastrowid + conn.commit() + conn.close() + return incident_id + + +def resolve_incident(service_name: str): + """Resolve open incidents for a service.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + UPDATE incidents + SET resolved_at = CURRENT_TIMESTAMP + WHERE service_name = ? AND resolved_at IS NULL + """, (service_name,)) + conn.commit() + conn.close() + + +def get_open_incident(service_name: str) -> Optional[dict]: + """Get open incident for a service.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT * FROM incidents + WHERE service_name = ? AND resolved_at IS NULL + ORDER BY started_at DESC LIMIT 1 + """, (service_name,)) + row = cursor.fetchone() + conn.close() + + if row: + return dict(row) + return None + + +def mark_incident_notified(incident_id: int): + """Mark incident as notified.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute("UPDATE incidents SET notified = TRUE WHERE id = ?", (incident_id,)) + conn.commit() + conn.close() + + +def get_recent_incidents(limit: int = 10) -> list[dict]: + """Get recent incidents.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT * FROM incidents + ORDER BY started_at DESC + LIMIT ? 
+ """, (limit,)) + rows = cursor.fetchall() + conn.close() + return [dict(row) for row in rows] + + +def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int): + """Save SSL certificate info.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + INSERT OR REPLACE INTO ssl_certificates + (domain, issuer, expires_at, days_until_expiry, checked_at) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP) + """, (domain, issuer, expires_at.isoformat(), days_until_expiry)) + conn.commit() + conn.close() + + +def get_ssl_info(domain: str) -> Optional[dict]: + """Get SSL certificate info.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute("SELECT * FROM ssl_certificates WHERE domain = ?", (domain,)) + row = cursor.fetchone() + conn.close() + + if row: + return dict(row) + return None + + +def cleanup_old_metrics(days: int = 7): + """Delete metrics older than specified days.""" + conn = get_connection() + cursor = conn.cursor() + cutoff = datetime.now() - timedelta(days=days) + cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),)) + deleted = cursor.rowcount + conn.commit() + conn.close() + return deleted diff --git a/status-service/main.py b/status-service/main.py index 4713cec..f06d6f4 100644 --- a/status-service/main.py +++ b/status-service/main.py @@ -1,6 +1,7 @@ +"""Status monitoring service with persistence and alerting.""" import os import asyncio -from datetime import datetime, timedelta +from datetime import datetime from typing import Optional from contextlib import asynccontextmanager @@ -8,13 +9,16 @@ from fastapi import FastAPI, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates -from monitors import ServiceMonitor, ServiceStatus +from monitors import ServiceMonitor +from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics # Configuration BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000") FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80") BOT_URL = os.getenv("BOT_URL", "http://bot:8080") +EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks +PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30")) # Initialize monitor @@ -22,38 +26,64 @@ monitor = ServiceMonitor() # Background task reference background_task: Optional[asyncio.Task] = None +cleanup_task: Optional[asyncio.Task] = None async def periodic_health_check(): - """Background task to check services periodically""" + """Background task to check services periodically.""" while True: - await monitor.check_all_services( - backend_url=BACKEND_URL, - frontend_url=FRONTEND_URL, - bot_url=BOT_URL - ) + try: + await monitor.check_all_services( + backend_url=BACKEND_URL, + frontend_url=FRONTEND_URL, + bot_url=BOT_URL, + external_url=EXTERNAL_URL, + public_url=PUBLIC_URL + ) + except Exception as e: + print(f"Health check error: {e}") await asyncio.sleep(CHECK_INTERVAL) +async def periodic_cleanup(): + """Background task to cleanup old metrics (daily).""" + while True: + await asyncio.sleep(86400) # 24 hours + try: + deleted = cleanup_old_metrics(days=7) + print(f"Cleaned up {deleted} old metrics") + except Exception as e: + print(f"Cleanup error: {e}") + + @asynccontextmanager async def lifespan(app: FastAPI): - """Startup and shutdown events""" - global background_task + """Startup and shutdown events.""" + global background_task, cleanup_task + + # 
Initialize database + init_db() + print("Database initialized") + # Start background health checks background_task = asyncio.create_task(periodic_health_check()) + cleanup_task = asyncio.create_task(periodic_cleanup()) + yield - # Cancel background task on shutdown - if background_task: - background_task.cancel() - try: - await background_task - except asyncio.CancelledError: - pass + + # Cancel background tasks on shutdown + for task in [background_task, cleanup_task]: + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass app = FastAPI( title="Status Monitor", - description="Service health monitoring", + description="Service health monitoring with persistence and alerting", lifespan=lifespan ) @@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates") @app.get("/", response_class=HTMLResponse) async def status_page(request: Request): - """Main status page""" + """Main status page.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() + ssl_status = monitor.get_ssl_status() + incidents = get_recent_incidents(limit=5) return templates.TemplateResponse( "index.html", @@ -72,6 +104,8 @@ async def status_page(request: Request): "request": request, "services": services, "overall_status": overall_status, + "ssl_status": ssl_status, + "incidents": incidents, "last_check": monitor.last_check, "check_interval": CHECK_INTERVAL } @@ -80,30 +114,52 @@ async def status_page(request: Request): @app.get("/api/status") async def api_status(): - """API endpoint for service statuses""" + """API endpoint for service statuses.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() + ssl_status = monitor.get_ssl_status() return { - "overall_status": overall_status, + "overall_status": overall_status.value, "services": {name: status.to_dict() for name, status in services.items()}, + "ssl": ssl_status, "last_check": monitor.last_check.isoformat() if monitor.last_check else None, "check_interval_seconds": CHECK_INTERVAL } +@app.get("/api/history/{service_name}") +async def api_history(service_name: str, hours: int = 24): + """API endpoint for service latency history.""" + history = get_latency_history(service_name, hours=hours) + return { + "service": service_name, + "hours": hours, + "data": history + } + + +@app.get("/api/incidents") +async def api_incidents(limit: int = 20): + """API endpoint for recent incidents.""" + incidents = get_recent_incidents(limit=limit) + return {"incidents": incidents} + + @app.get("/api/health") async def health(): - """Health check for this service""" + """Health check for this service.""" return {"status": "ok", "service": "status-monitor"} @app.post("/api/refresh") async def refresh_status(): - """Force refresh all service statuses""" + """Force refresh all service statuses.""" await monitor.check_all_services( backend_url=BACKEND_URL, frontend_url=FRONTEND_URL, - bot_url=BOT_URL + bot_url=BOT_URL, + external_url=EXTERNAL_URL, + public_url=PUBLIC_URL ) return {"status": "refreshed"} diff --git a/status-service/monitors.py b/status-service/monitors.py index 018cd01..9c7a730 100644 --- a/status-service/monitors.py +++ b/status-service/monitors.py @@ -1,11 +1,19 @@ +"""Service monitoring with persistence and alerting.""" import asyncio from datetime import datetime, timedelta -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Optional from enum import Enum import httpx +from database import ( + save_metric, get_latency_history, 
get_uptime_stats, get_avg_latency, + create_incident, resolve_incident, get_open_incident, mark_incident_notified +) +from alerts import alert_service_down, alert_service_recovered +from ssl_monitor import check_and_alert_ssl, SSLInfo + class Status(str, Enum): OPERATIONAL = "operational" @@ -25,11 +33,17 @@ class ServiceStatus: uptime_percent: float = 100.0 message: Optional[str] = None version: Optional[str] = None + avg_latency_24h: Optional[float] = None + latency_history: list = None - # For uptime calculation + # For uptime calculation (in-memory, backed by DB) total_checks: int = 0 successful_checks: int = 0 + def __post_init__(self): + if self.latency_history is None: + self.latency_history = [] + def to_dict(self) -> dict: return { "name": self.name, @@ -40,7 +54,8 @@ class ServiceStatus: "last_incident": self.last_incident.isoformat() if self.last_incident else None, "uptime_percent": round(self.uptime_percent, 2), "message": self.message, - "version": self.version + "version": self.version, + "avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None, } def update_uptime(self, is_success: bool): @@ -69,12 +84,17 @@ class ServiceMonitor: "bot": ServiceStatus( name="bot", display_name="Telegram Bot" - ) + ), + "external": ServiceStatus( + name="external", + display_name="External Access" + ), } self.last_check: Optional[datetime] = None + self.ssl_info: Optional[SSLInfo] = None async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]: - """Check backend API health""" + """Check backend API health.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -92,9 +112,7 @@ class ServiceMonitor: return Status.DOWN, None, str(e)[:100], None async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]: - """Check database through backend""" - # We check database indirectly - if backend is up, DB is likely up - # Could add a specific /health/db endpoint to backend later + """Check database through backend.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -109,7 +127,7 @@ class ServiceMonitor: return Status.DOWN, None, "Cannot reach backend" async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: - """Check frontend availability""" + """Check frontend availability.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -126,7 +144,7 @@ class ServiceMonitor: return Status.DOWN, None, str(e)[:100] async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: - """Check Telegram bot health""" + """Check Telegram bot health.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -142,8 +160,93 @@ class ServiceMonitor: except Exception as e: return Status.DOWN, None, str(e)[:100] - async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str): - """Check all services concurrently""" + async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: + """Check external (public) URL availability.""" + if not url: + return Status.UNKNOWN, None, "Not configured" + + try: + async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: + start = datetime.now() + response = await client.get(url) + latency = (datetime.now() - start).total_seconds() * 1000 + + if response.status_code == 200: + return Status.OPERATIONAL, latency, None + else: 
+ return Status.DEGRADED, latency, f"HTTP {response.status_code}" + except httpx.TimeoutException: + return Status.DOWN, None, "Timeout" + except Exception as e: + return Status.DOWN, None, str(e)[:100] + + async def _process_check_result( + self, + service_name: str, + result: tuple, + now: datetime + ): + """Process check result with DB persistence and alerting.""" + if isinstance(result, Exception): + return + + if len(result) == 4: + status, latency, message, version = result + else: + status, latency, message = result + version = None + + svc = self.services[service_name] + was_down = svc.status in (Status.DOWN, Status.DEGRADED) + is_down = status in (Status.DOWN, Status.DEGRADED) + + # Update service status + svc.status = status + svc.latency_ms = latency + svc.message = message + if version: + svc.version = version + svc.last_check = now + svc.update_uptime(status == Status.OPERATIONAL) + + # Save metric to database + save_metric(service_name, status.value, latency, message) + + # Load historical data + svc.latency_history = get_latency_history(service_name, hours=24) + svc.avg_latency_24h = get_avg_latency(service_name, hours=24) + + # Update uptime from DB + stats = get_uptime_stats(service_name, hours=24) + if stats["total_checks"] > 0: + svc.uptime_percent = stats["uptime_percent"] + + # Handle incident tracking and alerting + if is_down and not was_down: + # Service just went down + svc.last_incident = now + incident_id = create_incident(service_name, status.value, message) + await alert_service_down(service_name, svc.display_name, message) + mark_incident_notified(incident_id) + + elif not is_down and was_down: + # Service recovered + open_incident = get_open_incident(service_name) + if open_incident: + started_at = datetime.fromisoformat(open_incident["started_at"]) + downtime_minutes = int((now - started_at).total_seconds() / 60) + resolve_incident(service_name) + await alert_service_recovered(service_name, svc.display_name, downtime_minutes) + + async def check_all_services( + self, + backend_url: str, + frontend_url: str, + bot_url: str, + external_url: str = "", + public_url: str = "" + ): + """Check all services concurrently.""" now = datetime.now() # Run all checks concurrently @@ -152,61 +255,18 @@ class ServiceMonitor: self.check_database(backend_url), self.check_frontend(frontend_url), self.check_bot(bot_url), + self.check_external(external_url), return_exceptions=True ) - # Process backend result - if not isinstance(results[0], Exception): - status, latency, message, version = results[0] - svc = self.services["backend"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.version = version - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now + # Process results + service_names = ["backend", "database", "frontend", "bot", "external"] + for i, service_name in enumerate(service_names): + await self._process_check_result(service_name, results[i], now) - # Process database result - if not isinstance(results[1], Exception): - status, latency, message = results[1] - svc = self.services["database"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now - - # Process frontend result - if not 
isinstance(results[2], Exception): - status, latency, message = results[2] - svc = self.services["frontend"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now - - # Process bot result - if not isinstance(results[3], Exception): - status, latency, message = results[3] - svc = self.services["bot"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now + # Check SSL certificate (if public URL is HTTPS) + if public_url and public_url.startswith("https://"): + self.ssl_info = await check_and_alert_ssl(public_url) self.last_check = now @@ -214,8 +274,12 @@ class ServiceMonitor: return self.services def get_overall_status(self) -> Status: - """Get overall system status based on all services""" - statuses = [svc.status for svc in self.services.values()] + """Get overall system status based on all services.""" + # Exclude external from overall status if not configured + statuses = [ + svc.status for name, svc in self.services.items() + if name != "external" or svc.status != Status.UNKNOWN + ] if all(s == Status.OPERATIONAL for s in statuses): return Status.OPERATIONAL @@ -225,3 +289,17 @@ class ServiceMonitor: return Status.DEGRADED else: return Status.UNKNOWN + + def get_ssl_status(self) -> Optional[dict]: + """Get SSL certificate status.""" + if not self.ssl_info: + return None + + return { + "domain": self.ssl_info.domain, + "issuer": self.ssl_info.issuer, + "expires_at": self.ssl_info.expires_at.isoformat(), + "days_until_expiry": self.ssl_info.days_until_expiry, + "is_valid": self.ssl_info.is_valid, + "error": self.ssl_info.error + } diff --git a/status-service/ssl_monitor.py b/status-service/ssl_monitor.py new file mode 100644 index 0000000..9837eed --- /dev/null +++ b/status-service/ssl_monitor.py @@ -0,0 +1,140 @@ +"""SSL certificate monitoring.""" +import ssl +import socket +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Optional +from urllib.parse import urlparse + +from database import save_ssl_info, get_ssl_info +from alerts import alert_ssl_expiring, alert_ssl_expired + + +@dataclass +class SSLInfo: + domain: str + issuer: str + expires_at: datetime + days_until_expiry: int + is_valid: bool + error: Optional[str] = None + + +def check_ssl_certificate(url: str) -> Optional[SSLInfo]: + """Check SSL certificate for a URL.""" + try: + parsed = urlparse(url) + hostname = parsed.hostname + + if not hostname: + return None + + # Skip non-HTTPS or localhost + if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"): + return None + + context = ssl.create_default_context() + conn = context.wrap_socket( + socket.socket(socket.AF_INET), + server_hostname=hostname + ) + conn.settimeout(10.0) + + try: + conn.connect((hostname, parsed.port or 443)) + cert = conn.getpeercert() + finally: + conn.close() + + if not cert: + return SSLInfo( + domain=hostname, + issuer="Unknown", + expires_at=datetime.now(timezone.utc), + days_until_expiry=0, + is_valid=False, + error="No certificate found" + ) + + # Parse expiry date + not_after = cert.get("notAfter", "") + expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z") + 
expires_at = expires_at.replace(tzinfo=timezone.utc) + + # Calculate days until expiry + now = datetime.now(timezone.utc) + days_until_expiry = (expires_at - now).days + + # Get issuer + issuer_parts = cert.get("issuer", ()) + issuer = "Unknown" + for part in issuer_parts: + for key, value in part: + if key == "organizationName": + issuer = value + break + + return SSLInfo( + domain=hostname, + issuer=issuer, + expires_at=expires_at, + days_until_expiry=days_until_expiry, + is_valid=days_until_expiry > 0 + ) + + except ssl.SSLCertVerificationError as e: + hostname = urlparse(url).hostname or url + return SSLInfo( + domain=hostname, + issuer="Invalid", + expires_at=datetime.now(timezone.utc), + days_until_expiry=0, + is_valid=False, + error=f"SSL verification failed: {str(e)[:100]}" + ) + except Exception as e: + hostname = urlparse(url).hostname or url + return SSLInfo( + domain=hostname, + issuer="Unknown", + expires_at=datetime.now(timezone.utc), + days_until_expiry=0, + is_valid=False, + error=str(e)[:100] + ) + + +async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]: + """Check SSL and send alerts if needed.""" + ssl_info = check_ssl_certificate(url) + + if not ssl_info: + return None + + # Save to database + save_ssl_info( + domain=ssl_info.domain, + issuer=ssl_info.issuer, + expires_at=ssl_info.expires_at, + days_until_expiry=ssl_info.days_until_expiry + ) + + # Check if we need to alert + prev_info = get_ssl_info(ssl_info.domain) + + if ssl_info.days_until_expiry <= 0: + # Certificate expired + await alert_ssl_expired(ssl_info.domain) + elif ssl_info.days_until_expiry <= warn_days: + # Certificate expiring soon - alert once per day + should_alert = True + if prev_info and prev_info.get("checked_at"): + # Check if we already alerted today + last_check = datetime.fromisoformat(prev_info["checked_at"]) + if (datetime.now() - last_check).days < 1: + should_alert = False + + if should_alert: + await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry) + + return ssl_info diff --git a/status-service/templates/index.html b/status-service/templates/index.html index ab50cbf..cfadd2d 100644 --- a/status-service/templates/index.html +++ b/status-service/templates/index.html @@ -4,6 +4,7 @@ System Status +
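The new JSON endpoints can also be consumed programmatically. A quick httpx-based sketch, assuming the service is reachable on localhost:8001 as mapped in docker-compose.yml:

import httpx

BASE = "http://localhost:8001"  # host/port assumed from the compose port mapping

status = httpx.get(f"{BASE}/api/status", timeout=10).json()
print(status["overall_status"], status["last_check"])
for name, svc in status["services"].items():
    print(f"  {name}: {svc['status']} ({svc['latency_ms']} ms, {svc['uptime_percent']}% uptime)")

history = httpx.get(f"{BASE}/api/history/backend", params={"hours": 24}, timeout=10).json()
print(f"{len(history['data'])} latency samples for backend over the last 24h")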