From 57bad3b4a8ae68d99dd5139c8218875451d7a901 Mon Sep 17 00:00:00 2001 From: Oronemu Date: Thu, 18 Dec 2025 03:35:13 +0700 Subject: [PATCH] Redesign health service + create backup service --- .env.example | 9 + Makefile | 23 +++ backup-service/Dockerfile | 30 +++ backup-service/backup.py | 217 ++++++++++++++++++++++ backup-service/config.py | 33 ++++ backup-service/crontab | 4 + backup-service/requirements.txt | 2 + backup-service/restore.py | 158 ++++++++++++++++ docker-compose.yml | 32 ++++ status-service/Dockerfile | 3 + status-service/alerts.py | 85 +++++++++ status-service/database.py | 261 ++++++++++++++++++++++++++ status-service/main.py | 104 ++++++++--- status-service/monitors.py | 208 ++++++++++++++------- status-service/ssl_monitor.py | 140 ++++++++++++++ status-service/templates/index.html | 275 +++++++++++++++++++++++++++- 16 files changed, 1486 insertions(+), 98 deletions(-) create mode 100644 backup-service/Dockerfile create mode 100644 backup-service/backup.py create mode 100644 backup-service/config.py create mode 100644 backup-service/crontab create mode 100644 backup-service/requirements.txt create mode 100644 backup-service/restore.py create mode 100644 status-service/alerts.py create mode 100644 status-service/database.py create mode 100644 status-service/ssl_monitor.py diff --git a/.env.example b/.env.example index 2371084..0d9095b 100644 --- a/.env.example +++ b/.env.example @@ -20,5 +20,14 @@ S3_SECRET_ACCESS_KEY=your-secret-access-key S3_ENDPOINT_URL=https://s3.firstvds.ru S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru +# Backup Service +TELEGRAM_ADMIN_ID=947392854 +S3_BACKUP_PREFIX=backups/ +BACKUP_RETENTION_DAYS=14 + +# Status Service (optional - for external monitoring) +EXTERNAL_URL=https://your-domain.com +PUBLIC_URL=https://your-domain.com + # Frontend (for build) VITE_API_URL=/api/v1 diff --git a/Makefile b/Makefile index 12ef5e7..8cc670d 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,12 @@ help: @echo " make shell - Open backend shell" @echo " make frontend-sh - Open frontend shell" @echo "" + @echo " Backup:" + @echo " make backup-now - Run backup immediately" + @echo " make backup-list - List available backups in S3" + @echo " make backup-restore - Restore from backup (interactive)" + @echo " make backup-logs - Show backup service logs" + @echo "" @echo " Cleanup:" @echo " make clean - Stop and remove containers, volumes" @echo " make prune - Remove unused Docker resources" @@ -137,3 +143,20 @@ test-backend: # Production prod: $(DC) -f docker-compose.yml up -d --build + +# Backup +backup-now: + $(DC) exec backup python /app/backup.py + +backup-list: + $(DC) exec backup python /app/restore.py + +backup-restore: + @read -p "Backup filename: " file; \ + $(DC) exec -it backup python /app/restore.py "$$file" + +backup-logs: + $(DC) logs -f backup + +backup-shell: + $(DC) exec backup bash diff --git a/backup-service/Dockerfile b/backup-service/Dockerfile new file mode 100644 index 0000000..09ab093 --- /dev/null +++ b/backup-service/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install PostgreSQL client (for pg_dump and psql) and cron +RUN apt-get update && apt-get install -y \ + postgresql-client \ + cron \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
+ +# Make scripts executable +RUN chmod +x backup.py restore.py + +# Setup cron +COPY crontab /etc/cron.d/backup-cron +RUN chmod 0644 /etc/cron.d/backup-cron +RUN crontab /etc/cron.d/backup-cron + +# Create log file +RUN touch /var/log/cron.log + +# Start cron in foreground and tail logs +CMD ["sh", "-c", "printenv > /etc/environment && cron && tail -f /var/log/cron.log"] diff --git a/backup-service/backup.py b/backup-service/backup.py new file mode 100644 index 0000000..075a4ca --- /dev/null +++ b/backup-service/backup.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 +""" +PostgreSQL Backup Service for WebApp. + +- Creates pg_dump backup +- Compresses with gzip +- Uploads to S3 FirstVDS +- Rotates old backups (configurable retention) +- Sends Telegram notifications +""" +import gzip +import os +import subprocess +import sys +from datetime import datetime, timedelta, timezone + +import boto3 +import httpx +from botocore.config import Config as BotoConfig +from botocore.exceptions import ClientError + +from config import config + + +def create_s3_client(): + """Initialize S3 client (same pattern as backend storage.py).""" + return boto3.client( + "s3", + endpoint_url=config.S3_ENDPOINT_URL, + aws_access_key_id=config.S3_ACCESS_KEY_ID, + aws_secret_access_key=config.S3_SECRET_ACCESS_KEY, + region_name=config.S3_REGION or "us-east-1", + config=BotoConfig(signature_version="s3v4"), + ) + + +def send_telegram_notification(message: str, is_error: bool = False) -> None: + """Send notification to Telegram admin.""" + if not config.TELEGRAM_BOT_TOKEN or not config.TELEGRAM_ADMIN_ID: + print("Telegram not configured, skipping notification") + return + + emoji = "\u274c" if is_error else "\u2705" + text = f"{emoji} *Database Backup*\n\n{message}" + + url = f"https://api.telegram.org/bot{config.TELEGRAM_BOT_TOKEN}/sendMessage" + data = { + "chat_id": config.TELEGRAM_ADMIN_ID, + "text": text, + "parse_mode": "Markdown", + } + + try: + response = httpx.post(url, json=data, timeout=30) + response.raise_for_status() + print("Telegram notification sent") + except Exception as e: + print(f"Failed to send Telegram notification: {e}") + + +def create_backup() -> tuple[str, bytes]: + """Create pg_dump backup and compress it.""" + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + filename = f"marathon_backup_{timestamp}.sql.gz" + + # Build pg_dump command + env = os.environ.copy() + env["PGPASSWORD"] = config.DB_PASSWORD + + cmd = [ + "pg_dump", + "-h", + config.DB_HOST, + "-p", + config.DB_PORT, + "-U", + config.DB_USER, + "-d", + config.DB_NAME, + "--no-owner", + "--no-acl", + "-F", + "p", # plain SQL format + ] + + print(f"Running pg_dump for database {config.DB_NAME}...") + result = subprocess.run( + cmd, + env=env, + capture_output=True, + ) + + if result.returncode != 0: + raise Exception(f"pg_dump failed: {result.stderr.decode()}") + + # Compress the output + print("Compressing backup...") + compressed = gzip.compress(result.stdout, compresslevel=9) + + return filename, compressed + + +def upload_to_s3(s3_client, filename: str, data: bytes) -> str: + """Upload backup to S3.""" + key = f"{config.S3_BACKUP_PREFIX}{filename}" + + print(f"Uploading to S3: {key}...") + s3_client.put_object( + Bucket=config.S3_BUCKET_NAME, + Key=key, + Body=data, + ContentType="application/gzip", + ) + + return key + + +def rotate_old_backups(s3_client) -> int: + """Delete backups older than BACKUP_RETENTION_DAYS.""" + cutoff_date = datetime.now(timezone.utc) - timedelta( + days=config.BACKUP_RETENTION_DAYS + ) + 
deleted_count = 0 + + print(f"Rotating backups older than {config.BACKUP_RETENTION_DAYS} days...") + + # List all objects with backup prefix + try: + paginator = s3_client.get_paginator("list_objects_v2") + pages = paginator.paginate( + Bucket=config.S3_BUCKET_NAME, + Prefix=config.S3_BACKUP_PREFIX, + ) + + for page in pages: + for obj in page.get("Contents", []): + last_modified = obj["LastModified"] + if last_modified.tzinfo is None: + last_modified = last_modified.replace(tzinfo=timezone.utc) + + if last_modified < cutoff_date: + s3_client.delete_object( + Bucket=config.S3_BUCKET_NAME, + Key=obj["Key"], + ) + deleted_count += 1 + print(f"Deleted old backup: {obj['Key']}") + except ClientError as e: + print(f"Error during rotation: {e}") + + return deleted_count + + +def main() -> int: + """Main backup routine.""" + start_time = datetime.now() + + print(f"{'=' * 50}") + print(f"Backup started at {start_time}") + print(f"{'=' * 50}") + + try: + # Validate configuration + if not config.S3_BUCKET_NAME: + raise Exception("S3_BUCKET_NAME is not configured") + if not config.S3_ACCESS_KEY_ID: + raise Exception("S3_ACCESS_KEY_ID is not configured") + if not config.S3_SECRET_ACCESS_KEY: + raise Exception("S3_SECRET_ACCESS_KEY is not configured") + if not config.S3_ENDPOINT_URL: + raise Exception("S3_ENDPOINT_URL is not configured") + + # Create S3 client + s3_client = create_s3_client() + + # Create backup + filename, data = create_backup() + size_mb = len(data) / (1024 * 1024) + print(f"Backup created: {filename} ({size_mb:.2f} MB)") + + # Upload to S3 + s3_key = upload_to_s3(s3_client, filename, data) + print(f"Uploaded to S3: {s3_key}") + + # Rotate old backups + deleted_count = rotate_old_backups(s3_client) + print(f"Deleted {deleted_count} old backups") + + # Calculate duration + duration = datetime.now() - start_time + + # Send success notification + message = ( + f"Backup completed successfully!\n\n" + f"*File:* `{filename}`\n" + f"*Size:* {size_mb:.2f} MB\n" + f"*Duration:* {duration.seconds}s\n" + f"*Deleted old:* {deleted_count} files" + ) + send_telegram_notification(message, is_error=False) + + print(f"{'=' * 50}") + print("Backup completed successfully!") + print(f"{'=' * 50}") + return 0 + + except Exception as e: + error_msg = f"Backup failed!\n\n*Error:* `{str(e)}`" + send_telegram_notification(error_msg, is_error=True) + print(f"{'=' * 50}") + print(f"Backup failed: {e}") + print(f"{'=' * 50}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/backup-service/config.py b/backup-service/config.py new file mode 100644 index 0000000..594e465 --- /dev/null +++ b/backup-service/config.py @@ -0,0 +1,33 @@ +"""Configuration for backup service.""" +import os +from dataclasses import dataclass + + +@dataclass +class Config: + """Backup service configuration from environment variables.""" + + # Database + DB_HOST: str = os.getenv("DB_HOST", "db") + DB_PORT: str = os.getenv("DB_PORT", "5432") + DB_NAME: str = os.getenv("DB_NAME", "marathon") + DB_USER: str = os.getenv("DB_USER", "marathon") + DB_PASSWORD: str = os.getenv("DB_PASSWORD", "123") + + # S3 + S3_BUCKET_NAME: str = os.getenv("S3_BUCKET_NAME", "") + S3_REGION: str = os.getenv("S3_REGION", "ru-1") + S3_ACCESS_KEY_ID: str = os.getenv("S3_ACCESS_KEY_ID", "") + S3_SECRET_ACCESS_KEY: str = os.getenv("S3_SECRET_ACCESS_KEY", "") + S3_ENDPOINT_URL: str = os.getenv("S3_ENDPOINT_URL", "") + S3_BACKUP_PREFIX: str = os.getenv("S3_BACKUP_PREFIX", "backups/") + + # Telegram + TELEGRAM_BOT_TOKEN: str = 
os.getenv("TELEGRAM_BOT_TOKEN", "") + TELEGRAM_ADMIN_ID: str = os.getenv("TELEGRAM_ADMIN_ID", "947392854") + + # Backup settings + BACKUP_RETENTION_DAYS: int = int(os.getenv("BACKUP_RETENTION_DAYS", "14")) + + +config = Config() diff --git a/backup-service/crontab b/backup-service/crontab new file mode 100644 index 0000000..de00c64 --- /dev/null +++ b/backup-service/crontab @@ -0,0 +1,4 @@ +# Backup cron job +# Run backup daily at 3:00 AM UTC +0 3 * * * /usr/local/bin/python /app/backup.py >> /var/log/cron.log 2>&1 +# Empty line required at end of crontab diff --git a/backup-service/requirements.txt b/backup-service/requirements.txt new file mode 100644 index 0000000..644e4ae --- /dev/null +++ b/backup-service/requirements.txt @@ -0,0 +1,2 @@ +boto3==1.34.0 +httpx==0.26.0 diff --git a/backup-service/restore.py b/backup-service/restore.py new file mode 100644 index 0000000..23f48fb --- /dev/null +++ b/backup-service/restore.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +Restore PostgreSQL database from S3 backup. + +Usage: + python restore.py - List available backups + python restore.py - Restore from specific backup +""" +import gzip +import os +import subprocess +import sys + +import boto3 +from botocore.config import Config as BotoConfig +from botocore.exceptions import ClientError + +from config import config + + +def create_s3_client(): + """Initialize S3 client.""" + return boto3.client( + "s3", + endpoint_url=config.S3_ENDPOINT_URL, + aws_access_key_id=config.S3_ACCESS_KEY_ID, + aws_secret_access_key=config.S3_SECRET_ACCESS_KEY, + region_name=config.S3_REGION or "us-east-1", + config=BotoConfig(signature_version="s3v4"), + ) + + +def list_backups(s3_client) -> list[tuple[str, float, str]]: + """List all available backups.""" + print("Available backups:\n") + + try: + paginator = s3_client.get_paginator("list_objects_v2") + pages = paginator.paginate( + Bucket=config.S3_BUCKET_NAME, + Prefix=config.S3_BACKUP_PREFIX, + ) + + backups = [] + for page in pages: + for obj in page.get("Contents", []): + filename = obj["Key"].replace(config.S3_BACKUP_PREFIX, "") + size_mb = obj["Size"] / (1024 * 1024) + modified = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S") + backups.append((filename, size_mb, modified)) + + # Sort by date descending (newest first) + backups.sort(key=lambda x: x[2], reverse=True) + + for filename, size_mb, modified in backups: + print(f" {filename} ({size_mb:.2f} MB) - {modified}") + + return backups + + except ClientError as e: + print(f"Error listing backups: {e}") + return [] + + +def restore_backup(s3_client, filename: str) -> None: + """Download and restore backup.""" + key = f"{config.S3_BACKUP_PREFIX}{filename}" + + print(f"Downloading {filename} from S3...") + try: + response = s3_client.get_object( + Bucket=config.S3_BUCKET_NAME, + Key=key, + ) + compressed_data = response["Body"].read() + except ClientError as e: + raise Exception(f"Failed to download backup: {e}") + + print("Decompressing...") + sql_data = gzip.decompress(compressed_data) + + print(f"Restoring to database {config.DB_NAME}...") + + # Build psql command + env = os.environ.copy() + env["PGPASSWORD"] = config.DB_PASSWORD + + cmd = [ + "psql", + "-h", + config.DB_HOST, + "-p", + config.DB_PORT, + "-U", + config.DB_USER, + "-d", + config.DB_NAME, + ] + + result = subprocess.run( + cmd, + env=env, + input=sql_data, + capture_output=True, + ) + + if result.returncode != 0: + stderr = result.stderr.decode() + # psql may return warnings that aren't fatal errors + if "ERROR" in stderr: + raise 
Exception(f"psql restore failed: {stderr}") + else: + print(f"Warnings: {stderr}") + + print("Restore completed successfully!") + + +def main() -> int: + """Main restore routine.""" + # Validate configuration + if not config.S3_BUCKET_NAME: + print("Error: S3_BUCKET_NAME is not configured") + return 1 + + s3_client = create_s3_client() + + if len(sys.argv) < 2: + # List available backups + backups = list_backups(s3_client) + if backups: + print(f"\nTo restore, run: python restore.py ") + else: + print("No backups found.") + return 0 + + filename = sys.argv[1] + + # Confirm restore + print(f"WARNING: This will restore database from {filename}") + print("This may overwrite existing data!") + print() + + confirm = input("Type 'yes' to continue: ") + + if confirm.lower() != "yes": + print("Restore cancelled.") + return 0 + + try: + restore_backup(s3_client, filename) + return 0 + except Exception as e: + print(f"Restore failed: {e}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docker-compose.yml b/docker-compose.yml index c733bd4..e5d98f7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -94,7 +94,13 @@ services: BACKEND_URL: http://backend:8000 FRONTEND_URL: http://frontend:80 BOT_URL: http://bot:8080 + EXTERNAL_URL: ${EXTERNAL_URL:-} + PUBLIC_URL: ${PUBLIC_URL:-} CHECK_INTERVAL: "30" + TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN} + TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854} + volumes: + - status_data:/app/data ports: - "8001:8001" depends_on: @@ -103,5 +109,31 @@ services: - bot restart: unless-stopped + backup: + build: + context: ./backup-service + dockerfile: Dockerfile + container_name: marathon-backup + environment: + DB_HOST: db + DB_PORT: "5432" + DB_NAME: marathon + DB_USER: marathon + DB_PASSWORD: ${DB_PASSWORD:-marathon} + S3_BUCKET_NAME: ${S3_BUCKET_NAME:-} + S3_REGION: ${S3_REGION:-ru-1} + S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID:-} + S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY:-} + S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-} + S3_BACKUP_PREFIX: ${S3_BACKUP_PREFIX:-backups/} + TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN} + TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854} + BACKUP_RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14} + depends_on: + db: + condition: service_healthy + restart: unless-stopped + volumes: postgres_data: + status_data: diff --git a/status-service/Dockerfile b/status-service/Dockerfile index 943ecff..8b78fb0 100644 --- a/status-service/Dockerfile +++ b/status-service/Dockerfile @@ -6,6 +6,9 @@ WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt +# Create data directory for SQLite +RUN mkdir -p /app/data + # Copy application COPY . . 
diff --git a/status-service/alerts.py b/status-service/alerts.py new file mode 100644 index 0000000..a795276 --- /dev/null +++ b/status-service/alerts.py @@ -0,0 +1,85 @@ +"""Telegram alerting for status changes.""" +import os +from datetime import datetime +from typing import Optional + +import httpx + + +TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "") +TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "") + + +async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool: + """Send alert to Telegram.""" + if not TELEGRAM_BOT_TOKEN or not TELEGRAM_ADMIN_ID: + print("Telegram alerting not configured") + return False + + emoji = "\u2705" if is_recovery else "\u26a0\ufe0f" + text = f"{emoji} *Status Alert*\n\n{message}" + + url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage" + data = { + "chat_id": TELEGRAM_ADMIN_ID, + "text": text, + "parse_mode": "Markdown", + } + + try: + async with httpx.AsyncClient(timeout=10.0) as client: + response = await client.post(url, json=data) + response.raise_for_status() + print(f"Telegram alert sent: {message[:50]}...") + return True + except Exception as e: + print(f"Failed to send Telegram alert: {e}") + return False + + +async def alert_service_down(service_name: str, display_name: str, message: Optional[str]): + """Alert when service goes down.""" + now = datetime.now().strftime("%d.%m.%Y %H:%M:%S") + text = ( + f"*{display_name}* is DOWN\n\n" + f"Time: `{now}`\n" + ) + if message: + text += f"Error: `{message}`" + + await send_telegram_alert(text, is_recovery=False) + + +async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int): + """Alert when service recovers.""" + now = datetime.now().strftime("%d.%m.%Y %H:%M:%S") + text = ( + f"*{display_name}* is back ONLINE\n\n" + f"Time: `{now}`\n" + f"Downtime: `{downtime_minutes} min`" + ) + + await send_telegram_alert(text, is_recovery=True) + + +async def alert_ssl_expiring(domain: str, days_left: int): + """Alert when SSL certificate is expiring soon.""" + text = ( + f"*SSL Certificate Expiring*\n\n" + f"Domain: `{domain}`\n" + f"Days left: `{days_left}`\n\n" + f"Please renew the certificate!" + ) + + await send_telegram_alert(text, is_recovery=False) + + +async def alert_ssl_expired(domain: str): + """Alert when SSL certificate has expired.""" + text = ( + f"*SSL Certificate EXPIRED*\n\n" + f"Domain: `{domain}`\n\n" + f"Certificate has expired! Site may show security warnings." 
+ ) + + await send_telegram_alert(text, is_recovery=False) diff --git a/status-service/database.py b/status-service/database.py new file mode 100644 index 0000000..ed1013c --- /dev/null +++ b/status-service/database.py @@ -0,0 +1,261 @@ +"""SQLite database for storing metrics history.""" +import sqlite3 +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional +import json + + +DB_PATH = Path("/app/data/metrics.db") + + +def get_connection() -> sqlite3.Connection: + """Get database connection.""" + DB_PATH.parent.mkdir(parents=True, exist_ok=True) + conn = sqlite3.connect(str(DB_PATH)) + conn.row_factory = sqlite3.Row + return conn + + +def init_db(): + """Initialize database tables.""" + conn = get_connection() + cursor = conn.cursor() + + # Metrics history table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS metrics ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_name TEXT NOT NULL, + status TEXT NOT NULL, + latency_ms REAL, + message TEXT, + checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Incidents table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS incidents ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + service_name TEXT NOT NULL, + status TEXT NOT NULL, + message TEXT, + started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + resolved_at TIMESTAMP, + notified BOOLEAN DEFAULT FALSE + ) + """) + + # SSL certificates table + cursor.execute(""" + CREATE TABLE IF NOT EXISTS ssl_certificates ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + domain TEXT NOT NULL UNIQUE, + issuer TEXT, + expires_at TIMESTAMP, + days_until_expiry INTEGER, + checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) + """) + + # Create indexes + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_metrics_service_time + ON metrics(service_name, checked_at DESC) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_incidents_service + ON incidents(service_name, started_at DESC) + """) + + conn.commit() + conn.close() + + +def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]): + """Save a metric record.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute( + "INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)", + (service_name, status, latency_ms, message) + ) + conn.commit() + conn.close() + + +def get_latency_history(service_name: str, hours: int = 24) -> list[dict]: + """Get latency history for a service.""" + conn = get_connection() + cursor = conn.cursor() + + since = datetime.now() - timedelta(hours=hours) + cursor.execute(""" + SELECT latency_ms, status, checked_at + FROM metrics + WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL + ORDER BY checked_at ASC + """, (service_name, since.isoformat())) + + rows = cursor.fetchall() + conn.close() + + return [ + { + "latency_ms": row["latency_ms"], + "status": row["status"], + "checked_at": row["checked_at"] + } + for row in rows + ] + + +def get_uptime_stats(service_name: str, hours: int = 24) -> dict: + """Calculate uptime statistics for a service.""" + conn = get_connection() + cursor = conn.cursor() + + since = datetime.now() - timedelta(hours=hours) + + cursor.execute(""" + SELECT COUNT(*) as total, + SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful + FROM metrics + WHERE service_name = ? AND checked_at > ? 
+ """, (service_name, since.isoformat())) + + row = cursor.fetchone() + conn.close() + + total = row["total"] or 0 + successful = row["successful"] or 0 + + return { + "total_checks": total, + "successful_checks": successful, + "uptime_percent": (successful / total * 100) if total > 0 else 100.0 + } + + +def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]: + """Get average latency for a service.""" + conn = get_connection() + cursor = conn.cursor() + + since = datetime.now() - timedelta(hours=hours) + cursor.execute(""" + SELECT AVG(latency_ms) as avg_latency + FROM metrics + WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL + """, (service_name, since.isoformat())) + + row = cursor.fetchone() + conn.close() + + return row["avg_latency"] + + +def create_incident(service_name: str, status: str, message: Optional[str]) -> int: + """Create a new incident.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute( + "INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)", + (service_name, status, message) + ) + incident_id = cursor.lastrowid + conn.commit() + conn.close() + return incident_id + + +def resolve_incident(service_name: str): + """Resolve open incidents for a service.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + UPDATE incidents + SET resolved_at = CURRENT_TIMESTAMP + WHERE service_name = ? AND resolved_at IS NULL + """, (service_name,)) + conn.commit() + conn.close() + + +def get_open_incident(service_name: str) -> Optional[dict]: + """Get open incident for a service.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT * FROM incidents + WHERE service_name = ? AND resolved_at IS NULL + ORDER BY started_at DESC LIMIT 1 + """, (service_name,)) + row = cursor.fetchone() + conn.close() + + if row: + return dict(row) + return None + + +def mark_incident_notified(incident_id: int): + """Mark incident as notified.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute("UPDATE incidents SET notified = TRUE WHERE id = ?", (incident_id,)) + conn.commit() + conn.close() + + +def get_recent_incidents(limit: int = 10) -> list[dict]: + """Get recent incidents.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + SELECT * FROM incidents + ORDER BY started_at DESC + LIMIT ? 
+ """, (limit,)) + rows = cursor.fetchall() + conn.close() + return [dict(row) for row in rows] + + +def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int): + """Save SSL certificate info.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute(""" + INSERT OR REPLACE INTO ssl_certificates + (domain, issuer, expires_at, days_until_expiry, checked_at) + VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP) + """, (domain, issuer, expires_at.isoformat(), days_until_expiry)) + conn.commit() + conn.close() + + +def get_ssl_info(domain: str) -> Optional[dict]: + """Get SSL certificate info.""" + conn = get_connection() + cursor = conn.cursor() + cursor.execute("SELECT * FROM ssl_certificates WHERE domain = ?", (domain,)) + row = cursor.fetchone() + conn.close() + + if row: + return dict(row) + return None + + +def cleanup_old_metrics(days: int = 7): + """Delete metrics older than specified days.""" + conn = get_connection() + cursor = conn.cursor() + cutoff = datetime.now() - timedelta(days=days) + cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),)) + deleted = cursor.rowcount + conn.commit() + conn.close() + return deleted diff --git a/status-service/main.py b/status-service/main.py index 4713cec..f06d6f4 100644 --- a/status-service/main.py +++ b/status-service/main.py @@ -1,6 +1,7 @@ +"""Status monitoring service with persistence and alerting.""" import os import asyncio -from datetime import datetime, timedelta +from datetime import datetime from typing import Optional from contextlib import asynccontextmanager @@ -8,13 +9,16 @@ from fastapi import FastAPI, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates -from monitors import ServiceMonitor, ServiceStatus +from monitors import ServiceMonitor +from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics # Configuration BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000") FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80") BOT_URL = os.getenv("BOT_URL", "http://bot:8080") +EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks +PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30")) # Initialize monitor @@ -22,38 +26,64 @@ monitor = ServiceMonitor() # Background task reference background_task: Optional[asyncio.Task] = None +cleanup_task: Optional[asyncio.Task] = None async def periodic_health_check(): - """Background task to check services periodically""" + """Background task to check services periodically.""" while True: - await monitor.check_all_services( - backend_url=BACKEND_URL, - frontend_url=FRONTEND_URL, - bot_url=BOT_URL - ) + try: + await monitor.check_all_services( + backend_url=BACKEND_URL, + frontend_url=FRONTEND_URL, + bot_url=BOT_URL, + external_url=EXTERNAL_URL, + public_url=PUBLIC_URL + ) + except Exception as e: + print(f"Health check error: {e}") await asyncio.sleep(CHECK_INTERVAL) +async def periodic_cleanup(): + """Background task to cleanup old metrics (daily).""" + while True: + await asyncio.sleep(86400) # 24 hours + try: + deleted = cleanup_old_metrics(days=7) + print(f"Cleaned up {deleted} old metrics") + except Exception as e: + print(f"Cleanup error: {e}") + + @asynccontextmanager async def lifespan(app: FastAPI): - """Startup and shutdown events""" - global background_task + """Startup and shutdown events.""" + global background_task, cleanup_task + + # 
Initialize database + init_db() + print("Database initialized") + # Start background health checks background_task = asyncio.create_task(periodic_health_check()) + cleanup_task = asyncio.create_task(periodic_cleanup()) + yield - # Cancel background task on shutdown - if background_task: - background_task.cancel() - try: - await background_task - except asyncio.CancelledError: - pass + + # Cancel background tasks on shutdown + for task in [background_task, cleanup_task]: + if task: + task.cancel() + try: + await task + except asyncio.CancelledError: + pass app = FastAPI( title="Status Monitor", - description="Service health monitoring", + description="Service health monitoring with persistence and alerting", lifespan=lifespan ) @@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates") @app.get("/", response_class=HTMLResponse) async def status_page(request: Request): - """Main status page""" + """Main status page.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() + ssl_status = monitor.get_ssl_status() + incidents = get_recent_incidents(limit=5) return templates.TemplateResponse( "index.html", @@ -72,6 +104,8 @@ async def status_page(request: Request): "request": request, "services": services, "overall_status": overall_status, + "ssl_status": ssl_status, + "incidents": incidents, "last_check": monitor.last_check, "check_interval": CHECK_INTERVAL } @@ -80,30 +114,52 @@ async def status_page(request: Request): @app.get("/api/status") async def api_status(): - """API endpoint for service statuses""" + """API endpoint for service statuses.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() + ssl_status = monitor.get_ssl_status() return { - "overall_status": overall_status, + "overall_status": overall_status.value, "services": {name: status.to_dict() for name, status in services.items()}, + "ssl": ssl_status, "last_check": monitor.last_check.isoformat() if monitor.last_check else None, "check_interval_seconds": CHECK_INTERVAL } +@app.get("/api/history/{service_name}") +async def api_history(service_name: str, hours: int = 24): + """API endpoint for service latency history.""" + history = get_latency_history(service_name, hours=hours) + return { + "service": service_name, + "hours": hours, + "data": history + } + + +@app.get("/api/incidents") +async def api_incidents(limit: int = 20): + """API endpoint for recent incidents.""" + incidents = get_recent_incidents(limit=limit) + return {"incidents": incidents} + + @app.get("/api/health") async def health(): - """Health check for this service""" + """Health check for this service.""" return {"status": "ok", "service": "status-monitor"} @app.post("/api/refresh") async def refresh_status(): - """Force refresh all service statuses""" + """Force refresh all service statuses.""" await monitor.check_all_services( backend_url=BACKEND_URL, frontend_url=FRONTEND_URL, - bot_url=BOT_URL + bot_url=BOT_URL, + external_url=EXTERNAL_URL, + public_url=PUBLIC_URL ) return {"status": "refreshed"} diff --git a/status-service/monitors.py b/status-service/monitors.py index 018cd01..9c7a730 100644 --- a/status-service/monitors.py +++ b/status-service/monitors.py @@ -1,11 +1,19 @@ +"""Service monitoring with persistence and alerting.""" import asyncio from datetime import datetime, timedelta -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Optional from enum import Enum import httpx +from database import ( + save_metric, get_latency_history, 
get_uptime_stats, get_avg_latency, + create_incident, resolve_incident, get_open_incident, mark_incident_notified +) +from alerts import alert_service_down, alert_service_recovered +from ssl_monitor import check_and_alert_ssl, SSLInfo + class Status(str, Enum): OPERATIONAL = "operational" @@ -25,11 +33,17 @@ class ServiceStatus: uptime_percent: float = 100.0 message: Optional[str] = None version: Optional[str] = None + avg_latency_24h: Optional[float] = None + latency_history: list = None - # For uptime calculation + # For uptime calculation (in-memory, backed by DB) total_checks: int = 0 successful_checks: int = 0 + def __post_init__(self): + if self.latency_history is None: + self.latency_history = [] + def to_dict(self) -> dict: return { "name": self.name, @@ -40,7 +54,8 @@ class ServiceStatus: "last_incident": self.last_incident.isoformat() if self.last_incident else None, "uptime_percent": round(self.uptime_percent, 2), "message": self.message, - "version": self.version + "version": self.version, + "avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None, } def update_uptime(self, is_success: bool): @@ -69,12 +84,17 @@ class ServiceMonitor: "bot": ServiceStatus( name="bot", display_name="Telegram Bot" - ) + ), + "external": ServiceStatus( + name="external", + display_name="External Access" + ), } self.last_check: Optional[datetime] = None + self.ssl_info: Optional[SSLInfo] = None async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]: - """Check backend API health""" + """Check backend API health.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -92,9 +112,7 @@ class ServiceMonitor: return Status.DOWN, None, str(e)[:100], None async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]: - """Check database through backend""" - # We check database indirectly - if backend is up, DB is likely up - # Could add a specific /health/db endpoint to backend later + """Check database through backend.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -109,7 +127,7 @@ class ServiceMonitor: return Status.DOWN, None, "Cannot reach backend" async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: - """Check frontend availability""" + """Check frontend availability.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -126,7 +144,7 @@ class ServiceMonitor: return Status.DOWN, None, str(e)[:100] async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: - """Check Telegram bot health""" + """Check Telegram bot health.""" try: async with httpx.AsyncClient(timeout=10.0) as client: start = datetime.now() @@ -142,8 +160,93 @@ class ServiceMonitor: except Exception as e: return Status.DOWN, None, str(e)[:100] - async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str): - """Check all services concurrently""" + async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: + """Check external (public) URL availability.""" + if not url: + return Status.UNKNOWN, None, "Not configured" + + try: + async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client: + start = datetime.now() + response = await client.get(url) + latency = (datetime.now() - start).total_seconds() * 1000 + + if response.status_code == 200: + return Status.OPERATIONAL, latency, None + else: 
+ return Status.DEGRADED, latency, f"HTTP {response.status_code}" + except httpx.TimeoutException: + return Status.DOWN, None, "Timeout" + except Exception as e: + return Status.DOWN, None, str(e)[:100] + + async def _process_check_result( + self, + service_name: str, + result: tuple, + now: datetime + ): + """Process check result with DB persistence and alerting.""" + if isinstance(result, Exception): + return + + if len(result) == 4: + status, latency, message, version = result + else: + status, latency, message = result + version = None + + svc = self.services[service_name] + was_down = svc.status in (Status.DOWN, Status.DEGRADED) + is_down = status in (Status.DOWN, Status.DEGRADED) + + # Update service status + svc.status = status + svc.latency_ms = latency + svc.message = message + if version: + svc.version = version + svc.last_check = now + svc.update_uptime(status == Status.OPERATIONAL) + + # Save metric to database + save_metric(service_name, status.value, latency, message) + + # Load historical data + svc.latency_history = get_latency_history(service_name, hours=24) + svc.avg_latency_24h = get_avg_latency(service_name, hours=24) + + # Update uptime from DB + stats = get_uptime_stats(service_name, hours=24) + if stats["total_checks"] > 0: + svc.uptime_percent = stats["uptime_percent"] + + # Handle incident tracking and alerting + if is_down and not was_down: + # Service just went down + svc.last_incident = now + incident_id = create_incident(service_name, status.value, message) + await alert_service_down(service_name, svc.display_name, message) + mark_incident_notified(incident_id) + + elif not is_down and was_down: + # Service recovered + open_incident = get_open_incident(service_name) + if open_incident: + started_at = datetime.fromisoformat(open_incident["started_at"]) + downtime_minutes = int((now - started_at).total_seconds() / 60) + resolve_incident(service_name) + await alert_service_recovered(service_name, svc.display_name, downtime_minutes) + + async def check_all_services( + self, + backend_url: str, + frontend_url: str, + bot_url: str, + external_url: str = "", + public_url: str = "" + ): + """Check all services concurrently.""" now = datetime.now() # Run all checks concurrently @@ -152,61 +255,18 @@ class ServiceMonitor: self.check_database(backend_url), self.check_frontend(frontend_url), self.check_bot(bot_url), + self.check_external(external_url), return_exceptions=True ) - # Process backend result - if not isinstance(results[0], Exception): - status, latency, message, version = results[0] - svc = self.services["backend"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.version = version - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now + # Process results + service_names = ["backend", "database", "frontend", "bot", "external"] + for i, service_name in enumerate(service_names): + await self._process_check_result(service_name, results[i], now) - # Process database result - if not isinstance(results[1], Exception): - status, latency, message = results[1] - svc = self.services["database"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now - - # Process frontend result - if not 
isinstance(results[2], Exception): - status, latency, message = results[2] - svc = self.services["frontend"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now - - # Process bot result - if not isinstance(results[3], Exception): - status, latency, message = results[3] - svc = self.services["bot"] - was_down = svc.status == Status.DOWN - svc.status = status - svc.latency_ms = latency - svc.message = message - svc.last_check = now - svc.update_uptime(status == Status.OPERATIONAL) - if status != Status.OPERATIONAL and not was_down: - svc.last_incident = now + # Check SSL certificate (if public URL is HTTPS) + if public_url and public_url.startswith("https://"): + self.ssl_info = await check_and_alert_ssl(public_url) self.last_check = now @@ -214,8 +274,12 @@ class ServiceMonitor: return self.services def get_overall_status(self) -> Status: - """Get overall system status based on all services""" - statuses = [svc.status for svc in self.services.values()] + """Get overall system status based on all services.""" + # Exclude external from overall status if not configured + statuses = [ + svc.status for name, svc in self.services.items() + if name != "external" or svc.status != Status.UNKNOWN + ] if all(s == Status.OPERATIONAL for s in statuses): return Status.OPERATIONAL @@ -225,3 +289,17 @@ class ServiceMonitor: return Status.DEGRADED else: return Status.UNKNOWN + + def get_ssl_status(self) -> Optional[dict]: + """Get SSL certificate status.""" + if not self.ssl_info: + return None + + return { + "domain": self.ssl_info.domain, + "issuer": self.ssl_info.issuer, + "expires_at": self.ssl_info.expires_at.isoformat(), + "days_until_expiry": self.ssl_info.days_until_expiry, + "is_valid": self.ssl_info.is_valid, + "error": self.ssl_info.error + } diff --git a/status-service/ssl_monitor.py b/status-service/ssl_monitor.py new file mode 100644 index 0000000..9837eed --- /dev/null +++ b/status-service/ssl_monitor.py @@ -0,0 +1,140 @@ +"""SSL certificate monitoring.""" +import ssl +import socket +from datetime import datetime, timezone +from dataclasses import dataclass +from typing import Optional +from urllib.parse import urlparse + +from database import save_ssl_info, get_ssl_info +from alerts import alert_ssl_expiring, alert_ssl_expired + + +@dataclass +class SSLInfo: + domain: str + issuer: str + expires_at: datetime + days_until_expiry: int + is_valid: bool + error: Optional[str] = None + + +def check_ssl_certificate(url: str) -> Optional[SSLInfo]: + """Check SSL certificate for a URL.""" + try: + parsed = urlparse(url) + hostname = parsed.hostname + + if not hostname: + return None + + # Skip non-HTTPS or localhost + if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"): + return None + + context = ssl.create_default_context() + conn = context.wrap_socket( + socket.socket(socket.AF_INET), + server_hostname=hostname + ) + conn.settimeout(10.0) + + try: + conn.connect((hostname, parsed.port or 443)) + cert = conn.getpeercert() + finally: + conn.close() + + if not cert: + return SSLInfo( + domain=hostname, + issuer="Unknown", + expires_at=datetime.now(timezone.utc), + days_until_expiry=0, + is_valid=False, + error="No certificate found" + ) + + # Parse expiry date + not_after = cert.get("notAfter", "") + expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z") + 
expires_at = expires_at.replace(tzinfo=timezone.utc) + + # Calculate days until expiry + now = datetime.now(timezone.utc) + days_until_expiry = (expires_at - now).days + + # Get issuer + issuer_parts = cert.get("issuer", ()) + issuer = "Unknown" + for part in issuer_parts: + for key, value in part: + if key == "organizationName": + issuer = value + break + + return SSLInfo( + domain=hostname, + issuer=issuer, + expires_at=expires_at, + days_until_expiry=days_until_expiry, + is_valid=days_until_expiry > 0 + ) + + except ssl.SSLCertVerificationError as e: + hostname = urlparse(url).hostname or url + return SSLInfo( + domain=hostname, + issuer="Invalid", + expires_at=datetime.now(timezone.utc), + days_until_expiry=0, + is_valid=False, + error=f"SSL verification failed: {str(e)[:100]}" + ) + except Exception as e: + hostname = urlparse(url).hostname or url + return SSLInfo( + domain=hostname, + issuer="Unknown", + expires_at=datetime.now(timezone.utc), + days_until_expiry=0, + is_valid=False, + error=str(e)[:100] + ) + + +async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]: + """Check SSL and send alerts if needed.""" + ssl_info = check_ssl_certificate(url) + + if not ssl_info: + return None + + # Save to database + save_ssl_info( + domain=ssl_info.domain, + issuer=ssl_info.issuer, + expires_at=ssl_info.expires_at, + days_until_expiry=ssl_info.days_until_expiry + ) + + # Check if we need to alert + prev_info = get_ssl_info(ssl_info.domain) + + if ssl_info.days_until_expiry <= 0: + # Certificate expired + await alert_ssl_expired(ssl_info.domain) + elif ssl_info.days_until_expiry <= warn_days: + # Certificate expiring soon - alert once per day + should_alert = True + if prev_info and prev_info.get("checked_at"): + # Check if we already alerted today + last_check = datetime.fromisoformat(prev_info["checked_at"]) + if (datetime.now() - last_check).days < 1: + should_alert = False + + if should_alert: + await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry) + + return ssl_info diff --git a/status-service/templates/index.html b/status-service/templates/index.html index ab50cbf..cfadd2d 100644 --- a/status-service/templates/index.html +++ b/status-service/templates/index.html @@ -4,6 +4,7 @@ System Status +
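The new JSON endpoints can also be consumed programmatically. A quick httpx-based sketch, assuming the service is reachable on localhost:8001 as mapped in docker-compose.yml:

import httpx

BASE = "http://localhost:8001"  # host/port assumed from the compose port mapping

status = httpx.get(f"{BASE}/api/status", timeout=10).json()
print(status["overall_status"], status["last_check"])
for name, svc in status["services"].items():
    print(f"  {name}: {svc['status']} ({svc['latency_ms']} ms, {svc['uptime_percent']}% uptime)")

history = httpx.get(f"{BASE}/api/history/backend", params={"hours": 24}, timeout=10).json()
print(f"{len(history['data'])} latency samples for backend over the last 24h")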