""" Enhanced Health Check Endpoints Comprehensive service health monitoring """ from fastapi import APIRouter, Depends, HTTPException from sqlalchemy import text from typing import Dict, Any import psutil import os from datetime import datetime, timezone import logging from app.core.database import database_manager from app.utils.circuit_breaker import circuit_breaker_registry from app.core.config import settings logger = logging.getLogger(__name__) router = APIRouter() async def check_database_health() -> Dict[str, Any]: """Check database connectivity and performance""" try: start_time = datetime.now(timezone.utc) async with database_manager.async_engine.begin() as conn: # Simple connectivity check await conn.execute(text("SELECT 1")) # Check if we can access training tables result = await conn.execute( text("SELECT COUNT(*) FROM trained_models") ) model_count = result.scalar() # Check connection pool stats pool = database_manager.async_engine.pool pool_size = pool.size() pool_checked_out = pool.checked_out_connections() response_time = (datetime.now(timezone.utc) - start_time).total_seconds() return { "status": "healthy", "response_time_seconds": round(response_time, 3), "model_count": model_count, "connection_pool": { "size": pool_size, "checked_out": pool_checked_out, "available": pool_size - pool_checked_out } } except Exception as e: logger.error(f"Database health check failed: {e}") return { "status": "unhealthy", "error": str(e) } def check_system_resources() -> Dict[str, Any]: """Check system resource usage""" try: cpu_percent = psutil.cpu_percent(interval=0.1) memory = psutil.virtual_memory() disk = psutil.disk_usage('/') return { "status": "healthy", "cpu": { "usage_percent": cpu_percent, "count": psutil.cpu_count() }, "memory": { "total_mb": round(memory.total / 1024 / 1024, 2), "used_mb": round(memory.used / 1024 / 1024, 2), "available_mb": round(memory.available / 1024 / 1024, 2), "usage_percent": memory.percent }, "disk": { "total_gb": round(disk.total / 1024 / 1024 / 1024, 2), "used_gb": round(disk.used / 1024 / 1024 / 1024, 2), "free_gb": round(disk.free / 1024 / 1024 / 1024, 2), "usage_percent": disk.percent } } except Exception as e: logger.error(f"System resource check failed: {e}") return { "status": "error", "error": str(e) } def check_model_storage() -> Dict[str, Any]: """Check model storage health""" try: storage_path = settings.MODEL_STORAGE_PATH if not os.path.exists(storage_path): return { "status": "warning", "message": f"Model storage path does not exist: {storage_path}" } # Check if writable test_file = os.path.join(storage_path, ".health_check") try: with open(test_file, 'w') as f: f.write("test") os.remove(test_file) writable = True except Exception: writable = False # Count model files model_files = 0 total_size = 0 for root, dirs, files in os.walk(storage_path): for file in files: if file.endswith('.pkl'): model_files += 1 file_path = os.path.join(root, file) total_size += os.path.getsize(file_path) return { "status": "healthy" if writable else "degraded", "path": storage_path, "writable": writable, "model_files": model_files, "total_size_mb": round(total_size / 1024 / 1024, 2) } except Exception as e: logger.error(f"Model storage check failed: {e}") return { "status": "error", "error": str(e) } @router.get("/health") async def health_check() -> Dict[str, Any]: """ Basic health check endpoint. Returns 200 if service is running. """ return { "status": "healthy", "service": "training-service", "timestamp": datetime.now(timezone.utc).isoformat() } @router.get("/health/detailed") async def detailed_health_check() -> Dict[str, Any]: """ Detailed health check with component status. Includes database, system resources, and dependencies. """ database_health = await check_database_health() system_health = check_system_resources() storage_health = check_model_storage() circuit_breakers = circuit_breaker_registry.get_all_states() # Determine overall status component_statuses = [ database_health.get("status"), system_health.get("status"), storage_health.get("status") ] if "unhealthy" in component_statuses or "error" in component_statuses: overall_status = "unhealthy" elif "degraded" in component_statuses or "warning" in component_statuses: overall_status = "degraded" else: overall_status = "healthy" return { "status": overall_status, "service": "training-service", "version": "1.0.0", "timestamp": datetime.now(timezone.utc).isoformat(), "components": { "database": database_health, "system": system_health, "storage": storage_health }, "circuit_breakers": circuit_breakers, "configuration": { "max_concurrent_jobs": settings.MAX_CONCURRENT_TRAINING_JOBS, "min_training_days": settings.MIN_TRAINING_DATA_DAYS, "pool_size": settings.DB_POOL_SIZE, "pool_max_overflow": settings.DB_MAX_OVERFLOW } } @router.get("/health/ready") async def readiness_check() -> Dict[str, Any]: """ Readiness check for Kubernetes. Returns 200 only if service is ready to accept traffic. """ database_health = await check_database_health() if database_health.get("status") != "healthy": raise HTTPException( status_code=503, detail="Service not ready: database unavailable" ) storage_health = check_model_storage() if storage_health.get("status") == "error": raise HTTPException( status_code=503, detail="Service not ready: model storage unavailable" ) return { "status": "ready", "timestamp": datetime.now(timezone.utc).isoformat() } @router.get("/health/live") async def liveness_check() -> Dict[str, Any]: """ Liveness check for Kubernetes. Returns 200 if service process is alive. """ return { "status": "alive", "timestamp": datetime.now(timezone.utc).isoformat(), "pid": os.getpid() } @router.get("/metrics/system") async def system_metrics() -> Dict[str, Any]: """ Detailed system metrics for monitoring. """ process = psutil.Process(os.getpid()) return { "timestamp": datetime.now(timezone.utc).isoformat(), "process": { "pid": os.getpid(), "cpu_percent": process.cpu_percent(interval=0.1), "memory_mb": round(process.memory_info().rss / 1024 / 1024, 2), "threads": process.num_threads(), "open_files": len(process.open_files()), "connections": len(process.connections()) }, "system": check_system_resources() }