""" Enhanced Health Check Endpoints Comprehensive service health monitoring """ from fastapi import APIRouter, Depends, HTTPException from sqlalchemy import text from typing import Dict, Any import psutil import os from datetime import datetime, timezone import logging from app.core.database import database_manager from app.utils.circuit_breaker import circuit_breaker_registry from app.core.config import settings logger = logging.getLogger(__name__) router = APIRouter() async def check_database_health() -> Dict[str, Any]: """Check database connectivity and performance""" try: start_time = datetime.now(timezone.utc) async with database_manager.async_engine.begin() as conn: # Simple connectivity check await conn.execute(text("SELECT 1")) # Check if we can access training tables result = await conn.execute( text("SELECT COUNT(*) FROM trained_models") ) model_count = result.scalar() # Check connection pool stats pool = database_manager.async_engine.pool pool_size = pool.size() pool_checked_out = pool.checked_out_connections() response_time = (datetime.now(timezone.utc) - start_time).total_seconds() return { "status": "healthy", "response_time_seconds": round(response_time, 3), "model_count": model_count, "connection_pool": { "size": pool_size, "checked_out": pool_checked_out, "available": pool_size - pool_checked_out } } except Exception as e: logger.error(f"Database health check failed: {e}") return { "status": "unhealthy", "error": str(e) } def check_system_resources() -> Dict[str, Any]: """Check system resource usage""" try: cpu_percent = psutil.cpu_percent(interval=0.1) memory = psutil.virtual_memory() disk = psutil.disk_usage('/') return { "status": "healthy", "cpu": { "usage_percent": cpu_percent, "count": psutil.cpu_count() }, "memory": { "total_mb": round(memory.total / 1024 / 1024, 2), "used_mb": round(memory.used / 1024 / 1024, 2), "available_mb": round(memory.available / 1024 / 1024, 2), "usage_percent": memory.percent }, "disk": { "total_gb": round(disk.total / 1024 / 1024 / 1024, 2), "used_gb": round(disk.used / 1024 / 1024 / 1024, 2), "free_gb": round(disk.free / 1024 / 1024 / 1024, 2), "usage_percent": disk.percent } } except Exception as e: logger.error(f"System resource check failed: {e}") return { "status": "error", "error": str(e) } def check_model_storage() -> Dict[str, Any]: """Check MinIO model storage health""" try: from shared.clients.minio_client import minio_client # Check MinIO connectivity if not minio_client.health_check(): return { "status": "unhealthy", "message": "MinIO service is not reachable", "storage_type": "minio" } bucket_name = settings.MINIO_MODEL_BUCKET # Check if bucket exists bucket_exists = minio_client.bucket_exists(bucket_name) if not bucket_exists: return { "status": "warning", "message": f"MinIO bucket does not exist: {bucket_name}", "storage_type": "minio" } # Count model files in MinIO model_objects = minio_client.list_objects(bucket_name, prefix="models/") model_files = [obj for obj in model_objects if obj.endswith('.pkl')] return { "status": "healthy", "storage_type": "minio", "endpoint": settings.MINIO_ENDPOINT, "bucket": bucket_name, "use_ssl": settings.MINIO_USE_SSL, "model_files": len(model_files), "bucket_exists": bucket_exists } except Exception as e: logger.error(f"MinIO storage check failed: {e}") return { "status": "error", "storage_type": "minio", "error": str(e) } @router.get("/health") async def health_check() -> Dict[str, Any]: """ Basic health check endpoint. Returns 200 if service is running. """ return { "status": "healthy", "service": "training-service", "timestamp": datetime.now(timezone.utc).isoformat() } @router.get("/health/detailed") async def detailed_health_check() -> Dict[str, Any]: """ Detailed health check with component status. Includes database, system resources, and dependencies. """ database_health = await check_database_health() system_health = check_system_resources() storage_health = check_model_storage() circuit_breakers = circuit_breaker_registry.get_all_states() # Determine overall status component_statuses = [ database_health.get("status"), system_health.get("status"), storage_health.get("status") ] if "unhealthy" in component_statuses or "error" in component_statuses: overall_status = "unhealthy" elif "degraded" in component_statuses or "warning" in component_statuses: overall_status = "degraded" else: overall_status = "healthy" return { "status": overall_status, "service": "training-service", "version": "1.0.0", "timestamp": datetime.now(timezone.utc).isoformat(), "components": { "database": database_health, "system": system_health, "storage": storage_health }, "circuit_breakers": circuit_breakers, "configuration": { "max_concurrent_jobs": settings.MAX_CONCURRENT_TRAINING_JOBS, "min_training_days": settings.MIN_TRAINING_DATA_DAYS, "pool_size": settings.DB_POOL_SIZE, "pool_max_overflow": settings.DB_MAX_OVERFLOW } } @router.get("/health/ready") async def readiness_check() -> Dict[str, Any]: """ Readiness check for Kubernetes. Returns 200 only if service is ready to accept traffic. """ database_health = await check_database_health() if database_health.get("status") != "healthy": raise HTTPException( status_code=503, detail="Service not ready: database unavailable" ) storage_health = check_model_storage() if storage_health.get("status") == "error": raise HTTPException( status_code=503, detail="Service not ready: model storage unavailable" ) return { "status": "ready", "timestamp": datetime.now(timezone.utc).isoformat() } @router.get("/health/live") async def liveness_check() -> Dict[str, Any]: """ Liveness check for Kubernetes. Returns 200 if service process is alive. """ return { "status": "alive", "timestamp": datetime.now(timezone.utc).isoformat(), "pid": os.getpid() } @router.get("/metrics/system") async def system_metrics() -> Dict[str, Any]: """ Detailed system metrics for monitoring. """ process = psutil.Process(os.getpid()) return { "timestamp": datetime.now(timezone.utc).isoformat(), "process": { "pid": os.getpid(), "cpu_percent": process.cpu_percent(interval=0.1), "memory_mb": round(process.memory_info().rss / 1024 / 1024, 2), "threads": process.num_threads(), "open_files": len(process.open_files()), "connections": len(process.connections()) }, "system": check_system_resources() }