Files
bakery-ia/services/training/app/api/health.py

262 lines
7.8 KiB
Python
Raw Normal View History

"""
Enhanced Health Check Endpoints
Comprehensive service health monitoring
"""
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import text
from typing import Dict, Any
import psutil
import os
from datetime import datetime, timezone
import logging
from app.core.database import database_manager
from app.utils.circuit_breaker import circuit_breaker_registry
from app.core.config import settings
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router on which the health and metrics endpoints in this module are registered.
router = APIRouter()
async def check_database_health() -> Dict[str, Any]:
    """Check database connectivity and report basic performance figures.

    Runs ``SELECT 1`` and a row count on ``trained_models`` through
    ``database_manager.async_engine``, times the round trip, and samples the
    engine's connection-pool counters.

    Returns:
        On success: ``status`` ``"healthy"``, ``response_time_seconds``
        (rounded to 3 decimals), ``model_count`` and a ``connection_pool``
        dict with ``size``, ``checked_out`` and ``available``. A counter the
        pool does not provide is reported as ``None``.
        On any exception: ``status`` ``"unhealthy"`` and ``error`` (the
        exception text). The exception is logged, not raised.
    """

    def _pool_counter(pool: Any, method_name: str) -> Any:
        # A pool class may not expose a given counter; report None for it
        # rather than letting an AttributeError fail the whole health check.
        method = getattr(pool, method_name, None)
        return method() if callable(method) else None

    try:
        start_time = datetime.now(timezone.utc)
        async with database_manager.async_engine.begin() as conn:
            # Simple connectivity check
            await conn.execute(text("SELECT 1"))

            # Check that the training tables are reachable
            result = await conn.execute(
                text("SELECT COUNT(*) FROM trained_models")
            )
            model_count = result.scalar()

            # Connection pool stats. SQLAlchemy names the "in use" counter
            # checkedout(); there is no checked_out_connections() method.
            pool = database_manager.async_engine.pool
            pool_size = _pool_counter(pool, "size")
            pool_checked_out = _pool_counter(pool, "checkedout")

        response_time = (datetime.now(timezone.utc) - start_time).total_seconds()

        if pool_size is None or pool_checked_out is None:
            pool_available = None
        else:
            pool_available = pool_size - pool_checked_out

        return {
            "status": "healthy",
            "response_time_seconds": round(response_time, 3),
            "model_count": model_count,
            "connection_pool": {
                "size": pool_size,
                "checked_out": pool_checked_out,
                "available": pool_available
            }
        }
    except Exception as e:
        # Health probes must answer rather than raise, so every failure is
        # turned into an "unhealthy" report.
        logger.error(f"Database health check failed: {e}")
        return {
            "status": "unhealthy",
            "error": str(e)
        }
def check_system_resources() -> Dict[str, Any]:
    """Collect a snapshot of host CPU, memory and root-disk usage via psutil.

    CPU usage is sampled over a 0.1-second interval; disk figures are for
    the ``/`` mount point.

    Returns:
        ``status`` ``"healthy"`` plus ``cpu``, ``memory`` (sizes in MB) and
        ``disk`` (sizes in GB) sections when collection succeeds, or
        ``status`` ``"error"`` with the ``error`` text when any psutil call
        raises. The exception is logged, not propagated.
    """

    def _as_mb(num_bytes):
        return round(num_bytes / 1024 / 1024, 2)

    def _as_gb(num_bytes):
        return round(num_bytes / 1024 / 1024 / 1024, 2)

    try:
        cpu_load = psutil.cpu_percent(interval=0.1)
        ram = psutil.virtual_memory()
        root_fs = psutil.disk_usage('/')

        cpu_section = {
            "usage_percent": cpu_load,
            "count": psutil.cpu_count(),
        }
        memory_section = {
            "total_mb": _as_mb(ram.total),
            "used_mb": _as_mb(ram.used),
            "available_mb": _as_mb(ram.available),
            "usage_percent": ram.percent,
        }
        disk_section = {
            "total_gb": _as_gb(root_fs.total),
            "used_gb": _as_gb(root_fs.used),
            "free_gb": _as_gb(root_fs.free),
            "usage_percent": root_fs.percent,
        }
    except Exception as exc:
        logger.error(f"System resource check failed: {exc}")
        return {"status": "error", "error": str(exc)}

    return {
        "status": "healthy",
        "cpu": cpu_section,
        "memory": memory_section,
        "disk": disk_section,
    }
def check_model_storage() -> Dict[str, Any]:
    """Check that the model storage directory exists, is writable, and size it up.

    Reads the directory from ``settings.MODEL_STORAGE_PATH``, probes
    writability by creating a throw-away file in it, then walks the tree
    counting ``.pkl`` files and summing their sizes.

    Returns:
        * ``status`` ``"warning"`` with a ``message`` when the path does not
          exist.
        * Otherwise ``status`` ``"healthy"`` (writable) or ``"degraded"``
          (not writable) together with ``path``, ``writable``,
          ``model_files`` and ``total_size_mb``.
        * ``status`` ``"error"`` with the ``error`` text if anything else
          raises. The exception is logged, not propagated.
    """
    # Imported locally so this block does not depend on the module's import list.
    import tempfile

    try:
        storage_path = settings.MODEL_STORAGE_PATH
        if not os.path.exists(storage_path):
            return {
                "status": "warning",
                "message": f"Model storage path does not exist: {storage_path}"
            }

        # Writability probe. A uniquely named temp file is used so that
        # concurrent probes against the same directory (other workers or
        # replicas) cannot delete each other's probe file and report a
        # false "not writable". The file is removed when the context exits.
        try:
            with tempfile.NamedTemporaryFile(
                mode="w", dir=storage_path, prefix=".health_check_"
            ) as probe:
                probe.write("test")
            writable = True
        except OSError:
            writable = False

        # Count model files
        model_files = 0
        total_size = 0
        for root, _dirs, files in os.walk(storage_path):
            for file_name in files:
                if not file_name.endswith('.pkl'):
                    continue
                try:
                    file_size = os.path.getsize(os.path.join(root, file_name))
                except OSError:
                    # The file vanished or became unreadable between the
                    # directory listing and the stat call; skip it instead
                    # of failing the whole check.
                    continue
                model_files += 1
                total_size += file_size

        return {
            "status": "healthy" if writable else "degraded",
            "path": storage_path,
            "writable": writable,
            "model_files": model_files,
            "total_size_mb": round(total_size / 1024 / 1024, 2)
        }
    except Exception as e:
        logger.error(f"Model storage check failed: {e}")
        return {
            "status": "error",
            "error": str(e)
        }
@router.get("/health")
async def health_check() -> Dict[str, Any]:
    """
    Basic health check endpoint.

    Performs no dependency checks: it always answers with a "healthy"
    status, the service name and the current UTC timestamp.
    """
    payload: Dict[str, Any] = {
        "status": "healthy",
        "service": "training-service",
    }
    payload["timestamp"] = datetime.now(timezone.utc).isoformat()
    return payload
@router.get("/health/detailed")
async def detailed_health_check() -> Dict[str, Any]:
    """
    Detailed health check with per-component status.

    Runs the database, system-resource and model-storage checks, collects
    the circuit-breaker states, and derives an overall status:
    "unhealthy" if any component reports "unhealthy" or "error",
    "degraded" if any reports "degraded" or "warning", otherwise "healthy".
    Selected configuration values from ``settings`` are echoed back.
    """
    # Dict display evaluates left to right, so the checks run in this order.
    components = {
        "database": await check_database_health(),
        "system": check_system_resources(),
        "storage": check_model_storage(),
    }
    breaker_states = circuit_breaker_registry.get_all_states()

    # Collapse the component statuses into the worst category present.
    reported = {component.get("status") for component in components.values()}
    if reported & {"unhealthy", "error"}:
        overall_status = "unhealthy"
    elif reported & {"degraded", "warning"}:
        overall_status = "degraded"
    else:
        overall_status = "healthy"

    report: Dict[str, Any] = {
        "status": overall_status,
        "service": "training-service",
        "version": "1.0.0",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    report["components"] = components
    report["circuit_breakers"] = breaker_states
    report["configuration"] = {
        "max_concurrent_jobs": settings.MAX_CONCURRENT_TRAINING_JOBS,
        "min_training_days": settings.MIN_TRAINING_DATA_DAYS,
        "pool_size": settings.DB_POOL_SIZE,
        "pool_max_overflow": settings.DB_MAX_OVERFLOW,
    }
    return report
@router.get("/health/ready")
async def readiness_check() -> Dict[str, Any]:
    """
    Readiness check for Kubernetes.

    Responds with HTTP 503 when the database check is anything other than
    "healthy", or when the model-storage check reports "error". The storage
    check is only run once the database check has passed. Otherwise returns
    a "ready" status with the current UTC timestamp.
    """

    def _unavailable(detail: str) -> HTTPException:
        return HTTPException(status_code=503, detail=detail)

    db_status = (await check_database_health()).get("status")
    if db_status != "healthy":
        raise _unavailable("Service not ready: database unavailable")

    storage_status = check_model_storage().get("status")
    if storage_status == "error":
        raise _unavailable("Service not ready: model storage unavailable")

    ready_at = datetime.now(timezone.utc)
    return {"status": "ready", "timestamp": ready_at.isoformat()}
@router.get("/health/live")
async def liveness_check() -> Dict[str, Any]:
    """
    Liveness check for Kubernetes.

    Performs no dependency checks: answers with an "alive" status, the
    current UTC timestamp and this process's PID.
    """
    probe: Dict[str, Any] = {"status": "alive"}
    probe["timestamp"] = datetime.now(timezone.utc).isoformat()
    probe["pid"] = os.getpid()
    return probe
@router.get("/metrics/system")
async def system_metrics() -> Dict[str, Any]:
    """
    Detailed system metrics for monitoring.

    Reports figures for the current process (PID, CPU percent sampled over
    0.1 s, resident memory in MB, thread count, number of open files and
    number of network connections) together with the host-level snapshot
    produced by ``check_system_resources()``.
    """
    process = psutil.Process(os.getpid())

    # psutil 6.0 renamed Process.connections() to Process.net_connections()
    # and deprecated the old name. Prefer the new method when it exists and
    # fall back to the old one so older psutil releases keep working.
    list_connections = getattr(process, "net_connections", None)
    if list_connections is None:
        list_connections = process.connections

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "process": {
            "pid": os.getpid(),
            "cpu_percent": process.cpu_percent(interval=0.1),
            "memory_mb": round(process.memory_info().rss / 1024 / 1024, 2),
            "threads": process.num_threads(),
            "open_files": len(process.open_files()),
            "connections": len(list_connections())
        },
        "system": check_system_resources()
    }