262 lines
7.8 KiB
Python
262 lines
7.8 KiB
Python
|
|
"""
|
||
|
|
Enhanced Health Check Endpoints
|
||
|
|
Comprehensive service health monitoring
|
||
|
|
"""
|
||
|
|
|
||
|
|
from fastapi import APIRouter, Depends, HTTPException
|
||
|
|
from sqlalchemy import text
|
||
|
|
from typing import Dict, Any
|
||
|
|
import psutil
|
||
|
|
import os
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
import logging
|
||
|
|
|
||
|
|
from app.core.database import database_manager
|
||
|
|
from app.utils.circuit_breaker import circuit_breaker_registry
|
||
|
|
from app.core.config import settings
|
||
|
|
|
||
|
|
# Module-level logger named after this module; used by the check helpers to report failures.
logger = logging.getLogger(__name__)
|
||
|
|
# Router on which the health and metrics endpoints in this module are registered.
router = APIRouter()
|
||
|
|
|
||
|
|
|
||
|
|
async def check_database_health() -> Dict[str, Any]:
    """Check database connectivity and report basic pool statistics.

    Opens a connection through ``database_manager.async_engine``, runs
    ``SELECT 1`` followed by ``SELECT COUNT(*) FROM trained_models``, and
    reads the engine's connection-pool counters.

    Returns:
        On success, a dict with ``status`` set to ``"healthy"``, the elapsed
        time in seconds (rounded to 3 decimals), the model count and a
        ``connection_pool`` section (``size``, ``checked_out``,
        ``available``). On any exception, a dict with ``status`` set to
        ``"unhealthy"`` and the exception text under ``error``; the
        exception itself is logged and never propagated.
    """
    try:
        start_time = datetime.now(timezone.utc)

        async with database_manager.async_engine.begin() as conn:
            # Simple connectivity check
            await conn.execute(text("SELECT 1"))

            # Check that the training tables are reachable
            result = await conn.execute(
                text("SELECT COUNT(*) FROM trained_models")
            )
            model_count = result.scalar()

        # Connection pool stats. SQLAlchemy's QueuePool exposes the number of
        # connections currently in use as ``checkedout()``; the previously used
        # ``checked_out_connections()`` is not part of that API, so the call
        # raised AttributeError and the check reported "unhealthy" even when
        # the database was reachable.
        pool = database_manager.async_engine.pool
        pool_size = pool.size()
        pool_checked_out = pool.checkedout()

        response_time = (datetime.now(timezone.utc) - start_time).total_seconds()

        return {
            "status": "healthy",
            "response_time_seconds": round(response_time, 3),
            "model_count": model_count,
            "connection_pool": {
                "size": pool_size,
                "checked_out": pool_checked_out,
                "available": pool_size - pool_checked_out,
            },
        }

    except Exception as e:
        # Top-level boundary of the health check: report the failure in the
        # payload instead of letting it bubble up to the endpoint.
        logger.error("Database health check failed: %s", e)
        return {
            "status": "unhealthy",
            "error": str(e),
        }
|
||
|
|
|
||
|
|
|
||
|
|
def check_system_resources() -> Dict[str, Any]:
    """Collect CPU, memory and root-filesystem usage via psutil.

    Returns:
        A dict with ``status`` set to ``"healthy"`` plus ``cpu``, ``memory``
        (values in MB) and ``disk`` (values in GB, for ``'/'``) sections.
        If any psutil call raises, the error is logged and a dict with
        ``status`` set to ``"error"`` and the exception text is returned.
    """

    def _as_mb(num_bytes):
        # Bytes -> mebibytes, two decimals.
        return round(num_bytes / 1024 / 1024, 2)

    def _as_gb(num_bytes):
        # Bytes -> gibibytes, two decimals.
        return round(num_bytes / 1024 / 1024 / 1024, 2)

    try:
        # cpu_percent is called with interval=0.1, so this call blocks briefly.
        cpu_usage = psutil.cpu_percent(interval=0.1)
        mem = psutil.virtual_memory()
        root_disk = psutil.disk_usage('/')

        cpu_section = {
            "usage_percent": cpu_usage,
            "count": psutil.cpu_count(),
        }
        memory_section = {
            "total_mb": _as_mb(mem.total),
            "used_mb": _as_mb(mem.used),
            "available_mb": _as_mb(mem.available),
            "usage_percent": mem.percent,
        }
        disk_section = {
            "total_gb": _as_gb(root_disk.total),
            "used_gb": _as_gb(root_disk.used),
            "free_gb": _as_gb(root_disk.free),
            "usage_percent": root_disk.percent,
        }
    except Exception as exc:
        logger.error(f"System resource check failed: {exc}")
        return {"status": "error", "error": str(exc)}

    return {
        "status": "healthy",
        "cpu": cpu_section,
        "memory": memory_section,
        "disk": disk_section,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def check_model_storage() -> Dict[str, Any]:
    """Check that the model storage directory exists, is writable, and size it up.

    Reads the directory from ``settings.MODEL_STORAGE_PATH``, probes
    writability by creating a temporary file in it, then walks the tree
    counting ``.pkl`` files and summing their sizes.

    Returns:
        * ``{"status": "warning", "message": ...}`` when the path does not exist.
        * Otherwise a dict with ``status`` (``"healthy"`` if the probe write
          succeeded, else ``"degraded"``), ``path``, ``writable``,
          ``model_files`` and ``total_size_mb``.
        * ``{"status": "error", "error": ...}`` if anything else raises; the
          exception is logged and never propagated.
    """
    # Local import keeps this function self-contained.
    import tempfile

    try:
        storage_path = settings.MODEL_STORAGE_PATH

        if not os.path.exists(storage_path):
            return {
                "status": "warning",
                "message": f"Model storage path does not exist: {storage_path}",
            }

        # Probe writability with a uniquely named temporary file. A fixed
        # file name lets concurrent probes delete each other's file and
        # misreport the directory as read-only; the context manager also
        # removes the probe when the write itself fails.
        try:
            with tempfile.NamedTemporaryFile(
                mode="w", dir=storage_path, prefix=".health_check_"
            ) as probe:
                probe.write("test")
                probe.flush()
            writable = True
        except OSError:
            writable = False

        # Count model files and accumulate their size.
        model_files = 0
        total_size = 0
        for root, _dirs, files in os.walk(storage_path):
            for name in files:
                if not name.endswith(".pkl"):
                    continue
                try:
                    total_size += os.path.getsize(os.path.join(root, name))
                except OSError:
                    # The file disappeared (or became unreadable) between the
                    # directory listing and the stat call; skip it rather
                    # than failing the whole check.
                    continue
                model_files += 1

        return {
            "status": "healthy" if writable else "degraded",
            "path": storage_path,
            "writable": writable,
            "model_files": model_files,
            "total_size_mb": round(total_size / 1024 / 1024, 2),
        }

    except Exception as e:
        # Top-level boundary of the check: surface the failure in the payload.
        logger.error("Model storage check failed: %s", e)
        return {
            "status": "error",
            "error": str(e),
        }
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/health")
async def health_check() -> Dict[str, Any]:
    """
    Basic health endpoint.

    Performs no dependency checks: it always answers with a "healthy"
    status, the service name and the current UTC time in ISO-8601 form.
    """
    now_utc = datetime.now(timezone.utc)
    payload: Dict[str, Any] = {
        "status": "healthy",
        "service": "training-service",
        "timestamp": now_utc.isoformat(),
    }
    return payload
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/health/detailed")
async def detailed_health_check() -> Dict[str, Any]:
    """
    Detailed health check with per-component status.

    Runs the database, system-resource and model-storage checks, collects
    the circuit-breaker states and a few configuration values, and derives
    an overall status from the three component checks:

    * "unhealthy" if any component reports "unhealthy" or "error",
    * "degraded" if any component reports "degraded" or "warning",
    * "healthy" otherwise.

    Circuit-breaker states are included in the response but do not
    influence the overall status.
    """
    # Local import keeps this function self-contained.
    import asyncio

    # The system and storage checks are synchronous and blocking (a CPU
    # sampling interval and filesystem I/O). Running them in the default
    # executor keeps the event loop free to serve other requests, and lets
    # them overlap with the database check.
    loop = asyncio.get_running_loop()
    database_health, system_health, storage_health = await asyncio.gather(
        check_database_health(),
        loop.run_in_executor(None, check_system_resources),
        loop.run_in_executor(None, check_model_storage),
    )
    circuit_breakers = circuit_breaker_registry.get_all_states()

    # Determine overall status from the component statuses.
    component_statuses = {
        database_health.get("status"),
        system_health.get("status"),
        storage_health.get("status"),
    }

    if component_statuses & {"unhealthy", "error"}:
        overall_status = "unhealthy"
    elif component_statuses & {"degraded", "warning"}:
        overall_status = "degraded"
    else:
        overall_status = "healthy"

    return {
        "status": overall_status,
        "service": "training-service",
        "version": "1.0.0",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "components": {
            "database": database_health,
            "system": system_health,
            "storage": storage_health,
        },
        "circuit_breakers": circuit_breakers,
        "configuration": {
            "max_concurrent_jobs": settings.MAX_CONCURRENT_TRAINING_JOBS,
            "min_training_days": settings.MIN_TRAINING_DATA_DAYS,
            "pool_size": settings.DB_POOL_SIZE,
            "pool_max_overflow": settings.DB_MAX_OVERFLOW,
        },
    }
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/health/ready")
async def readiness_check() -> Dict[str, Any]:
    """
    Readiness check for Kubernetes.

    Returns a "ready" payload with the current UTC timestamp when the
    service can accept traffic.

    Raises:
        HTTPException: 503 when the database check does not report
            "healthy", or when the model-storage check reports "error".
            Storage statuses "warning" and "degraded" do not block
            readiness.
    """
    # Local import keeps this function self-contained.
    import asyncio

    database_health = await check_database_health()
    if database_health.get("status") != "healthy":
        raise HTTPException(
            status_code=503,
            detail="Service not ready: database unavailable",
        )

    # check_model_storage does blocking filesystem I/O (probe write plus a
    # directory walk); run it in the default executor so frequent probes do
    # not stall the event loop.
    loop = asyncio.get_running_loop()
    storage_health = await loop.run_in_executor(None, check_model_storage)
    if storage_health.get("status") == "error":
        raise HTTPException(
            status_code=503,
            detail="Service not ready: model storage unavailable",
        )

    return {
        "status": "ready",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/health/live")
async def liveness_check() -> Dict[str, Any]:
    """
    Liveness check for Kubernetes.

    Performs no dependency checks: answers with an "alive" status, the
    current UTC time in ISO-8601 form and the PID of the serving process.
    """
    checked_at = datetime.now(timezone.utc).isoformat()
    current_pid = os.getpid()
    return {"status": "alive", "timestamp": checked_at, "pid": current_pid}
|
||
|
|
|
||
|
|
|
||
|
|
@router.get("/metrics/system")
async def system_metrics() -> Dict[str, Any]:
    """
    Detailed process and system metrics for monitoring.

    Returns a dict with the current UTC timestamp, a ``process`` section
    for the serving process (PID, CPU percent, resident memory in MB,
    thread count, number of open files and connections) and a ``system``
    section produced by ``check_system_resources()``.
    """
    # Local import keeps this function self-contained.
    import asyncio

    def _collect() -> Dict[str, Any]:
        # Synchronous collection: both cpu_percent(interval=0.1) here and the
        # sampling inside check_system_resources() block while they measure.
        process = psutil.Process(os.getpid())
        return {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "process": {
                "pid": os.getpid(),
                "cpu_percent": process.cpu_percent(interval=0.1),
                "memory_mb": round(process.memory_info().rss / 1024 / 1024, 2),
                "threads": process.num_threads(),
                "open_files": len(process.open_files()),
                "connections": len(process.connections()),
            },
            "system": check_system_resources(),
        }

    # Run the blocking sampling in the default executor so the event loop
    # stays responsive while the metrics are gathered.
    return await asyncio.get_running_loop().run_in_executor(None, _collect)
|