REFACTOR external service and improve websocket training
This commit is contained in:
261
services/training/app/api/health.py
Normal file
261
services/training/app/api/health.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Enhanced Health Check Endpoints
|
||||
Comprehensive service health monitoring
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy import text
|
||||
from typing import Dict, Any
|
||||
import psutil
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
import logging
|
||||
|
||||
from app.core.database import database_manager
|
||||
from app.utils.circuit_breaker import circuit_breaker_registry
|
||||
from app.core.config import settings
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||||
# Router instance that the endpoints defined in this file register on.
router = APIRouter()
|
||||
|
||||
|
||||
async def check_database_health() -> Dict[str, Any]:
    """Check database connectivity and report connection-pool usage.

    Runs ``SELECT 1`` followed by a row count on ``trained_models`` and
    measures how long both queries take.

    Returns:
        On success, a dict with ``status`` set to ``"healthy"`` plus
        ``response_time_seconds``, ``model_count`` and a ``connection_pool``
        mapping (``size``, ``checked_out``, ``available``). Pool figures are
        ``None`` when the engine's pool does not expose them.
        On any failure, ``{"status": "unhealthy", "error": <message>}``.
    """
    try:
        start_time = datetime.now(timezone.utc)

        async with database_manager.async_engine.begin() as conn:
            # Simple connectivity check
            await conn.execute(text("SELECT 1"))

            # Check that the training tables are reachable
            result = await conn.execute(
                text("SELECT COUNT(*) FROM trained_models")
            )
            model_count = result.scalar()

        response_time = (datetime.now(timezone.utc) - start_time).total_seconds()

        # Pool statistics are read after the probe connection has been
        # released, so the probe itself is not counted as checked out.
        # SQLAlchemy's QueuePool names the accessor ``checkedout()``; pool
        # classes without these accessors yield None instead of failing the
        # whole health check.
        pool = database_manager.async_engine.pool
        size_accessor = getattr(pool, "size", None)
        checkedout_accessor = getattr(pool, "checkedout", None)
        pool_size = size_accessor() if callable(size_accessor) else None
        pool_checked_out = (
            checkedout_accessor() if callable(checkedout_accessor) else None
        )
        if pool_size is None or pool_checked_out is None:
            pool_available = None
        else:
            pool_available = pool_size - pool_checked_out

        return {
            "status": "healthy",
            "response_time_seconds": round(response_time, 3),
            "model_count": model_count,
            "connection_pool": {
                "size": pool_size,
                "checked_out": pool_checked_out,
                "available": pool_available
            }
        }

    except Exception as e:
        # Top-level boundary for this probe: any failure means "unhealthy".
        logger.error("Database health check failed: %s", e)
        return {
            "status": "unhealthy",
            "error": str(e)
        }
|
||||
|
||||
|
||||
def check_system_resources() -> Dict[str, Any]:
    """Report CPU, memory and root-disk usage as gathered by psutil.

    Returns:
        A dict with ``status`` ``"healthy"`` and ``cpu``, ``memory`` and
        ``disk`` sections. If any psutil call raises, a dict with ``status``
        ``"error"`` and the exception text under ``error``.
    """

    def _as_mb(num_bytes):
        # Bytes -> mebibytes, two decimals.
        return round(num_bytes / 1024 / 1024, 2)

    def _as_gb(num_bytes):
        # Bytes -> gibibytes, two decimals.
        return round(num_bytes / 1024 / 1024 / 1024, 2)

    try:
        # cpu_percent with an interval samples over that window (0.1 s).
        cpu_usage = psutil.cpu_percent(interval=0.1)
        mem = psutil.virtual_memory()
        root_disk = psutil.disk_usage('/')

        cpu_section = {
            "usage_percent": cpu_usage,
            "count": psutil.cpu_count(),
        }
        memory_section = {
            "total_mb": _as_mb(mem.total),
            "used_mb": _as_mb(mem.used),
            "available_mb": _as_mb(mem.available),
            "usage_percent": mem.percent,
        }
        disk_section = {
            "total_gb": _as_gb(root_disk.total),
            "used_gb": _as_gb(root_disk.used),
            "free_gb": _as_gb(root_disk.free),
            "usage_percent": root_disk.percent,
        }

        return {
            "status": "healthy",
            "cpu": cpu_section,
            "memory": memory_section,
            "disk": disk_section,
        }
    except Exception as exc:
        logger.error(f"System resource check failed: {exc}")
        return {"status": "error", "error": str(exc)}
|
||||
|
||||
|
||||
def check_model_storage() -> Dict[str, Any]:
    """Check that the model storage directory exists, is writable, and
    summarise the ``.pkl`` files it contains.

    Returns:
        * ``{"status": "warning", "message": ...}`` when
          ``settings.MODEL_STORAGE_PATH`` does not exist.
        * Otherwise a dict with ``status`` (``"healthy"`` when the write
          probe succeeds, ``"degraded"`` when it does not), ``path``,
          ``writable``, ``model_files`` and ``total_size_mb``.
        * ``{"status": "error", "error": ...}`` on any unexpected failure.
    """
    # Local import: only this probe needs tempfile.
    import tempfile

    try:
        storage_path = settings.MODEL_STORAGE_PATH

        if not os.path.exists(storage_path):
            return {
                "status": "warning",
                "message": f"Model storage path does not exist: {storage_path}"
            }

        # Write probe. A uniquely named temporary file is used so that
        # concurrent probes against the same directory cannot remove each
        # other's file and report a false "not writable"; the context
        # manager deletes the file on exit.
        try:
            with tempfile.NamedTemporaryFile(
                mode="w", dir=storage_path, prefix=".health_check_"
            ) as probe:
                probe.write("test")
                probe.flush()
            writable = True
        except OSError:
            writable = False

        # Count model files and add up their sizes.
        model_files = 0
        total_size = 0
        for root, _dirs, files in os.walk(storage_path):
            for file_name in files:
                if not file_name.endswith('.pkl'):
                    continue
                try:
                    file_size = os.path.getsize(os.path.join(root, file_name))
                except OSError:
                    # The file disappeared or became unreadable between the
                    # directory listing and the size lookup; skip it rather
                    # than failing the whole check.
                    continue
                model_files += 1
                total_size += file_size

        return {
            "status": "healthy" if writable else "degraded",
            "path": storage_path,
            "writable": writable,
            "model_files": model_files,
            "total_size_mb": round(total_size / 1024 / 1024, 2)
        }

    except Exception as e:
        # Top-level boundary for this probe: report instead of raising.
        logger.error("Model storage check failed: %s", e)
        return {
            "status": "error",
            "error": str(e)
        }
|
||||
|
||||
|
||||
@router.get("/health")
async def health_check() -> Dict[str, Any]:
    """
    Basic health check endpoint.

    Performs no dependency checks; it responds with a fixed "healthy"
    status, the service name and the current UTC timestamp.
    """
    current_time = datetime.now(timezone.utc).isoformat()
    payload: Dict[str, Any] = {
        "status": "healthy",
        "service": "training-service",
        "timestamp": current_time,
    }
    return payload
|
||||
|
||||
|
||||
@router.get("/health/detailed")
async def detailed_health_check() -> Dict[str, Any]:
    """
    Detailed health check with per-component status.

    Gathers the database, system-resource and model-storage checks plus the
    circuit-breaker states, and derives an overall status: "unhealthy" if
    any component reports "unhealthy" or "error", "degraded" if any reports
    "degraded" or "warning", otherwise "healthy".
    """
    components = {
        "database": await check_database_health(),
        "system": check_system_resources(),
        "storage": check_model_storage(),
    }
    breaker_states = circuit_breaker_registry.get_all_states()

    # Collapse the component statuses and test the two severity tiers.
    seen = {report.get("status") for report in components.values()}
    if seen & {"unhealthy", "error"}:
        overall = "unhealthy"
    elif seen & {"degraded", "warning"}:
        overall = "degraded"
    else:
        overall = "healthy"

    configuration = {
        "max_concurrent_jobs": settings.MAX_CONCURRENT_TRAINING_JOBS,
        "min_training_days": settings.MIN_TRAINING_DATA_DAYS,
        "pool_size": settings.DB_POOL_SIZE,
        "pool_max_overflow": settings.DB_MAX_OVERFLOW,
    }

    return {
        "status": overall,
        "service": "training-service",
        "version": "1.0.0",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "components": components,
        "circuit_breakers": breaker_states,
        "configuration": configuration,
    }
|
||||
|
||||
|
||||
@router.get("/health/ready")
async def readiness_check() -> Dict[str, Any]:
    """
    Readiness check for Kubernetes.

    Raises:
        HTTPException: 503 when the database check is not "healthy", or
            when the model-storage check reports "error". The storage check
            only runs once the database check has passed.
    """
    db_status = (await check_database_health()).get("status")
    if db_status != "healthy":
        raise HTTPException(
            status_code=503,
            detail="Service not ready: database unavailable",
        )

    if check_model_storage().get("status") == "error":
        raise HTTPException(
            status_code=503,
            detail="Service not ready: model storage unavailable",
        )

    ready_at = datetime.now(timezone.utc).isoformat()
    return {"status": "ready", "timestamp": ready_at}
|
||||
|
||||
|
||||
@router.get("/health/live")
async def liveness_check() -> Dict[str, Any]:
    """
    Liveness check for Kubernetes.

    Responds with an "alive" status, the current UTC timestamp and the id
    of the process serving the request; no dependencies are checked.
    """
    checked_at = datetime.now(timezone.utc).isoformat()
    current_pid = os.getpid()
    return {
        "status": "alive",
        "timestamp": checked_at,
        "pid": current_pid,
    }
|
||||
|
||||
|
||||
@router.get("/metrics/system")
async def system_metrics() -> Dict[str, Any]:
    """
    Detailed process and system metrics for monitoring.

    Returns:
        A dict with the current UTC ``timestamp``, a ``process`` section
        (pid, CPU percent sampled over 0.1 s, RSS in MB, thread count,
        open-file count, connection count) and a ``system`` section from
        ``check_system_resources()``. ``open_files`` and ``connections``
        are ``None`` when psutil is not permitted to read them.
    """
    process = psutil.Process(os.getpid())

    def _count_or_none(list_items):
        # Listing open files or sockets can raise a psutil error (for
        # example AccessDenied); report None for that metric instead of
        # failing the whole endpoint.
        try:
            return len(list_items())
        except psutil.Error:
            return None

    # Process.connections() is deprecated since psutil 6.0 in favour of
    # net_connections(); prefer the new name and fall back on older psutil.
    list_connections = (
        getattr(process, "net_connections", None) or process.connections
    )

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "process": {
            "pid": os.getpid(),
            "cpu_percent": process.cpu_percent(interval=0.1),
            "memory_mb": round(process.memory_info().rss / 1024 / 1024, 2),
            "threads": process.num_threads(),
            "open_files": _count_or_none(process.open_files),
            "connections": _count_or_none(list_connections)
        },
        "system": check_system_resources()
    }
|
||||
Reference in New Issue
Block a user