Files
bakery-ia/shared/monitoring/health.py

177 lines
5.6 KiB
Python
Raw Normal View History

2026-01-21 17:17:16 +01:00
# ================================================================
# shared/monitoring/health.py
# ================================================================
"""
Health check utilities for microservices
"""
import asyncio
import inspect
import logging
import time
from typing import Dict, List, Callable, Any, Optional
from dataclasses import dataclass
from enum import Enum
from fastapi import APIRouter
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Possible health states; each value is the string used in reports."""

    # Check (or service) is working.
    HEALTHY = "healthy"
    # Intermediate state between healthy and unhealthy.
    DEGRADED = "degraded"
    # Check (or service) has failed.
    UNHEALTHY = "unhealthy"
@dataclass
class HealthCheck:
    """Definition of one registered health check."""

    # Identifier of the check; used as its key in the health report.
    name: str
    # Zero-argument callable that performs the check.
    check_function: Callable[[], Any]
    # Maximum time, in seconds, the check is given before it counts as failed.
    timeout: float = 5.0
    # Whether a failure of this check marks the whole service unhealthy.
    critical: bool = True
@dataclass
class HealthResult:
    """Outcome of running a single health check."""

    # Name of the check that produced this result.
    name: str
    # Status the check resolved to.
    status: HealthStatus
    # Human-readable detail ("OK", a timeout notice, an error description, ...).
    message: str
    # Elapsed time of the check run, in seconds.
    duration: float
    # Time the result was created, as returned by time.time().
    timestamp: float
class HealthChecker:
    """Registry and concurrent runner for a service's health checks.

    Checks are registered with :meth:`add_check` and executed together by
    :meth:`check_health`, which folds the individual results into one
    overall status:

    * ``unhealthy`` - at least one *critical* check failed;
    * ``degraded``  - only non-critical checks failed (or a check result
      carries ``HealthStatus.DEGRADED``);
    * ``healthy``   - every check passed, or no checks are registered.
    """

    def __init__(self, service_name: str):
        """Create an empty checker for ``service_name``.

        ``start_time`` stores the wall-clock creation time (``time.time()``)
        and is the reference point for the ``uptime`` field of the report.
        """
        self.service_name = service_name
        self.checks: List[HealthCheck] = []
        self.start_time = time.time()

    def add_check(self, name: str, check_function: Callable, timeout: float = 5.0,
                  critical: bool = True) -> None:
        """Register a health check.

        Args:
            name: Key under which the result appears in the report. Results
                are keyed by name, so two checks sharing a name end up as a
                single entry (the later one wins).
            check_function: Zero-argument callable, sync or async. The check
                passes when it returns ``True`` or a dict whose ``'healthy'``
                entry is truthy; any other return value, an exception or a
                timeout counts as a failure.
            timeout: Seconds the check may take before it is failed.
            critical: If true, a failure makes the overall status
                ``unhealthy``; otherwise a failure only degrades it.
        """
        self.checks.append(HealthCheck(name, check_function, timeout, critical))

    async def run_check(self, check: HealthCheck) -> HealthResult:
        """Run one check and convert its outcome into a ``HealthResult``.

        Never raises for ordinary errors: a timeout or any ``Exception``
        raised by the check is reported as an ``UNHEALTHY`` result.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or go negative) when the system clock is adjusted.
        started = time.perf_counter()
        try:
            # wait_for wraps the coroutine in a task itself; no explicit
            # create_task is needed.
            outcome = await asyncio.wait_for(
                self._execute_check(check.check_function),
                timeout=check.timeout,
            )
            if outcome is True or (isinstance(outcome, dict) and outcome.get('healthy', False)):
                status, message = HealthStatus.HEALTHY, "OK"
            else:
                status = HealthStatus.UNHEALTHY
                message = str(outcome) if outcome else "Check failed"
        except asyncio.TimeoutError:
            status = HealthStatus.UNHEALTHY
            message = f"Timeout after {check.timeout}s"
        except Exception as exc:
            # Keep the traceback in the logs; the report only carries a
            # one-line summary.
            logger.warning("Health check %r raised an exception", check.name, exc_info=True)
            status = HealthStatus.UNHEALTHY
            # Some exceptions stringify to "", which would leave a bare
            # "Error: "; fall back to the exception type name.
            message = f"Error: {str(exc) or type(exc).__name__}"

        return HealthResult(
            name=check.name,
            status=status,
            message=message,
            duration=time.perf_counter() - started,
            timestamp=time.time(),
        )

    async def _execute_check(self, check_function: Callable) -> Any:
        """Call ``check_function`` and await its result when it is awaitable.

        Inspecting the *returned value* rather than the callable also covers
        lambdas, ``functools.partial`` objects and instances with an async
        ``__call__`` that wrap a coroutine function.

        NOTE: a synchronous check runs on the event-loop thread. If it
        blocks, the timeout in ``run_check`` cannot interrupt it and every
        other check is stalled meanwhile.
        """
        result = check_function()
        if inspect.isawaitable(result):
            result = await result
        return result

    async def check_health(self) -> Dict[str, Any]:
        """Run all registered checks concurrently and build the report.

        Returns:
            A dict with the keys ``service``, ``status`` (overall status
            string), ``uptime`` (seconds since this checker was created),
            ``timestamp`` and ``checks`` (per-check ``status``, ``message``,
            ``duration`` and ``timestamp``, keyed by check name). With no
            registered checks the status is ``healthy`` and ``checks`` is
            empty.
        """
        # Snapshot so a check registered while we are awaiting cannot
        # misalign checks and results.
        checks = list(self.checks)
        outcomes = await asyncio.gather(
            *(self.run_check(check) for check in checks),
            return_exceptions=True,
        )

        check_results: Dict[str, Any] = {}
        overall_status = HealthStatus.HEALTHY
        for check, outcome in zip(checks, outcomes):
            # run_check handles Exception itself; what can still arrive here
            # is e.g. CancelledError, which derives from BaseException.
            if isinstance(outcome, BaseException):
                outcome = HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Exception: {str(outcome) or type(outcome).__name__}",
                    duration=0.0,
                    timestamp=time.time(),
                )

            check_results[check.name] = {
                "status": outcome.status.value,
                "message": outcome.message,
                "duration": outcome.duration,
                "timestamp": outcome.timestamp,
            }

            # A critical failure is decisive; any other non-healthy result
            # (non-critical failure or DEGRADED) lowers a so-far healthy
            # service to degraded but never masks an unhealthy verdict.
            if check.critical and outcome.status == HealthStatus.UNHEALTHY:
                overall_status = HealthStatus.UNHEALTHY
            elif (outcome.status != HealthStatus.HEALTHY
                    and overall_status == HealthStatus.HEALTHY):
                overall_status = HealthStatus.DEGRADED

        now = time.time()
        return {
            "service": self.service_name,
            "status": overall_status.value,
            "uptime": now - self.start_time,
            "timestamp": now,
            "checks": check_results,
        }
# Create FastAPI router for health endpoints.
# The endpoint defined below registers itself on this router.
router = APIRouter()
@router.get("/")
async def health_check():
    """Basic health check endpoint.

    Returns a fixed payload: the service name is the literal "service" and
    the status is always "healthy"; only the timestamp varies. No
    HealthChecker is consulted here.
    """
    payload = {
        "service": "service",
        "status": "healthy",
        "timestamp": time.time(),
    }
    return payload