Files
bakery-ia/shared/monitoring/health.py
2025-07-18 12:34:28 +02:00

163 lines
5.3 KiB
Python

# ================================================================
# shared/monitoring/health.py
# ================================================================
"""
Health check utilities for microservices
"""
import asyncio
import inspect
import logging
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Callable, Any, Optional
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
    """Closed set of states a check (or the whole service) can report.

    The member values are the lowercase strings that end up in the
    serialized health report.
    """

    # Everything is working.
    HEALTHY = "healthy"

    # Working, but not at full capability.
    DEGRADED = "degraded"

    # Not working.
    UNHEALTHY = "unhealthy"
@dataclass
class HealthCheck:
    """Definition of one registered health check."""

    # Identifier of the check; results are reported under this name.
    name: str

    # Zero-argument callable that performs the probe.
    check_function: Callable[[], Any]

    # Time budget for the probe, in seconds.
    timeout: float = 5.0

    # When True, a failure of this check marks the whole service UNHEALTHY.
    critical: bool = True
@dataclass
class HealthResult:
    """Outcome of running a single health check."""

    # Name of the check that produced this result.
    name: str

    # Status the check ended up with.
    status: HealthStatus

    # Human-readable detail ("OK", a timeout notice, an error text, ...).
    message: str

    # How long the check took, in seconds.
    duration: float

    # Unix timestamp (time.time()) taken when the result was created.
    timestamp: float
class HealthChecker:
    """Health checker for microservices.

    Holds a list of named checks for one service, runs them concurrently and
    folds the individual results into a single status report.

    Overall status rules applied by :meth:`check_health`:

    * any failing *critical* check -> ``UNHEALTHY``
    * otherwise, any failing non-critical check or any check reporting
      ``DEGRADED`` -> ``DEGRADED``
    * otherwise -> ``HEALTHY``
    """

    def __init__(self, service_name: str):
        """Create a checker with no registered checks.

        Args:
            service_name: Name reported under the ``"service"`` key.
        """
        self.service_name = service_name
        self.checks: List[HealthCheck] = []
        # Wall-clock start time; "uptime" in the report is derived from it.
        self.start_time = time.time()

    def add_check(self, name: str, check_function: Callable, timeout: float = 5.0,
                  critical: bool = True) -> None:
        """Register a health check.

        Args:
            name: Key under which the result is reported. Results are keyed
                by name, so a later check with the same name overwrites an
                earlier one in the report.
            check_function: Zero-argument callable, sync or async. It signals
                success by returning ``True`` or a dict whose ``"healthy"``
                entry is truthy; anything else counts as a failure.
            timeout: Time budget in seconds.
            critical: Whether a failure makes the whole service UNHEALTHY
                (otherwise a failure only degrades it).
        """
        self.checks.append(HealthCheck(name, check_function, timeout, critical))

    async def run_check(self, check: HealthCheck) -> HealthResult:
        """Run a single health check.

        A timeout or an exception raised by the check is converted into an
        UNHEALTHY result instead of propagating.

        Args:
            check: The check to execute.

        Returns:
            The result, including how long the check took.
        """
        # perf_counter is monotonic, so the duration is not distorted by
        # wall-clock adjustments; the timestamp below stays wall-clock.
        started = time.perf_counter()
        try:
            # wait_for accepts the coroutine directly; no extra task needed.
            outcome = await asyncio.wait_for(
                self._execute_check(check.check_function),
                timeout=check.timeout,
            )
            # Interpreted inside the try so that a result object whose
            # truthiness/str() raises is still reported, not propagated.
            status, message = self._interpret_result(outcome)
        except asyncio.TimeoutError:
            status = HealthStatus.UNHEALTHY
            message = f"Timeout after {check.timeout}s"
        except Exception as e:
            status = HealthStatus.UNHEALTHY
            message = f"Error: {str(e)}"

        return HealthResult(
            name=check.name,
            status=status,
            message=message,
            duration=time.perf_counter() - started,
            timestamp=time.time(),
        )

    @staticmethod
    def _interpret_result(result: Any) -> tuple:
        """Map a check's raw return value to a ``(status, message)`` pair."""
        if result is True or (isinstance(result, dict) and result.get('healthy', False)):
            return HealthStatus.HEALTHY, "OK"
        return HealthStatus.UNHEALTHY, (str(result) if result else "Check failed")

    async def _execute_check(self, check_function: Callable) -> Any:
        """Execute a check function (handles both sync and async).

        The callable is invoked first and its result awaited if it is
        awaitable. This also covers callables that are not coroutine
        functions themselves but return a coroutine (lambdas, wrappers,
        objects with an async ``__call__``).

        NOTE: a synchronous check runs inline on the event loop, so the
        timeout cannot interrupt it while it blocks; long-running probes
        should be written as coroutines.
        """
        result = check_function()
        if inspect.isawaitable(result):
            result = await result
        return result

    async def check_health(self) -> Dict[str, Any]:
        """Run all health checks and return the aggregated status.

        Returns:
            A dict with the keys ``"service"``, ``"status"`` (a
            ``HealthStatus`` value string), ``"uptime"`` (seconds since the
            checker was created), ``"timestamp"`` and ``"checks"`` (per-check
            ``status`` / ``message`` / ``duration`` / ``timestamp``, keyed by
            check name). With no registered checks the status is healthy and
            ``"checks"`` is empty.
        """
        # Snapshot so results stay paired with their checks even if
        # self.checks is modified while the checks are being awaited.
        checks = list(self.checks)

        if checks:
            # Run all checks concurrently
            results = await asyncio.gather(
                *(self.run_check(check) for check in checks),
                return_exceptions=True,
            )
        else:
            results = []

        check_results = {}
        overall_status = HealthStatus.HEALTHY

        for check, result in zip(checks, results):
            # BaseException rather than Exception: a cancelled check is
            # returned by gather() as CancelledError, which is not an
            # Exception subclass on Python 3.8+.
            if isinstance(result, BaseException):
                check_result = HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Exception: {str(result)}",
                    duration=0.0,
                    timestamp=time.time(),
                )
            else:
                check_result = result

            check_results[check.name] = {
                "status": check_result.status.value,
                "message": check_result.message,
                "duration": check_result.duration,
                "timestamp": check_result.timestamp,
            }

            # Determine overall status; it only ever gets worse.
            if check_result.status == HealthStatus.UNHEALTHY:
                if check.critical:
                    overall_status = HealthStatus.UNHEALTHY
                elif overall_status == HealthStatus.HEALTHY:
                    # A failing non-critical check degrades the service
                    # instead of being invisible in the overall status.
                    overall_status = HealthStatus.DEGRADED
            elif (check_result.status == HealthStatus.DEGRADED
                  and overall_status == HealthStatus.HEALTHY):
                overall_status = HealthStatus.DEGRADED

        return {
            "service": self.service_name,
            "status": overall_status.value,
            "uptime": time.time() - self.start_time,
            "timestamp": time.time(),
            "checks": check_results,
        }