# ================================================================
# shared/monitoring/health.py
# ================================================================
"""
Health check utilities for microservices
"""

import asyncio
import inspect
import logging
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Callable, Any, Optional

logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    """Status values reported for individual checks and for the service."""

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"


@dataclass
class HealthCheck:
    """A registered health check and the policy used to run it."""

    name: str  # key under which the result appears in the report
    check_function: Callable[[], Any]  # zero-argument callable, sync or async
    timeout: float = 5.0  # seconds allowed before the check is reported as timed out
    critical: bool = True  # if True, a failure makes the whole service UNHEALTHY


@dataclass
class HealthResult:
    """Outcome of running one health check."""

    name: str  # name of the check that produced this result
    status: HealthStatus
    message: str  # "OK", or a description of the failure
    duration: float  # seconds spent running the check
    timestamp: float  # time.time() value taken when the result was built


class HealthChecker:
    """Health checker for microservices"""

    def __init__(self, service_name: str):
        """Create a checker with no registered checks.

        Args:
            service_name: Name reported under the "service" key.
        """
        self.service_name = service_name
        self.checks: List[HealthCheck] = []
        self.start_time = time.time()

    def add_check(self, name: str, check_function: Callable,
                  timeout: float = 5.0, critical: bool = True) -> None:
        """Add a health check.

        Args:
            name: Key under which the result is reported.
            check_function: Zero-argument callable (sync or async). A result
                of ``True`` or a dict with a truthy ``"healthy"`` entry counts
                as healthy; anything else counts as a failure.
            timeout: Seconds allowed for the check to finish.
            critical: Whether a failure makes the whole service unhealthy.
        """
        self.checks.append(HealthCheck(name, check_function, timeout, critical))

    async def run_check(self, check: HealthCheck) -> HealthResult:
        """Run a single health check.

        Timeouts and exceptions raised by the check are converted into an
        UNHEALTHY result (and logged) instead of propagating.

        Args:
            check: The check to execute.

        Returns:
            A HealthResult that is either HEALTHY or UNHEALTHY.
        """
        # Monotonic clock so the duration is immune to wall-clock adjustments.
        started = time.monotonic()
        try:
            # wait_for schedules the coroutine itself; no explicit task needed.
            result = await asyncio.wait_for(
                self._execute_check(check.check_function),
                timeout=check.timeout,
            )
        except asyncio.TimeoutError:
            logger.warning(
                "Health check %r timed out after %ss", check.name, check.timeout
            )
            status = HealthStatus.UNHEALTHY
            message = f"Timeout after {check.timeout}s"
        except Exception as e:
            logger.exception("Health check %r raised an exception", check.name)
            status = HealthStatus.UNHEALTHY
            message = f"Error: {str(e)}"
        else:
            if result is True or (
                isinstance(result, dict) and result.get('healthy', False)
            ):
                status = HealthStatus.HEALTHY
                message = "OK"
            else:
                status = HealthStatus.UNHEALTHY
                # Falsy results (False, None, {}, ...) carry no useful text.
                message = str(result) if result else "Check failed"

        return HealthResult(
            name=check.name,
            status=status,
            message=message,
            duration=time.monotonic() - started,
            timestamp=time.time(),
        )

    async def _execute_check(self, check_function: Callable) -> Any:
        """Execute a check function (handles both sync and async).

        The function is called, and if what it returns is awaitable it is
        awaited. This covers ``async def`` functions as well as plain
        callables (lambdas, wrappers) that hand back a coroutine.

        NOTE: a synchronous check runs directly on the event loop, so the
        timeout applied by ``run_check`` cannot interrupt it while it blocks.
        """
        result = check_function()
        if inspect.isawaitable(result):
            result = await result
        return result

    async def check_health(self) -> Dict[str, Any]:
        """Run all health checks and return status.

        Overall status rules:
          * any failing critical check       -> "unhealthy"
          * otherwise any failing or degraded
            non-critical check               -> "degraded"
          * otherwise                        -> "healthy"

        Returns:
            A dict with the keys "service", "status", "uptime", "timestamp"
            and "checks" (per-check status, message, duration, timestamp,
            keyed by check name).
        """
        # Snapshot so a check registered while we await cannot misalign
        # results with their definitions.
        checks = list(self.checks)

        # Run all checks concurrently
        if checks:
            results = await asyncio.gather(
                *(self.run_check(check) for check in checks),
                return_exceptions=True,
            )
        else:
            results = []

        check_results: Dict[str, Any] = {}
        overall_status = HealthStatus.HEALTHY

        for check, result in zip(checks, results):
            # BaseException, not Exception: a cancelled child comes back as
            # asyncio.CancelledError, which does not derive from Exception.
            if isinstance(result, BaseException):
                check_result = HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Exception: {str(result)}",
                    duration=0.0,
                    timestamp=time.time(),
                )
            else:
                check_result = result

            check_results[check.name] = {
                "status": check_result.status.value,
                "message": check_result.message,
                "duration": check_result.duration,
                "timestamp": check_result.timestamp,
            }

            # Determine overall status
            if check_result.status == HealthStatus.UNHEALTHY:
                if check.critical:
                    overall_status = HealthStatus.UNHEALTHY
                elif overall_status == HealthStatus.HEALTHY:
                    # A failing non-critical check degrades the service
                    # rather than being ignored.
                    overall_status = HealthStatus.DEGRADED
            elif (check_result.status == HealthStatus.DEGRADED
                  and overall_status == HealthStatus.HEALTHY):
                overall_status = HealthStatus.DEGRADED

        now = time.time()
        return {
            "service": self.service_name,
            "status": overall_status.value,
            "uptime": now - self.start_time,
            "timestamp": now,
            "checks": check_results,
        }