177 lines
5.6 KiB
Python
Executable File
177 lines
5.6 KiB
Python
Executable File
# ================================================================
# shared/monitoring/health.py
# ================================================================
"""
Health check utilities for microservices
"""

import asyncio
import inspect
import logging
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Callable, Any, Optional

from fastapi import APIRouter

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    """Health state of a single check or of the service as a whole.

    The member values are the lowercase strings placed in the health
    report dictionaries.
    """

    # Nothing wrong was detected.
    HEALTHY = "healthy"
    # Impaired but still serving; only lowers an overall HEALTHY status.
    DEGRADED = "degraded"
    # The check failed, timed out, or raised.
    UNHEALTHY = "unhealthy"


@dataclass
class HealthCheck:
    """Registration record for one health probe."""

    # Key under which the probe's result appears in the health report.
    name: str
    # Zero-argument callable, sync or async, that performs the probe.
    check_function: Callable[[], Any]
    # Seconds the probe may run before it is reported as timed out.
    timeout: float = 5.0
    # When True, an unhealthy result makes the whole service unhealthy.
    critical: bool = True


@dataclass
class HealthResult:
    """Outcome of running one health check."""

    # Name of the check that produced this result.
    name: str
    # Status assigned to the check.
    status: HealthStatus
    # "OK" on success, otherwise a description of the failure.
    message: str
    # Seconds spent running the check.
    duration: float
    # time.time() value taken when the result was created.
    timestamp: float


class HealthChecker:
    """Health checker for microservices.

    Holds a list of registered checks for one service, runs them
    concurrently, and folds the individual results into a single report.
    """

    def __init__(self, service_name: str):
        """Create a checker with no registered checks.

        Args:
            service_name: Name reported under the "service" key.
        """
        self.service_name = service_name
        self.checks: List[HealthCheck] = []
        # Wall-clock creation time; "uptime" in reports is measured from it.
        self.start_time = time.time()

    def add_check(self, name: str, check_function: Callable, timeout: float = 5.0,
                  critical: bool = True) -> None:
        """Add a health check.

        Args:
            name: Key for this check in the report. Results are keyed by
                name, so a later check with the same name overwrites the
                earlier one's entry.
            check_function: Zero-argument callable, sync or async.
            timeout: Seconds allowed before the check counts as timed out.
            critical: Whether an unhealthy result makes the service unhealthy.
        """
        self.checks.append(HealthCheck(name, check_function, timeout, critical))

    async def run_check(self, check: HealthCheck) -> HealthResult:
        """Run a single health check and convert its outcome to a result.

        A check is HEALTHY when it returns ``True``, a dict with a truthy
        ``'healthy'`` entry, or ``HealthStatus.HEALTHY``. Returning another
        ``HealthStatus`` member reports that status. Any other return value,
        a timeout, or a raised exception yields UNHEALTHY.

        Args:
            check: The registered check to execute.

        Returns:
            A HealthResult; ordinary exceptions from the check are captured
            in the result message rather than raised.
        """
        # perf_counter is monotonic, so the duration cannot go negative if
        # the wall clock is adjusted while the check runs.
        started = time.perf_counter()

        def outcome(status: HealthStatus, message: str) -> HealthResult:
            return HealthResult(
                name=check.name,
                status=status,
                message=message,
                duration=time.perf_counter() - started,
                timestamp=time.time(),
            )

        try:
            result = await asyncio.wait_for(
                self._execute_check(check.check_function),
                timeout=check.timeout,
            )
            status, message = self._interpret_result(result)
        except asyncio.TimeoutError:
            return outcome(HealthStatus.UNHEALTHY, f"Timeout after {check.timeout}s")
        except Exception as e:
            return outcome(HealthStatus.UNHEALTHY, f"Error: {str(e)}")

        return outcome(status, message)

    @staticmethod
    def _interpret_result(result: Any) -> tuple:
        """Map a check's return value to a ``(status, message)`` pair."""
        if isinstance(result, HealthStatus):
            # Lets a check report DEGRADED (or any status) explicitly.
            if result is HealthStatus.HEALTHY:
                return result, "OK"
            return result, f"Check reported {result.value}"

        if result is True or (isinstance(result, dict) and result.get('healthy', False)):
            return HealthStatus.HEALTHY, "OK"

        return HealthStatus.UNHEALTHY, str(result) if result else "Check failed"

    async def _execute_check(self, check_function: Callable) -> Any:
        """Execute a check function (handles both sync and async).

        The callable is invoked first and its result awaited if it is
        awaitable. This also covers plain callables that return a coroutine
        (lambdas, ``functools.partial`` objects, objects with an async
        ``__call__``).

        Note: a synchronous check runs inline on the event loop, so the
        timeout applied by ``run_check`` cannot interrupt it.
        """
        result = check_function()
        if inspect.isawaitable(result):
            result = await result
        return result

    async def check_health(self) -> Dict[str, Any]:
        """Run all health checks and return status.

        Returns:
            A dict with keys "service", "status", "uptime", "timestamp" and
            "checks". "checks" maps each check name to its status, message,
            duration and timestamp. With no registered checks the status is
            "healthy" and "checks" is empty.
        """
        # Snapshot so results stay paired with the checks that produced them.
        checks = list(self.checks)

        # Run all checks concurrently
        results = await asyncio.gather(
            *(self.run_check(check) for check in checks),
            return_exceptions=True,
        )

        check_results: Dict[str, Dict[str, Any]] = {}
        overall_status = HealthStatus.HEALTHY

        for check, result in zip(checks, results):
            # BaseException, not Exception: a cancelled check comes back from
            # gather() as CancelledError, which is not an Exception subclass.
            if isinstance(result, BaseException):
                check_result = HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Exception: {str(result) or type(result).__name__}",
                    duration=0.0,
                    timestamp=time.time(),
                )
            else:
                check_result = result

            check_results[check.name] = {
                "status": check_result.status.value,
                "message": check_result.message,
                "duration": check_result.duration,
                "timestamp": check_result.timestamp,
            }

            # Determine overall status: a critical failure always wins, and
            # DEGRADED never overrides an UNHEALTHY already recorded.
            if check.critical and check_result.status == HealthStatus.UNHEALTHY:
                overall_status = HealthStatus.UNHEALTHY
            elif check_result.status == HealthStatus.DEGRADED and overall_status == HealthStatus.HEALTHY:
                overall_status = HealthStatus.DEGRADED

        return {
            "service": self.service_name,
            "status": overall_status.value,
            "uptime": time.time() - self.start_time,
            "timestamp": time.time(),
            "checks": check_results,
        }


# Create FastAPI router for health endpoints
router = APIRouter()


@router.get("/")
async def health_check():
    """Basic health check endpoint.

    Returns a fixed payload with the current time. It does not run any
    registered checks, so the status is always "healthy".
    """
    return {
        "service": "service",
        "status": "healthy",
        "timestamp": time.time(),
    }