Fix shared issues
This commit is contained in:
162
shared/monitoring/health.py
Normal file
162
shared/monitoring/health.py
Normal file
@@ -0,0 +1,162 @@
|
||||
# ================================================================
|
||||
# shared/monitoring/health.py
|
||||
# ================================================================
|
||||
"""
|
||||
Health check utilities for microservices
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Callable, Any, Optional
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class HealthStatus(Enum):
    """Status levels used for individual check results and for the overall report.

    The string values are what ends up in the report dictionaries
    (via ``.value``), so they are part of the external contract.
    """

    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
|
||||
|
||||
@dataclass
class HealthCheck:
    """Registration record describing one health check."""

    # Identifier of the check; results are reported under this name.
    name: str
    # Zero-argument callable that performs the check (plain or async).
    check_function: Callable[[], Any]
    # Maximum time, in seconds, the check is allowed to take.
    timeout: float = 5.0
    # When True, an unhealthy result makes the overall status unhealthy.
    critical: bool = True
|
||||
|
||||
@dataclass
class HealthResult:
    """Outcome of running a single health check."""

    # Name of the check that produced this result.
    name: str
    # Status assigned to the check.
    status: HealthStatus
    # Human-readable detail ("OK", a failure description, a timeout note, ...).
    message: str
    # How long the check ran, in seconds.
    duration: float
    # Wall-clock time (``time.time()``) at which the result was created.
    timestamp: float
|
||||
|
||||
class HealthChecker:
    """Collect named health checks for one service and report their combined status.

    Checks are registered with :meth:`add_check` and executed concurrently by
    :meth:`check_health`, which returns a plain ``dict`` report.
    """

    def __init__(self, service_name: str):
        """Create a checker with no registered checks.

        Args:
            service_name: Name echoed back under ``"service"`` in every report.
        """
        self.service_name = service_name
        self.checks: List[HealthCheck] = []
        # Wall-clock creation time; the report's "uptime" is measured from here.
        self.start_time = time.time()

    def add_check(self, name: str, check_function: Callable, timeout: float = 5.0,
                  critical: bool = True) -> None:
        """Register a health check.

        Args:
            name: Key under which the result appears in the report. Results
                are stored by name, so a later check with the same name
                replaces the earlier one's entry in the report.
            check_function: Zero-argument callable, plain or async.
            timeout: Seconds to wait for the check before reporting a timeout.
            critical: Whether an unhealthy result makes the overall status
                unhealthy.
        """
        self.checks.append(HealthCheck(name, check_function, timeout, critical))

    async def run_check(self, check: HealthCheck) -> HealthResult:
        """Run one check and translate its outcome into a ``HealthResult``.

        A check passes only if it returns exactly ``True`` or a ``dict`` whose
        ``'healthy'`` entry is truthy. Any other return value (including other
        truthy values), a timeout, or a raised ``Exception`` yields an
        UNHEALTHY result; this method does not raise for those cases.

        Args:
            check: The registered check to execute.

        Returns:
            The result, including the measured duration in seconds.
        """
        # Monotonic clock: the measured duration must not be affected by
        # wall-clock adjustments while the check is running.
        started = time.monotonic()

        def finish(status: HealthStatus, message: str) -> HealthResult:
            return HealthResult(
                name=check.name,
                status=status,
                message=message,
                duration=time.monotonic() - started,
                timestamp=time.time(),
            )

        try:
            # wait_for accepts the coroutine directly; no explicit task needed.
            outcome = await asyncio.wait_for(
                self._execute_check(check.check_function),
                timeout=check.timeout,
            )
        except asyncio.TimeoutError:
            return finish(HealthStatus.UNHEALTHY, f"Timeout after {check.timeout}s")
        except Exception as exc:
            return finish(HealthStatus.UNHEALTHY, f"Error: {str(exc)}")

        passed = outcome is True or (
            isinstance(outcome, dict) and outcome.get('healthy', False)
        )
        if passed:
            return finish(HealthStatus.HEALTHY, "OK")
        return finish(HealthStatus.UNHEALTHY, str(outcome) if outcome else "Check failed")

    async def _execute_check(self, check_function: Callable) -> Any:
        """Call a check function and await its result if it is awaitable.

        Calling first and inspecting the result handles ``async def``
        functions as well as plain callables (lambdas, callable objects) that
        return a coroutine or future.

        NOTE: a synchronous check runs inline on the event loop, so the
        timeout applied by ``run_check`` cannot interrupt it while it blocks.
        """
        import inspect  # local import keeps this class self-contained

        outcome = check_function()
        if inspect.isawaitable(outcome):
            outcome = await outcome
        return outcome

    def _build_report(self, status: HealthStatus, checks: Dict[str, Any]) -> Dict[str, Any]:
        """Assemble the report dictionary returned by ``check_health``."""
        return {
            "service": self.service_name,
            "status": status.value,
            "uptime": time.time() - self.start_time,
            "timestamp": time.time(),
            "checks": checks,
        }

    async def check_health(self) -> Dict[str, Any]:
        """Run all registered checks concurrently and build the report.

        The overall status is UNHEALTHY if any critical check is unhealthy,
        otherwise DEGRADED if any check result is DEGRADED, otherwise HEALTHY.
        An unhealthy non-critical check is listed under ``"checks"`` but does
        not change the overall status. With no registered checks the service
        is reported HEALTHY.

        Returns:
            A dict with the keys ``"service"``, ``"status"``, ``"uptime"``,
            ``"timestamp"`` and ``"checks"`` (per-check ``status``,
            ``message``, ``duration`` and ``timestamp``, keyed by check name).
        """
        if not self.checks:
            return self._build_report(HealthStatus.HEALTHY, {})

        # Snapshot the list so results stay paired with the checks that
        # produced them even if the list changes while we are awaiting.
        registered = list(self.checks)
        outcomes = await asyncio.gather(
            *(self.run_check(check) for check in registered),
            return_exceptions=True,
        )

        details: Dict[str, Any] = {}
        overall = HealthStatus.HEALTHY

        for check, outcome in zip(registered, outcomes):
            # A cancelled check is returned as CancelledError, which derives
            # from BaseException rather than Exception.
            if isinstance(outcome, BaseException):
                result = HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Exception: {str(outcome) or type(outcome).__name__}",
                    duration=0.0,
                    timestamp=time.time(),
                )
            else:
                result = outcome

            details[check.name] = {
                "status": result.status.value,
                "message": result.message,
                "duration": result.duration,
                "timestamp": result.timestamp,
            }

            if check.critical and result.status == HealthStatus.UNHEALTHY:
                overall = HealthStatus.UNHEALTHY
            elif result.status == HealthStatus.DEGRADED and overall == HealthStatus.HEALTHY:
                overall = HealthStatus.DEGRADED

        return self._build_report(overall, details)
|
||||
|
||||
Reference in New Issue
Block a user