Initial commit - production deployment
shared/monitoring/__init__.py (Executable file, 97 lines added)
@@ -0,0 +1,97 @@
"""
Shared monitoring package for microservices

Provides unified OpenTelemetry-based observability:
- Traces: Distributed tracing
- Metrics: System and application metrics
- Logs: Structured logging

All signals are exported to SigNoz via OTLP.
"""

# Core setup - START HERE
from .logging import setup_logging
from .telemetry import (
    setup_telemetry,
    setup_telemetry_simple,
    get_telemetry_status,
    TelemetryProviders
)

# Configuration
from .otel_config import OTelConfig, OTelEndpoints

# Individual signal setup (used by telemetry.py)
from .tracing import (
    setup_tracing,
    get_current_trace_id,
    get_current_span_id,
    add_trace_attributes,
    add_trace_event,
    record_exception
)
from .logs_exporter import (
    setup_otel_logging,
    add_log_context,
    get_current_trace_context,
    StructlogOTELProcessor
)
from .metrics_exporter import (
    setup_otel_metrics,
    OTelMetricsCollector,
    create_dual_metrics_collector
)
from .system_metrics import (
    SystemMetricsCollector,
    ApplicationMetricsCollector,
    setup_all_metrics
)

# Health checks
from .health_checks import (
    HealthCheckManager,
    FastAPIHealthChecker,
    create_health_manager,
    setup_fastapi_health_checks
)

__all__ = [
    # CORE - Start with these
    'setup_logging',
    'setup_telemetry',
    'setup_telemetry_simple',
    'get_telemetry_status',
    'TelemetryProviders',

    # Configuration
    'OTelConfig',
    'OTelEndpoints',

    # Tracing
    'setup_tracing',
    'get_current_trace_id',
    'get_current_span_id',
    'add_trace_attributes',
    'add_trace_event',
    'record_exception',

    # Logs
    'setup_otel_logging',
    'add_log_context',
    'get_current_trace_context',
    'StructlogOTELProcessor',

    # Metrics
    'setup_otel_metrics',
    'OTelMetricsCollector',
    'create_dual_metrics_collector',
    'SystemMetricsCollector',
    'ApplicationMetricsCollector',
    'setup_all_metrics',

    # Health checks
    'HealthCheckManager',
    'FastAPIHealthChecker',
    'create_health_manager',
    'setup_fastapi_health_checks',
]
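
For reference, a minimal bootstrap sketch using this package. The setup_logging signature comes from shared/monitoring/logging.py below; the setup_telemetry arguments are assumptions, since telemetry.py is not part of this excerpt:

from shared.monitoring import setup_logging, setup_telemetry

# Configure console/file logging first, then wire up OTLP export.
setup_logging("orders-service", log_level="INFO", enable_json=True)
# Assumed signature; telemetry.py is not shown in this commit.
providers = setup_telemetry("orders-service", service_version="1.0.0")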
shared/monitoring/decorators.py (Executable file, 179 lines added)
@@ -0,0 +1,179 @@
# ================================================================
# shared/monitoring/decorators.py
# ================================================================
"""
Decorators for monitoring and metrics
"""

import asyncio
import time
import logging
import functools
from typing import Callable, Any, Optional
from .metrics import get_metrics_collector

logger = logging.getLogger(__name__)


def track_execution_time(metric_name: str, service_name: str,
                         labels: Optional[dict] = None):
    """Decorator to track function execution time"""
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def async_wrapper(*args, **kwargs) -> Any:
            start_time = time.time()
            try:
                result = await func(*args, **kwargs)
                duration = time.time() - start_time

                metrics_collector = get_metrics_collector(service_name)
                if metrics_collector:
                    metrics_collector.observe_histogram(metric_name, duration, labels)

                return result
            except Exception as e:
                duration = time.time() - start_time
                logger.error(f"Function {func.__name__} failed after {duration:.2f}s: {e}")
                raise

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs) -> Any:
            start_time = time.time()
            try:
                result = func(*args, **kwargs)
                duration = time.time() - start_time

                metrics_collector = get_metrics_collector(service_name)
                if metrics_collector:
                    metrics_collector.observe_histogram(metric_name, duration, labels)

                return result
            except Exception as e:
                duration = time.time() - start_time
                logger.error(f"Function {func.__name__} failed after {duration:.2f}s: {e}")
                raise

        # Return the appropriate wrapper based on function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper

    return decorator


def count_calls(metric_name: str, service_name: str,
                labels: Optional[dict] = None):
    """Decorator to count function calls"""
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def async_wrapper(*args, **kwargs) -> Any:
            metrics_collector = get_metrics_collector(service_name)
            if metrics_collector:
                metrics_collector.increment_counter(metric_name, labels=labels)
            return await func(*args, **kwargs)

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs) -> Any:
            metrics_collector = get_metrics_collector(service_name)
            if metrics_collector:
                metrics_collector.increment_counter(metric_name, labels=labels)
            return func(*args, **kwargs)

        # Return the appropriate wrapper based on function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper

    return decorator


def monitor_performance(operation_name: str, labels: Optional[dict] = None):
    """
    General-purpose performance monitoring decorator.
    Tracks execution time and call counts for the given operation.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        async def async_wrapper(*args, **kwargs) -> Any:
            start_time = time.time()
            service_name = "orders-service"  # Could be made dynamic based on context

            try:
                # Count the call (the counter must already be registered on the collector)
                metrics_collector = get_metrics_collector(service_name)
                if metrics_collector:
                    call_labels = {**(labels or {}), "operation": operation_name}
                    metrics_collector.increment_counter(f"{service_name}_operations_total", labels=call_labels)

                # Execute the function
                result = await func(*args, **kwargs)

                # Record success timing
                duration = time.time() - start_time
                if metrics_collector:
                    timing_labels = {**(labels or {}), "operation": operation_name, "status": "success"}
                    metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)

                return result

            except Exception as e:
                # Record failure timing and error count
                duration = time.time() - start_time
                metrics_collector = get_metrics_collector(service_name)
                if metrics_collector:
                    timing_labels = {**(labels or {}), "operation": operation_name, "status": "error"}
                    metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)

                    error_labels = {**(labels or {}), "operation": operation_name, "error_type": type(e).__name__}
                    metrics_collector.increment_counter(f"{service_name}_errors_total", labels=error_labels)

                logger.error(f"Operation {operation_name} failed after {duration:.2f}s: {e}")
                raise

        @functools.wraps(func)
        def sync_wrapper(*args, **kwargs) -> Any:
            start_time = time.time()
            service_name = "orders-service"  # Could be made dynamic based on context

            try:
                # Count the call (the counter must already be registered on the collector)
                metrics_collector = get_metrics_collector(service_name)
                if metrics_collector:
                    call_labels = {**(labels or {}), "operation": operation_name}
                    metrics_collector.increment_counter(f"{service_name}_operations_total", labels=call_labels)

                # Execute the function
                result = func(*args, **kwargs)

                # Record success timing
                duration = time.time() - start_time
                if metrics_collector:
                    timing_labels = {**(labels or {}), "operation": operation_name, "status": "success"}
                    metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)

                return result

            except Exception as e:
                # Record failure timing and error count
                duration = time.time() - start_time
                metrics_collector = get_metrics_collector(service_name)
                if metrics_collector:
                    timing_labels = {**(labels or {}), "operation": operation_name, "status": "error"}
                    metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)

                    error_labels = {**(labels or {}), "operation": operation_name, "error_type": type(e).__name__}
                    metrics_collector.increment_counter(f"{service_name}_errors_total", labels=error_labels)

                logger.error(f"Operation {operation_name} failed after {duration:.2f}s: {e}")
                raise

        # Return the appropriate wrapper based on function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper

    return decorator
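
A hedged usage sketch for these decorators (the service and metric names are illustrative; the histogram and counter must already be registered on the "orders-service" collector for observations to be recorded, per shared/monitoring/metrics.py):

from shared.monitoring.decorators import track_execution_time, count_calls

@track_execution_time("order_processing_seconds", "orders-service",
                      labels={"queue": "default"})
@count_calls("process_order_calls", "orders-service")
async def process_order(order_id: str) -> None:
    ...  # business logic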
shared/monitoring/health.py (Executable file, 176 lines added)
@@ -0,0 +1,176 @@
# ================================================================
# shared/monitoring/health.py
# ================================================================
"""
Health check utilities for microservices
"""

import asyncio
import logging
import time
from typing import Dict, List, Callable, Any, Optional
from dataclasses import dataclass
from enum import Enum
from fastapi import APIRouter

logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"


@dataclass
class HealthCheck:
    name: str
    check_function: Callable[[], Any]
    timeout: float = 5.0
    critical: bool = True


@dataclass
class HealthResult:
    name: str
    status: HealthStatus
    message: str
    duration: float
    timestamp: float


class HealthChecker:
    """Health checker for microservices"""

    def __init__(self, service_name: str):
        self.service_name = service_name
        self.checks: List[HealthCheck] = []
        self.start_time = time.time()

    def add_check(self, name: str, check_function: Callable, timeout: float = 5.0,
                  critical: bool = True) -> None:
        """Add a health check"""
        self.checks.append(HealthCheck(name, check_function, timeout, critical))

    async def run_check(self, check: HealthCheck) -> HealthResult:
        """Run a single health check"""
        start_time = time.time()

        try:
            # Run the check with a timeout
            result = await asyncio.wait_for(
                asyncio.create_task(self._execute_check(check.check_function)),
                timeout=check.timeout
            )

            duration = time.time() - start_time

            if result is True or (isinstance(result, dict) and result.get('healthy', False)):
                return HealthResult(
                    name=check.name,
                    status=HealthStatus.HEALTHY,
                    message="OK",
                    duration=duration,
                    timestamp=time.time()
                )
            else:
                message = str(result) if result else "Check failed"
                return HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=message,
                    duration=duration,
                    timestamp=time.time()
                )

        except asyncio.TimeoutError:
            duration = time.time() - start_time
            return HealthResult(
                name=check.name,
                status=HealthStatus.UNHEALTHY,
                message=f"Timeout after {check.timeout}s",
                duration=duration,
                timestamp=time.time()
            )
        except Exception as e:
            duration = time.time() - start_time
            return HealthResult(
                name=check.name,
                status=HealthStatus.UNHEALTHY,
                message=f"Error: {str(e)}",
                duration=duration,
                timestamp=time.time()
            )

    async def _execute_check(self, check_function: Callable) -> Any:
        """Execute a check function (handles both sync and async callables)"""
        if asyncio.iscoroutinefunction(check_function):
            return await check_function()
        else:
            return check_function()

    async def check_health(self) -> Dict[str, Any]:
        """Run all health checks and return the aggregated status"""
        if not self.checks:
            return {
                "service": self.service_name,
                "status": HealthStatus.HEALTHY.value,
                "uptime": time.time() - self.start_time,
                "timestamp": time.time(),
                "checks": {}
            }

        # Run all checks concurrently
        results = await asyncio.gather(
            *[self.run_check(check) for check in self.checks],
            return_exceptions=True
        )

        # Process results
        check_results = {}
        overall_status = HealthStatus.HEALTHY

        for i, result in enumerate(results):
            check = self.checks[i]

            if isinstance(result, Exception):
                check_result = HealthResult(
                    name=check.name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Exception: {str(result)}",
                    duration=0.0,
                    timestamp=time.time()
                )
            else:
                check_result = result

            check_results[check.name] = {
                "status": check_result.status.value,
                "message": check_result.message,
                "duration": check_result.duration,
                "timestamp": check_result.timestamp
            }

            # Determine overall status: any failed critical check makes the service
            # unhealthy; a degraded check downgrades a healthy service to degraded
            if check.critical and check_result.status == HealthStatus.UNHEALTHY:
                overall_status = HealthStatus.UNHEALTHY
            elif check_result.status == HealthStatus.DEGRADED and overall_status == HealthStatus.HEALTHY:
                overall_status = HealthStatus.DEGRADED

        return {
            "service": self.service_name,
            "status": overall_status.value,
            "uptime": time.time() - self.start_time,
            "timestamp": time.time(),
            "checks": check_results
        }


# Create a FastAPI router for health endpoints
router = APIRouter()


@router.get("/")
async def health_check():
    """Basic health check endpoint"""
    return {
        "service": "service",  # placeholder; services mount this router under their own name
        "status": "healthy",
        "timestamp": time.time()
    }
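
For reference, a small sketch of how HealthChecker is driven (the check names and bodies are illustrative):

import asyncio
from shared.monitoring.health import HealthChecker

async def main() -> None:
    checker = HealthChecker("orders-service")

    # Sync and async callables both work; return True or {"healthy": True} on success.
    checker.add_check("config", lambda: True, critical=False)

    async def db_ping() -> bool:
        await asyncio.sleep(0)  # stand-in for a real driver round-trip
        return True

    checker.add_check("database", db_ping, timeout=2.0)

    report = await checker.check_health()
    print(report["status"], report["checks"])

asyncio.run(main())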
shared/monitoring/health_checks.py (Executable file, 439 lines added)
@@ -0,0 +1,439 @@
"""
Enhanced Health Check System for Microservices

Provides unified health check endpoints and database verification based on
the comprehensive implementation from the training service.
"""

from typing import Dict, Any, List, Optional, Callable
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import text, inspect
from fastapi import HTTPException
from fastapi.responses import JSONResponse
import structlog
import time
import datetime

from ..database.base import DatabaseManager
from ..database.exceptions import DatabaseError, HealthCheckError

logger = structlog.get_logger()


class HealthCheckManager:
    """
    Unified health check manager for microservices

    Provides standardized health check endpoints:
    - /health - Basic service health
    - /health/ready - Kubernetes readiness probe with comprehensive checks
    - /health/live - Kubernetes liveness probe
    - /health/database - Detailed database health information
    """

    def __init__(
        self,
        service_name: str,
        version: str = "1.0.0",
        database_manager: Optional[DatabaseManager] = None,
        expected_tables: Optional[List[str]] = None,
        custom_checks: Optional[Dict[str, Callable]] = None
    ):
        self.service_name = service_name
        self.version = version
        self.database_manager = database_manager
        self.expected_tables = expected_tables or []
        self.custom_checks = custom_checks or {}
        self.ready_state = False

    def set_ready(self, ready: bool = True):
        """Set the service ready state"""
        self.ready_state = ready
        logger.info("Service ready state changed",
                    service=self.service_name, ready=ready)

    async def basic_health_check(self, app_state=None) -> Dict[str, Any]:
        """Basic health check endpoint (/health)"""
        # Check app state for ready status if available
        ready = self.ready_state
        if app_state and hasattr(app_state, 'ready'):
            ready = app_state.ready

        return {
            "status": "healthy" if ready else "starting",
            "service": self.service_name,
            "version": self.version,
            "timestamp": datetime.datetime.utcnow().isoformat()
        }

    async def readiness_check(self, app_state=None) -> Dict[str, Any]:
        """
        Kubernetes readiness probe endpoint (/health/ready)

        Returns 200 if ready, 503 if not ready
        """
        try:
            # Check app state for ready status if available
            ready = self.ready_state
            if app_state and hasattr(app_state, 'ready'):
                ready = app_state.ready

            checks = {
                "application": ready
            }

            database_details = {}

            # Database connectivity and table verification
            if self.database_manager:
                db_health = await self._get_comprehensive_db_health()
                checks["database_connectivity"] = db_health["connectivity"]
                checks["database_tables"] = db_health["tables_exist"]

                database_details = {
                    "status": db_health["status"],
                    "tables_verified": db_health["tables_verified"],
                    "missing_tables": db_health["missing_tables"],
                    "errors": db_health["errors"]
                }

            # Execute custom checks
            for check_name, check_func in self.custom_checks.items():
                try:
                    checks[check_name] = await check_func()
                except Exception as e:
                    checks[check_name] = False
                    logger.error(f"Custom check '{check_name}' failed", error=str(e))

            # The service is ready only if all checks pass
            all_ready = all(checks.values())
            if self.database_manager:
                all_ready = all_ready and database_details.get("status") == "healthy"

            response_data = {
                "status": "ready" if all_ready else "not ready",
                "checks": checks
            }

            if database_details:
                response_data["database"] = database_details

            if all_ready:
                return response_data
            else:
                raise HTTPException(status_code=503, detail=response_data)

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Readiness check failed", error=str(e))
            raise HTTPException(
                status_code=503,
                detail={
                    "status": "not ready",
                    "error": f"Health check failed: {str(e)}"
                }
            )

    async def liveness_check(self) -> Dict[str, Any]:
        """Kubernetes liveness probe endpoint (/health/live)"""
        return {"status": "alive"}

    async def database_health_check(self) -> Dict[str, Any]:
        """
        Detailed database health endpoint (/health/database)

        Returns 200 if healthy, 503 if unhealthy
        """
        if not self.database_manager:
            raise HTTPException(
                status_code=404,
                detail={"error": "Database health check not available"}
            )

        try:
            db_health = await self._get_comprehensive_db_health()
            status_code = 200 if db_health["status"] == "healthy" else 503

            if status_code == 503:
                raise HTTPException(status_code=503, detail=db_health)
            return db_health

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Database health check failed", error=str(e))
            raise HTTPException(
                status_code=503,
                detail={
                    "status": "unhealthy",
                    "error": f"Health check failed: {str(e)}"
                }
            )

    async def _get_comprehensive_db_health(self) -> Dict[str, Any]:
        """
        Comprehensive database health check with table verification.
        Based on the training service implementation.
        """
        health_status = {
            "status": "healthy",
            "connectivity": False,
            "tables_exist": False,
            "tables_verified": [],
            "missing_tables": [],
            "errors": [],
            "connection_info": {},
            "response_time_ms": 0
        }

        if not self.database_manager:
            health_status["status"] = "unhealthy"
            health_status["errors"].append("Database manager not configured")
            return health_status

        try:
            # Test basic connectivity with timing
            start_time = time.time()
            health_status["connectivity"] = await self.database_manager.test_connection()
            response_time = (time.time() - start_time) * 1000
            health_status["response_time_ms"] = round(response_time, 2)

            if not health_status["connectivity"]:
                health_status["status"] = "unhealthy"
                health_status["errors"].append("Database connectivity failed")
                return health_status

            # Get connection pool information
            health_status["connection_info"] = await self.database_manager.get_connection_info()

            # Check migration status
            migration_status = await self._check_migration_status()
            health_status.update(migration_status)

            # Test table existence if expected tables are configured
            if self.expected_tables:
                tables_verified = await self._verify_tables_exist()
                health_status["tables_exist"] = tables_verified

                if tables_verified:
                    health_status["tables_verified"] = self.expected_tables.copy()
                else:
                    health_status["status"] = "unhealthy"
                    health_status["errors"].append("Required tables missing or inaccessible")

                    # Identify which specific tables are missing
                    await self._identify_missing_tables(health_status)
            else:
                # If no expected tables are configured, just mark as verified
                health_status["tables_exist"] = True

            logger.debug("Comprehensive database health check completed",
                         service=self.service_name,
                         status=health_status["status"],
                         connectivity=health_status["connectivity"],
                         tables_exist=health_status["tables_exist"])

        except Exception as e:
            health_status["status"] = "unhealthy"
            health_status["errors"].append(f"Health check failed: {str(e)}")
            logger.error("Comprehensive database health check failed",
                         service=self.service_name, error=str(e))

        return health_status

    async def _verify_tables_exist(self) -> bool:
        """Verify that all expected tables exist and are accessible"""
        try:
            async with self.database_manager.get_session() as session:
                for table_name in self.expected_tables:
                    try:
                        # Table names come from service configuration, not user input
                        await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
                    except Exception:
                        return False
                return True
        except Exception as e:
            logger.error("Table verification failed", error=str(e))
            return False

    async def _identify_missing_tables(self, health_status: Dict[str, Any]):
        """Identify which specific tables are missing"""
        try:
            async with self.database_manager.get_session() as session:
                for table_name in self.expected_tables:
                    try:
                        await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
                        health_status["tables_verified"].append(table_name)
                    except Exception:
                        health_status["missing_tables"].append(table_name)
        except Exception as e:
            health_status["errors"].append(f"Error checking individual tables: {str(e)}")

    async def _check_migration_status(self) -> Dict[str, Any]:
        """Check database migration status"""
        migration_info = {
            "migration_version": None,
            "migration_status": "unknown",
            "migration_errors": []
        }

        try:
            async with self.database_manager.get_session() as session:
                # Check whether the alembic_version table exists and has a version
                result = await session.execute(
                    text("SELECT version_num FROM alembic_version LIMIT 1")
                )
                version = result.scalar()

                if version:
                    migration_info["migration_version"] = version
                    migration_info["migration_status"] = "healthy"
                    logger.debug(f"Migration version found: {version}", service=self.service_name)
                else:
                    migration_info["migration_status"] = "no_version"
                    migration_info["migration_errors"].append("No migration version found in alembic_version table")

        except Exception as e:
            migration_info["migration_status"] = "error"
            migration_info["migration_errors"].append(f"Migration check failed: {str(e)}")
            logger.error("Migration status check failed", service=self.service_name, error=str(e))

        return migration_info


class FastAPIHealthChecker:
    """
    FastAPI integration for health checks

    Provides router setup and endpoint registration
    """

    def __init__(self, health_manager: HealthCheckManager):
        self.health_manager = health_manager

    def setup_health_routes(self, app):
        """Set up health check routes on a FastAPI app"""

        @app.get("/health")
        async def health_check():
            """Basic health check endpoint"""
            return await self.health_manager.basic_health_check(app.state)

        @app.get("/health/ready")
        async def readiness_check():
            """Kubernetes readiness probe endpoint"""
            try:
                return await self.health_manager.readiness_check(app.state)
            except HTTPException as e:
                return JSONResponse(
                    status_code=e.status_code,
                    content=e.detail
                )

        @app.get("/health/live")
        async def liveness_check():
            """Kubernetes liveness probe endpoint"""
            return await self.health_manager.liveness_check()

        @app.get("/health/database")
        async def database_health_check():
            """Detailed database health endpoint"""
            try:
                return await self.health_manager.database_health_check()
            except HTTPException as e:
                return JSONResponse(
                    status_code=e.status_code,
                    content=e.detail
                )


# Convenience functions for easy integration

async def check_database_health(db_manager: DatabaseManager) -> Dict[str, Any]:
    """
    Enhanced database health check with migration status

    Args:
        db_manager: DatabaseManager instance

    Returns:
        Dict containing database health status including migration version
    """
    try:
        async with db_manager.get_session() as session:
            # Basic connectivity test
            await session.execute(text("SELECT 1"))

            # Get migration status
            migration_status = await session.execute(text("SELECT version_num FROM alembic_version"))
            version = migration_status.scalar()

            return {
                "database": "healthy",
                "migration_version": version,
                "connectivity": True
            }
    except Exception as e:
        logger.error("Database health check failed", error=str(e))
        return {
            "database": "unhealthy",
            "error": str(e),
            "connectivity": False,
            "migration_version": None
        }


def create_health_manager(
    service_name: str,
    version: str = "1.0.0",
    database_manager: Optional[DatabaseManager] = None,
    expected_tables: Optional[List[str]] = None,
    custom_checks: Optional[Dict[str, Callable]] = None
) -> HealthCheckManager:
    """Factory function to create a HealthCheckManager"""
    return HealthCheckManager(
        service_name=service_name,
        version=version,
        database_manager=database_manager,
        expected_tables=expected_tables,
        custom_checks=custom_checks
    )


def setup_fastapi_health_checks(
    app,
    service_name: str,
    version: str = "1.0.0",
    database_manager: Optional[DatabaseManager] = None,
    expected_tables: Optional[List[str]] = None,
    custom_checks: Optional[Dict[str, Callable]] = None
) -> HealthCheckManager:
    """
    Convenience function to set up health checks on a FastAPI app

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        version: Service version
        database_manager: Database manager instance
        expected_tables: List of tables that should exist
        custom_checks: Dict of custom check functions

    Returns:
        HealthCheckManager instance for further configuration
    """
    health_manager = create_health_manager(
        service_name=service_name,
        version=version,
        database_manager=database_manager,
        expected_tables=expected_tables,
        custom_checks=custom_checks
    )

    fastapi_checker = FastAPIHealthChecker(health_manager)
    fastapi_checker.setup_health_routes(app)

    return health_manager
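
A hedged wiring sketch for a FastAPI service; the DatabaseManager comes from shared/database and its construction is service-specific, so it is omitted here:

from fastapi import FastAPI
from shared.monitoring.health_checks import setup_fastapi_health_checks

app = FastAPI(title="Orders Service")

async def queue_reachable() -> bool:
    return True  # stand-in for a real broker ping

health_manager = setup_fastapi_health_checks(
    app,
    service_name="orders-service",
    version="1.2.0",
    database_manager=None,          # pass a DatabaseManager to enable DB checks
    expected_tables=["orders"],     # only verified when a database_manager is set
    custom_checks={"queue": queue_reachable},
)

# Flip the readiness flag once startup work has finished:
health_manager.set_ready(True)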
shared/monitoring/logging.py (Executable file, 197 lines added)
@@ -0,0 +1,197 @@
# ================================================================
# shared/monitoring/logging.py
# ================================================================
"""
Centralized logging configuration for microservices
"""

import logging
import logging.config
import os
import sys
from typing import Dict, Any

try:
    import resource  # Unix-only; not available on all platforms (e.g. Windows)
except ImportError:
    resource = None


def setup_logging(service_name: str, log_level: str = "INFO",
                  enable_json: bool = False, enable_file: bool = True) -> None:
    """
    Set up logging configuration for a microservice with improved error handling.

    Args:
        service_name: Name of the service for log identification
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        enable_json: Whether to use JSON formatting
        enable_file: Whether to enable file logging
    """

    # Check file descriptor limits (skipped where the resource module is unavailable)
    if resource is not None:
        try:
            soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
            if soft_limit < 1024:
                print(f"Warning: Low file descriptor limit ({soft_limit}). Consider increasing with 'ulimit -n'")
            if soft_limit < 256:
                print("Critical: File descriptor limit is very low. File logging may fail.")
                enable_file = False
        except Exception:
            pass

    # Create the logs directory if it doesn't exist and file logging is enabled
    log_dir = "/var/log"
    if enable_file:
        try:
            # First try to create/write to /var/log
            test_file = os.path.join(log_dir, f".{service_name}_test")
            with open(test_file, 'w') as f:
                f.write("test")
            os.remove(test_file)
        except (PermissionError, OSError):
            # Fall back to a local directory if /var/log is not writable
            log_dir = "./logs"
            print(f"Warning: Could not write to /var/log, using {log_dir}")

        try:
            os.makedirs(log_dir, exist_ok=True)
        except Exception as e:
            print(f"Warning: Could not create log directory {log_dir}: {e}")
            enable_file = False  # Disable file logging if we can't create the directory

    # Define formatters
    formatters = {
        "standard": {
            "format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S"
        },
        "detailed": {
            "format": "%(asctime)s [%(levelname)s] %(name)s [%(filename)s:%(lineno)d] %(funcName)s(): %(message)s",
            "datefmt": "%Y-%m-%d %H:%M:%S"
        }
    }

    # Add a JSON formatter if requested and available
    if enable_json:
        try:
            import pythonjsonlogger.jsonlogger
            formatters["json"] = {
                "()": "pythonjsonlogger.jsonlogger.JsonFormatter",
                "format": "%(asctime)s %(name)s %(levelname)s %(message)s %(filename)s %(lineno)d"
            }
        except ImportError:
            print("Warning: pythonjsonlogger not available, falling back to standard formatting")
            enable_json = False

    # Define handlers
    handlers = {
        "console": {
            "class": "logging.StreamHandler",
            "level": log_level,
            "formatter": "json" if enable_json else "standard",
            "stream": "ext://sys.stdout"
        }
    }

    # Add a file handler if enabled
    if enable_file:
        try:
            # Test whether we can actually write to the log file location
            test_filename = f"{log_dir}/{service_name}.log"
            test_dir = os.path.dirname(test_filename)
            if not os.access(test_dir, os.W_OK):
                print(f"Warning: Cannot write to log directory {test_dir}, disabling file logging")
                enable_file = False
            else:
                handlers["file"] = {
                    "class": "logging.FileHandler",
                    "level": log_level,
                    "formatter": "detailed",
                    "filename": test_filename,
                    "mode": "a",
                    "encoding": "utf-8"
                }
        except Exception as e:
            print(f"Warning: Could not configure file handler: {e}")
            enable_file = False

    # Add a logstash handler in production
    logstash_host = os.getenv("LOGSTASH_HOST")
    if logstash_host and os.getenv("ENVIRONMENT") == "production":
        try:
            handlers["logstash"] = {
                "class": "logstash.TCPLogstashHandler",
                "host": logstash_host,
                "port": int(os.getenv("LOGSTASH_PORT", "5000")),
                "version": 1,
                "message_type": "logstash",
                "fqdn": False,
                "tags": [service_name]
            }
        except Exception as e:
            print(f"Warning: Could not setup logstash handler: {e}")

    # Define the root logger configuration
    root_handlers = ["console"]
    if enable_file:
        root_handlers.append("file")
    if "logstash" in handlers:
        root_handlers.append("logstash")

    # Complete logging configuration
    config: Dict[str, Any] = {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": formatters,
        "handlers": handlers,
        "loggers": {
            "": {  # Root logger
                "handlers": root_handlers,
                "level": log_level,
                "propagate": False
            },
            "uvicorn": {
                "handlers": ["console"],
                "level": log_level,
                "propagate": False
            },
            "uvicorn.access": {
                "handlers": ["console"],
                "level": log_level,
                "propagate": False
            },
            "sqlalchemy": {
                "handlers": ["console"],
                "level": "WARNING",  # Reduce SQL logging noise
                "propagate": False
            },
            "httpx": {
                "handlers": ["console"],
                "level": "WARNING",  # Reduce HTTP client logging
                "propagate": False
            }
        }
    }

    try:
        logging.config.dictConfig(config)
        logger = logging.getLogger(__name__)
        logger.info(f"Logging configured for {service_name} at level {log_level}")
        if enable_file:
            logger.info(f"File logging enabled at {log_dir}/{service_name}.log")
        else:
            logger.info("File logging disabled")
    except Exception as e:
        # Fall back to basic logging if configuration fails
        logging.basicConfig(
            level=getattr(logging, log_level.upper()),
            format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
            handlers=[logging.StreamHandler(sys.stdout)]
        )
        logger = logging.getLogger(__name__)
        logger.error(f"Failed to configure advanced logging for {service_name}: {e}")
        logger.info(f"Using basic logging configuration for {service_name}")

        # Additional debugging for file handler issues
        if "file" in str(e).lower() or "handler" in str(e).lower():
            logger.error(f"File handler configuration failed. Check permissions for {log_dir}")
            logger.error(f"Current working directory: {os.getcwd()}")
            logger.error(f"Attempting to write to: {log_dir}/{service_name}.log")
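
Example invocation (service name and level are illustrative):

import logging
from shared.monitoring.logging import setup_logging

# JSON console output plus a file handler; degrades gracefully when
# /var/log is not writable or python-json-logger is not installed.
setup_logging("orders-service", log_level="DEBUG", enable_json=True, enable_file=True)
logging.getLogger("orders.api").info("service started")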
shared/monitoring/logs_exporter.py (Normal file, 221 lines added)
@@ -0,0 +1,221 @@
"""
OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via the OpenTelemetry Collector using the HTTP protocol
"""

import os
import logging
import structlog
from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource

# Try to import the HTTP log exporter (logs always use HTTP)
try:
    from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
    HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
    try:
        from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
        HTTP_LOG_EXPORTER_AVAILABLE = True
    except ImportError:
        OTLPLogExporter = None
        HTTP_LOG_EXPORTER_AVAILABLE = False

from .otel_config import OTelConfig

logger = structlog.get_logger()


def setup_otel_logging(
    service_name: str,
    service_version: str = "1.0.0",
    otel_endpoint: Optional[str] = None,
    enable_console: bool = True
) -> Optional[LoggingHandler]:
    """
    Set up OpenTelemetry logging to export logs to SigNoz.

    Uses the HTTP protocol (port 4318) for sending logs to SigNoz.
    Integrates with Python's standard logging to automatically export
    all log records to SigNoz via the OTLP HTTP protocol.

    Args:
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        otel_endpoint: Optional override for the OTLP endpoint (HTTP format with path)
        enable_console: Whether to also log to console (default: True)

    Returns:
        LoggingHandler instance if successful, None otherwise

    Example:
        from shared.monitoring.logs_exporter import setup_otel_logging

        # Setup during service initialization
        handler = setup_otel_logging("auth-service", "1.0.0")

        # Now all standard logging calls will be exported to SigNoz
        import logging
        logger = logging.getLogger(__name__)
        logger.info("This will appear in SigNoz!")
    """

    # Check whether logs export is enabled
    if not OTelConfig.is_enabled("logs"):
        logger.info(
            "OpenTelemetry logs export disabled",
            service=service_name,
            reason="OTEL_LOGS_EXPORTER not set to 'otlp'"
        )
        return None

    # Check whether the HTTP log exporter is available
    if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
        logger.warning(
            "OpenTelemetry HTTP log exporter not available",
            service=service_name,
            reason="opentelemetry-exporter-otlp-proto-http package not installed"
        )
        return None

    try:
        # Get endpoints from the centralized config
        endpoints = OTelConfig.get_endpoints()

        # Use the provided endpoint or fall back to config
        if otel_endpoint:
            http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
        else:
            http_endpoint = endpoints.logs_http

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure the logger provider
        logger_provider = LoggerProvider(resource=resource)
        set_logger_provider(logger_provider)

        # Configure the OTLP HTTP exporter for logs
        otlp_exporter = OTLPLogExporter(
            endpoint=http_endpoint,
            timeout=10
        )

        # Add a log record processor with batching
        log_processor = BatchLogRecordProcessor(otlp_exporter)
        logger_provider.add_log_record_processor(log_processor)

        # Create a logging handler that bridges standard logging to OpenTelemetry
        otel_handler = LoggingHandler(
            level=logging.NOTSET,  # Capture all levels
            logger_provider=logger_provider
        )

        # Add the handler to the root logger
        root_logger = logging.getLogger()
        root_logger.addHandler(otel_handler)

        logger.info(
            "OpenTelemetry logs export configured successfully",
            service=service_name,
            http_endpoint=http_endpoint,
            protocol="http",
            console_logging=enable_console
        )

        return otel_handler

    except Exception as e:
        logger.error(
            "Failed to setup OpenTelemetry logs export",
            service=service_name,
            error=str(e)
        )
        return None


def add_log_context(**context):
    """
    Add contextual information to logs that will be sent to SigNoz.

    This is useful for adding request IDs, user IDs, tenant IDs, etc.
    that help with filtering and correlation in SigNoz.

    Args:
        **context: Key-value pairs to add to the log context

    Example:
        from shared.monitoring.logs_exporter import add_log_context

        # Add context for the current request
        add_log_context(
            request_id="req_123",
            user_id="user_456",
            tenant_id="tenant_789"
        )

        # Now all logs will include this context
        logger.info("Processing order")  # Will include request_id, user_id, tenant_id
    """
    # This works with structlog's context binding
    bound_logger = structlog.get_logger()
    return bound_logger.bind(**context)


def get_current_trace_context() -> dict:
    """
    Get the current trace context for log correlation.

    Returns a dict with trace_id and span_id if available,
    which can be added to log records for correlation with traces.

    Returns:
        Dict with trace_id and span_id, or an empty dict if there is no active trace

    Example:
        from shared.monitoring.logs_exporter import get_current_trace_context

        # Get trace context and add it to logs
        trace_ctx = get_current_trace_context()
        logger.info("Processing request", **trace_ctx)
    """
    from opentelemetry import trace

    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        return {
            "trace_id": format(span.get_span_context().trace_id, '032x'),
            "span_id": format(span.get_span_context().span_id, '016x'),
        }
    return {}


class StructlogOTELProcessor:
    """
    Structlog processor that adds OpenTelemetry trace context to logs.

    This automatically adds trace_id and span_id to all log records,
    enabling correlation between logs and traces in SigNoz.

    Usage:
        import structlog
        from shared.monitoring.logs_exporter import StructlogOTELProcessor

        structlog.configure(
            processors=[
                StructlogOTELProcessor(),
                # ... other processors
            ]
        )
    """

    def __call__(self, logger, method_name, event_dict):
        """Add trace context to the log event"""
        trace_ctx = get_current_trace_context()
        if trace_ctx:
            event_dict.update(trace_ctx)
        return event_dict
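
A sketch combining the exporter with the structlog processor (the event fields are illustrative; endpoint resolution is delegated to OTelConfig):

import structlog
from shared.monitoring.logs_exporter import setup_otel_logging, StructlogOTELProcessor

handler = setup_otel_logging("auth-service", "1.0.0")  # None if export is disabled

structlog.configure(
    processors=[
        StructlogOTELProcessor(),            # stamps trace_id/span_id onto events
        structlog.processors.add_log_level,
        structlog.processors.JSONRenderer(),
    ]
)
structlog.get_logger().info("login succeeded", user_id="user_456")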
400
shared/monitoring/metrics.py
Executable file
400
shared/monitoring/metrics.py
Executable file
@@ -0,0 +1,400 @@
|
||||
"""
|
||||
OpenTelemetry Metrics Collection for Microservices
|
||||
Replaces Prometheus with native OpenTelemetry metrics export to SigNoz
|
||||
"""
|
||||
|
||||
import time
|
||||
import logging
|
||||
import structlog
|
||||
from typing import Dict, Any, Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
from fastapi import Request, Response
|
||||
from threading import Lock
|
||||
import os
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
# Global registry for metrics collectors
|
||||
_metrics_registry: Dict[str, 'MetricsCollector'] = {}
|
||||
_registry_lock = Lock()
|
||||
|
||||
|
||||
class MetricsCollector:
|
||||
"""
|
||||
OpenTelemetry-based metrics collector for microservices.
|
||||
Exports metrics directly to SigNoz via OTLP (no Prometheus).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
):
|
||||
self.service_name = service_name
|
||||
self.service_version = service_version
|
||||
self.start_time = time.time()
|
||||
|
||||
# Use provided meter provider or get global
|
||||
if meter_provider:
|
||||
self.meter = meter_provider.get_meter(__name__)
|
||||
else:
|
||||
self.meter = metrics.get_meter(__name__)
|
||||
|
||||
# Store created instruments
|
||||
self._counters: Dict[str, Any] = {}
|
||||
self._histograms: Dict[str, Any] = {}
|
||||
self._up_down_counters: Dict[str, Any] = {}
|
||||
self._lock = Lock()
|
||||
|
||||
# Register in global registry
|
||||
with _registry_lock:
|
||||
_metrics_registry[service_name] = self
|
||||
|
||||
# Create default HTTP metrics
|
||||
self._setup_default_metrics()
|
||||
|
||||
logger.info(
|
||||
"OpenTelemetry metrics collector initialized",
|
||||
service=service_name
|
||||
)
|
||||
|
||||
def _setup_default_metrics(self):
|
||||
"""Setup default HTTP metrics"""
|
||||
self._counters["http_requests_total"] = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_http_requests_total",
|
||||
description="Total HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
self._histograms["http_request_duration"] = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_http_request_duration_seconds",
|
||||
description="HTTP request duration in seconds",
|
||||
unit="s"
|
||||
)
|
||||
|
||||
self._up_down_counters["active_requests"] = self.meter.create_up_down_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_active_requests",
|
||||
description="Number of active HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
def register_counter(self, name: str, documentation: str, labels: list = None) -> Any:
|
||||
"""Register a custom Counter metric"""
|
||||
with self._lock:
|
||||
if name in self._counters:
|
||||
logger.warning(f"Counter '{name}' already registered for {self.service_name}")
|
||||
return self._counters[name]
|
||||
|
||||
try:
|
||||
counter = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._counters[name] = counter
|
||||
logger.info(f"Registered counter: {name} for {self.service_name}")
|
||||
return counter
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register counter {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def register_histogram(
|
||||
self,
|
||||
name: str,
|
||||
documentation: str,
|
||||
labels: list = None,
|
||||
buckets: tuple = None
|
||||
) -> Any:
|
||||
"""Register a custom Histogram metric"""
|
||||
with self._lock:
|
||||
if name in self._histograms:
|
||||
logger.warning(f"Histogram '{name}' already registered for {self.service_name}")
|
||||
return self._histograms[name]
|
||||
|
||||
try:
|
||||
histogram = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._histograms[name] = histogram
|
||||
logger.info(f"Registered histogram: {name} for {self.service_name}")
|
||||
return histogram
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def register_gauge(self, name: str, documentation: str, labels: list = None) -> Any:
|
||||
"""Register a custom Gauge metric (using UpDownCounter)"""
|
||||
with self._lock:
|
||||
if name in self._up_down_counters:
|
||||
logger.warning(f"Gauge '{name}' already registered for {self.service_name}")
|
||||
return self._up_down_counters[name]
|
||||
|
||||
try:
|
||||
gauge = self.meter.create_up_down_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._up_down_counters[name] = gauge
|
||||
logger.info(f"Registered gauge: {name} for {self.service_name}")
|
||||
return gauge
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register gauge {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def increment_counter(self, name: str, value: int = 1, labels: Dict[str, str] = None):
|
||||
"""Increment a counter metric"""
|
||||
if name not in self._counters:
|
||||
logger.error(f"Counter '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._counters[name].add(value, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to increment counter {name} for {self.service_name}: {e}")
|
||||
|
||||
def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
|
||||
"""Observe a histogram metric"""
|
||||
if name not in self._histograms:
|
||||
logger.error(f"Histogram '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._histograms[name].record(value, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to observe histogram {name} for {self.service_name}: {e}")
|
||||
|
||||
def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
|
||||
"""Set a gauge metric (using add for UpDownCounter)"""
|
||||
if name not in self._up_down_counters:
|
||||
logger.error(f"Gauge '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
# For UpDownCounter, we need to track the delta
|
||||
# Store current value and calculate delta
|
||||
key = f"{name}_{str(sorted(labels.items()))}"
|
||||
if not hasattr(self, '_gauge_values'):
|
||||
self._gauge_values = {}
|
||||
|
||||
old_value = self._gauge_values.get(key, 0)
|
||||
delta = value - old_value
|
||||
self._gauge_values[key] = value
|
||||
|
||||
self._up_down_counters[name].add(delta, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set gauge {name} for {self.service_name}: {e}")
|
||||
|
||||
def record_request(self, method: str, endpoint: str, status_code: int, duration: float):
|
||||
"""Record HTTP request metrics"""
|
||||
try:
|
||||
attributes = {
|
||||
"service": self.service_name,
|
||||
"http.method": method,
|
||||
"http.route": endpoint,
|
||||
"http.status_code": str(status_code)
|
||||
}
|
||||
|
||||
self._counters["http_requests_total"].add(1, attributes)
|
||||
self._histograms["http_request_duration"].record(duration, attributes)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to record request metrics for {self.service_name}: {e}")
|
||||
|
||||
def increment_active_requests(self):
|
||||
"""Increment active request counter"""
|
||||
try:
|
||||
self._up_down_counters["active_requests"].add(1, {"service": self.service_name})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to increment active requests: {e}")
|
||||
|
||||
def decrement_active_requests(self):
|
||||
"""Decrement active request counter"""
|
||||
try:
|
||||
self._up_down_counters["active_requests"].add(-1, {"service": self.service_name})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to decrement active requests: {e}")
|
||||
|
||||
def set_active_connections(self, count: int):
|
||||
"""Set active database connections"""
|
||||
self.set_gauge("active_connections", count)
|
||||
|
||||
|
||||
def get_metrics_collector(service_name: str) -> Optional[MetricsCollector]:
|
||||
"""Get metrics collector by service name from global registry"""
|
||||
with _registry_lock:
|
||||
return _metrics_registry.get(service_name)
|
||||
|
||||
|
||||
def create_metrics_collector(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
) -> MetricsCollector:
|
||||
"""
|
||||
Create metrics collector.
|
||||
This should be called BEFORE app startup, not during lifespan.
|
||||
"""
|
||||
# Get existing or create new
|
||||
existing = get_metrics_collector(service_name)
|
||||
if existing:
|
||||
return existing
|
||||
|
||||
return MetricsCollector(service_name, service_version, meter_provider)
|
||||
|
||||
|
||||
def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
"""
|
||||
Add metrics middleware to app. Must be called BEFORE app startup.
|
||||
"""
|
||||
@app.middleware("http")
|
||||
async def metrics_middleware(request: Request, call_next):
|
||||
# Increment active requests
|
||||
metrics_collector.increment_active_requests()
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
response = await call_next(request)
|
||||
duration = time.time() - start_time
|
||||
|
||||
# Record request metrics
|
||||
metrics_collector.record_request(
|
||||
method=request.method,
|
||||
endpoint=request.url.path,
|
||||
status_code=response.status_code,
|
||||
duration=duration
|
||||
)
|
||||
|
||||
# Decrement active requests
|
||||
metrics_collector.decrement_active_requests()
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
duration = time.time() - start_time
|
||||
|
||||
# Record failed request
|
||||
metrics_collector.record_request(
|
||||
method=request.method,
|
||||
endpoint=request.url.path,
|
||||
status_code=500,
|
||||
duration=duration
|
||||
)
|
||||
|
||||
# Decrement active requests
|
||||
metrics_collector.decrement_active_requests()
|
||||
raise
|
||||
|
||||
return metrics_collector
|
||||
|
||||
|
||||


def track_user_activity(user_id: str, action: str, service_name: str = "unknown-service",
                        metadata: Optional[dict] = None):
    """Track user activity metrics using the appropriate metrics collector"""
    if metadata is None:
        metadata = {}

    # Add user-specific attributes
    attributes = {
        "user.id": user_id,
        "action": action,
        **metadata
    }

    # Get the metrics collector for the specified service
    metrics_collector = get_metrics_collector(service_name)

    if metrics_collector:
        # Use the collector's counter registration system
        counter_name = "user_activity_total"

        # Register the counter on first use
        if counter_name not in metrics_collector._counters:
            metrics_collector.register_counter(
                name=counter_name,
                documentation="Total user activity events"
            )

        # Increment the counter with attributes
        metrics_collector.increment_counter(counter_name, value=1, labels=attributes)
    else:
        # Fallback: create a temporary counter if no collector exists
        from opentelemetry import metrics

        meter = metrics.get_meter(__name__)
        user_activity_counter = meter.create_counter(
            name="user_activity_total",
            description="User activity events",
            unit="events"
        )
        user_activity_counter.add(1, attributes)
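
# Usage sketch for track_user_activity (values are illustrative; assumes a
# collector was registered for "auth-service" via create_metrics_collector):
#
#     track_user_activity(
#         user_id="user-123",
#         action="login",
#         service_name="auth-service",
#         metadata={"tenant_id": "tenant-1"}
#     )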


def setup_metrics_early(
    app,
    service_name: Optional[str] = None,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
    """
    Set up metrics collection BEFORE app startup.

    This must be called before adding any middleware or starting the app.

    Note: No Prometheus endpoint is created - all metrics go to SigNoz via OTLP.
    """
    if service_name is None:
        service_name = getattr(app, 'title', 'unknown-service').lower().replace(' ', '-').replace('.', '_')

    # Create the metrics collector
    metrics_collector = create_metrics_collector(service_name, service_version, meter_provider)

    # Add middleware (must happen before the app starts)
    add_metrics_middleware(app, metrics_collector)

    # Store in app state for access from routes
    app.state.metrics_collector = metrics_collector

    logger.info(f"OpenTelemetry metrics setup completed for service: {service_name}")
    return metrics_collector
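
# Minimal usage sketch for the setup flow above (service name and route are
# illustrative):
#
#     from fastapi import FastAPI
#     from shared.monitoring.metrics import setup_metrics_early
#
#     app = FastAPI(title="Auth Service")
#     # Must run before the app starts serving traffic
#     collector = setup_metrics_early(app, service_name="auth-service")
#
#     @app.get("/api/ping")
#     async def ping():
#         # The HTTP middleware records this request automatically; custom
#         # metrics remain reachable via app.state.metrics_collector
#         return {"status": "ok"}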


# Helper decorator for endpoint tracking (kept for backward compatibility)
def track_endpoint_metrics(endpoint_name: Optional[str] = None, service_name: Optional[str] = None):
    """Decorator for tracking endpoint metrics - the actual metrics are handled by the middleware"""
    def decorator(func):
        import asyncio
        from functools import wraps

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            return await func(*args, **kwargs)

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            return func(*args, **kwargs)

        # Return the appropriate wrapper based on the function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper
    return decorator
304
shared/monitoring/metrics_exporter.py
Normal file
@@ -0,0 +1,304 @@
"""
OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via the OpenTelemetry Collector using the gRPC protocol
"""

import structlog
from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

# Import both gRPC and HTTP exporters
try:
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
    GRPC_AVAILABLE = True
except ImportError:
    GRPC_AVAILABLE = False
    GrpcMetricExporter = None

try:
    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
    HTTP_AVAILABLE = True
except ImportError:
    HTTP_AVAILABLE = False
    HttpMetricExporter = None

from .otel_config import OTelConfig

logger = structlog.get_logger()


def setup_otel_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    otel_endpoint: Optional[str] = None,
    export_interval_millis: int = 60000,  # Export every 60 seconds
    protocol: Optional[str] = None  # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
    """
    Set up OpenTelemetry metrics export to SigNoz.

    Supports both the gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
    The default protocol is gRPC for better performance.

    Args:
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        otel_endpoint: Optional override for the OTLP endpoint
        export_interval_millis: How often to push metrics, in milliseconds (default 60s)
        protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"

    Returns:
        MeterProvider instance if successful, None otherwise

    Example:
        from shared.monitoring.metrics_exporter import setup_otel_metrics

        # Setup with gRPC (default)
        meter_provider = setup_otel_metrics("auth-service", "1.0.0")

        # Or with HTTP
        meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")

        # Create meters for your metrics
        meter = meter_provider.get_meter(__name__)
        request_counter = meter.create_counter(
            "http.server.requests",
            description="Total HTTP requests",
            unit="1"
        )

        # Record metrics
        request_counter.add(1, {"method": "GET", "status": "200"})
    """

    # Check if metrics export is enabled
    if not OTelConfig.is_enabled("metrics"):
        logger.info(
            "OpenTelemetry metrics export disabled",
            service=service_name,
            reason="ENABLE_OTEL_METRICS not set to 'true'"
        )
        return None

    # Determine which protocol to use
    if protocol is None:
        protocol = OTelConfig.get_protocol("metrics")

    # Validate that the requested protocol is available
    if protocol == "grpc" and not GRPC_AVAILABLE:
        logger.warning(
            "gRPC exporter not available, falling back to HTTP",
            service=service_name
        )
        protocol = "http"
    elif protocol == "http" and not HTTP_AVAILABLE:
        logger.warning(
            "HTTP exporter not available, falling back to gRPC",
            service=service_name
        )
        protocol = "grpc"

    if protocol not in ["grpc", "http"]:
        logger.error(
            "Invalid protocol specified",
            service=service_name,
            protocol=protocol
        )
        return None

    try:
        # Get endpoints from the centralized config
        endpoints = OTelConfig.get_endpoints()

        # Determine which endpoint to use
        if otel_endpoint:
            # User-provided override
            if protocol == "grpc":
                endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
            else:
                endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
        else:
            # Use the config-determined endpoint
            if protocol == "grpc":
                endpoint = endpoints.metrics_grpc
            else:
                endpoint = endpoints.metrics_http

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure the OTLP exporter based on the protocol
        if protocol == "grpc":
            otlp_exporter = GrpcMetricExporter(
                endpoint=endpoint,
                insecure=True,  # Set insecure=False in production with proper TLS
                timeout=10
            )
        else:  # http
            otlp_exporter = HttpMetricExporter(
                endpoint=endpoint,
                timeout=10
            )

        # Create a periodic metric reader
        metric_reader = PeriodicExportingMetricReader(
            exporter=otlp_exporter,
            export_interval_millis=export_interval_millis
        )

        # Configure the meter provider
        meter_provider = MeterProvider(
            resource=resource,
            metric_readers=[metric_reader]
        )

        # Set the global meter provider
        metrics.set_meter_provider(meter_provider)

        logger.info(
            "OpenTelemetry metrics export configured successfully",
            service=service_name,
            endpoint=endpoint,
            protocol=protocol,
            export_interval_seconds=export_interval_millis / 1000
        )

        return meter_provider

    except Exception as e:
        logger.error(
            "Failed to setup OpenTelemetry metrics export",
            service=service_name,
            error=str(e),
            protocol=protocol
        )
        return None


class OTelMetricsCollector:
    """
    Wrapper around OpenTelemetry metrics that provides a similar interface
    to the Prometheus MetricsCollector.

    This allows services to emit metrics that go to both Prometheus and SigNoz.
    """

    def __init__(self, service_name: str, meter_provider: MeterProvider):
        self.service_name = service_name
        self.meter_provider = meter_provider
        self.meter = meter_provider.get_meter(__name__)

        # Store created instruments
        self._counters = {}
        self._histograms = {}
        self._gauges = {}

    def create_counter(self, name: str, description: str = "", unit: str = "1"):
        """Create or get an OpenTelemetry Counter"""
        if name not in self._counters:
            self._counters[name] = self.meter.create_counter(
                name=f"{self.service_name.replace('-', '_')}_{name}",
                description=description,
                unit=unit
            )
        return self._counters[name]

    def create_histogram(self, name: str, description: str = "", unit: str = "1"):
        """Create or get an OpenTelemetry Histogram"""
        if name not in self._histograms:
            self._histograms[name] = self.meter.create_histogram(
                name=f"{self.service_name.replace('-', '_')}_{name}",
                description=description,
                unit=unit
            )
        return self._histograms[name]

    def create_gauge(self, name: str, description: str = "", unit: str = "1"):
        """
        Create or get an OpenTelemetry observable gauge.
        Note: Gauges in OTel require a callback function.
        """
        if name not in self._gauges:
            # Store the gauge reference for callback registration
            self._gauges[name] = {
                "name": f"{self.service_name.replace('-', '_')}_{name}",
                "description": description,
                "unit": unit,
                "value": 0,
                "attributes": {}
            }
        return self._gauges[name]

    def increment_counter(self, name: str, value: int = 1, attributes: Optional[dict] = None):
        """Increment a counter with optional attributes"""
        if name in self._counters:
            if attributes is None:
                attributes = {"service": self.service_name}
            elif "service" not in attributes:
                attributes["service"] = self.service_name

            self._counters[name].add(value, attributes)

    def observe_histogram(self, name: str, value: float, attributes: Optional[dict] = None):
        """Record a histogram observation with optional attributes"""
        if name in self._histograms:
            if attributes is None:
                attributes = {"service": self.service_name}
            elif "service" not in attributes:
                attributes["service"] = self.service_name

            self._histograms[name].record(value, attributes)

    def set_gauge(self, name: str, value: float, attributes: Optional[dict] = None):
        """Set a gauge value (stored until the next callback)"""
        if name in self._gauges:
            if attributes is None:
                attributes = {"service": self.service_name}
            elif "service" not in attributes:
                attributes["service"] = self.service_name

            self._gauges[name]["value"] = value
            self._gauges[name]["attributes"] = attributes
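
# Usage sketch for the wrapper above (assumes setup_otel_metrics() returned a
# provider; metric names and attributes are illustrative):
#
#     provider = setup_otel_metrics("auth-service")
#     collector = OTelMetricsCollector("auth-service", provider)
#
#     collector.create_counter("logins_total", "Total logins", unit="events")
#     collector.increment_counter("logins_total", attributes={"method": "jwt"})
#
#     collector.create_histogram("login_duration_ms", "Login duration", unit="ms")
#     collector.observe_histogram("login_duration_ms", 12.5)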


def create_dual_metrics_collector(service_name: str, service_version: str = "1.0.0"):
    """
    Create a metrics collector that exports to both Prometheus and SigNoz.

    This function sets up both collection strategies:
    1. Prometheus client library (for /metrics endpoint scraping)
    2. OpenTelemetry metrics (for OTLP push to SigNoz)

    Returns a tuple: (prometheus_collector, otel_collector)
    Both collectors can be used independently or together.

    Example:
        from shared.monitoring.metrics_exporter import create_dual_metrics_collector

        prom_collector, otel_collector = create_dual_metrics_collector("auth-service")

        # Prometheus counter
        prom_collector.register_counter("requests_total", "Total requests")
        prom_collector.increment_counter("requests_total", labels={"status": "200"})

        # OpenTelemetry counter (pushed to SigNoz)
        counter = otel_collector.create_counter("requests_total", "Total requests")
        counter.add(1, {"status": "200"})
    """
    from shared.monitoring.metrics import MetricsCollector

    # Create the Prometheus collector
    prom_collector = MetricsCollector(service_name)

    # Create the OpenTelemetry collector
    meter_provider = setup_otel_metrics(service_name, service_version)
    otel_collector = None
    if meter_provider:
        otel_collector = OTelMetricsCollector(service_name, meter_provider)

    return prom_collector, otel_collector
293
shared/monitoring/otel_config.py
Normal file
@@ -0,0 +1,293 @@
"""
Centralized OpenTelemetry Configuration
Manages OTel endpoints and settings for traces, metrics, and logs
"""

import os
from dataclasses import dataclass
import structlog

logger = structlog.get_logger()


@dataclass
class OTelEndpoints:
    """
    Container for OpenTelemetry endpoints.

    SigNoz uses different protocols for different signals:
    - Traces: gRPC (port 4317)
    - Metrics: gRPC (port 4317) or HTTP (port 4318)
    - Logs: HTTP (port 4318)
    """
    traces_grpc: str    # gRPC endpoint for traces (e.g., "host:4317")
    metrics_grpc: str   # gRPC endpoint for metrics (e.g., "host:4317")
    metrics_http: str   # HTTP endpoint for metrics (e.g., "http://host:4318/v1/metrics")
    logs_http: str      # HTTP endpoint for logs (e.g., "http://host:4318/v1/logs")


class OTelConfig:
    """
    Centralized configuration for OpenTelemetry exporters.

    This class manages endpoint URLs and ensures proper protocol usage:
    - gRPC endpoints: host:port (no protocol prefix)
    - HTTP endpoints: http://host:port/path (with protocol and path)
    """

    # Default base endpoint (can be overridden by environment variables)
    DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
    DEFAULT_GRPC_PORT = 4317
    DEFAULT_HTTP_PORT = 4318

    @classmethod
    def get_endpoints(cls) -> OTelEndpoints:
        """
        Get OpenTelemetry endpoints from environment variables with proper fallbacks.

        Environment variables (in order of precedence):
        1. OTEL_EXPORTER_OTLP_ENDPOINT - Base endpoint (gRPC format: host:port)
        2. OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - Specific traces endpoint
        3. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - Specific metrics endpoint
        4. OTEL_EXPORTER_OTLP_LOGS_ENDPOINT - Specific logs endpoint
        5. OTEL_COLLECTOR_ENDPOINT - Legacy variable (HTTP format)

        Returns:
            OTelEndpoints with all configured endpoints
        """
        # Get the base endpoint from the environment
        base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")

        if base_endpoint:
            # Clean and parse the base endpoint
            base_grpc = cls._clean_grpc_endpoint(base_endpoint)
            base_http_host = cls._extract_host(base_endpoint)

            # Validate that the endpoint doesn't contain secret references or malformed data
            if cls._contains_secret_reference(base_grpc):
                logger.error("OTEL endpoint contains secret reference, falling back to default",
                             malformed_endpoint=base_endpoint)
                base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
                base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"
        else:
            # Use the default collector
            base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
            base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"

        # Get signal-specific endpoints (or fall back to the base endpoint)
        traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
        metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
        logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")

        # Validate and clean the signal-specific endpoints
        traces_grpc = cls._clean_and_validate_grpc_endpoint(traces_endpoint)
        metrics_grpc = cls._clean_and_validate_grpc_endpoint(metrics_endpoint)

        # For metrics over HTTP, convert the gRPC endpoint to HTTP
        metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")

        # For logs, use an HTTP endpoint
        if logs_endpoint:
            logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
        else:
            logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")

        endpoints = OTelEndpoints(
            traces_grpc=traces_grpc,
            metrics_grpc=metrics_grpc,
            metrics_http=metrics_http,
            logs_http=logs_http
        )

        logger.info(
            "OpenTelemetry endpoints configured",
            traces_grpc=endpoints.traces_grpc,
            metrics_grpc=endpoints.metrics_grpc,
            metrics_http=endpoints.metrics_http,
            logs_http=endpoints.logs_http
        )

        return endpoints

    @staticmethod
    def _clean_grpc_endpoint(endpoint: str) -> str:
        """
        Clean an endpoint for gRPC usage (remove protocol and paths).

        Args:
            endpoint: Raw endpoint string

        Returns:
            Cleaned endpoint in the format "host:port"
        """
        # Remove protocol prefixes
        endpoint = endpoint.replace("http://", "").replace("https://", "")

        # Remove paths (gRPC doesn't use paths)
        if "/" in endpoint:
            endpoint = endpoint.split("/")[0]

        # Ensure it has a port
        if ":" not in endpoint:
            endpoint = f"{endpoint}:4317"

        return endpoint

    @staticmethod
    def _extract_host(endpoint: str) -> str:
        """
        Extract the host and convert it to an HTTP endpoint.

        Args:
            endpoint: Raw endpoint string

        Returns:
            HTTP endpoint without path (e.g., "http://host:4318")
        """
        # Remove the protocol if present
        clean = endpoint.replace("http://", "").replace("https://", "")

        # Remove the path if present
        if "/" in clean:
            clean = clean.split("/")[0]

        # Extract the host without the port
        if ":" in clean:
            host = clean.split(":")[0]
        else:
            host = clean

        return f"http://{host}:4318"

    @staticmethod
    def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
        """
        Convert a gRPC endpoint to an HTTP endpoint with a path.

        Args:
            grpc_endpoint: gRPC endpoint (e.g., "host:4317")
            path: HTTP path (e.g., "/v1/metrics")

        Returns:
            HTTP endpoint (e.g., "http://host:4318/v1/metrics")
        """
        # Extract the host from the gRPC endpoint
        if ":" in grpc_endpoint:
            host = grpc_endpoint.split(":")[0]
        else:
            host = grpc_endpoint

        # Build the HTTP endpoint with port 4318
        return f"http://{host}:4318{path}"

    @staticmethod
    def _ensure_http_endpoint(endpoint: str, path: str) -> str:
        """
        Ensure the endpoint is in HTTP format with the proper path.

        Args:
            endpoint: Raw endpoint string
            path: Required path (e.g., "/v1/logs")

        Returns:
            HTTP endpoint with protocol, the HTTP OTLP port (4318), and path
        """
        # Add the protocol if missing
        if not endpoint.startswith(("http://", "https://")):
            endpoint = f"http://{endpoint}"

        # Work on the host[:port] part so the "http://" colon is not
        # mistaken for a port separator
        scheme, rest = endpoint.split("://", 1)
        host_port = rest.split("/", 1)[0]  # Drop any existing path
        host = host_port.split(":", 1)[0]

        # Force the HTTP OTLP port (4318), replacing any other port,
        # and append the required path
        endpoint = f"{scheme}://{host}:4318{path}"

        return endpoint
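
    # Behavior of the cleaning helpers above on sample inputs (values are
    # illustrative; results follow from the logic as written):
    #
    #     OTelConfig._clean_grpc_endpoint("http://collector:4317/v1/traces")
    #     # -> "collector:4317"
    #     OTelConfig._grpc_to_http_endpoint("collector:4317", "/v1/metrics")
    #     # -> "http://collector:4318/v1/metrics"
    #     OTelConfig._ensure_http_endpoint("collector:4317", "/v1/logs")
    #     # -> "http://collector:4318/v1/logs"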

    @classmethod
    def get_resource_attributes(
        cls,
        service_name: str,
        service_version: str = "1.0.0"
    ) -> dict:
        """
        Get common resource attributes shared by all OTel signals.

        Args:
            service_name: Name of the service
            service_version: Version of the service

        Returns:
            Dictionary of resource attributes
        """
        return {
            "service.name": service_name,
            "service.version": service_version,
            "deployment.environment": os.getenv("ENVIRONMENT", "development"),
            "k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
            "k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
            "k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
        }

    @classmethod
    def is_enabled(cls, signal: str) -> bool:
        """
        Check whether a specific telemetry signal is enabled.

        Args:
            signal: One of "traces", "metrics", "logs"

        Returns:
            True if the signal is enabled, False otherwise
        """
        signal = signal.lower()

        if signal == "traces":
            return os.getenv("ENABLE_TRACING", "true").lower() == "true"
        elif signal == "metrics":
            return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
        elif signal == "logs":
            return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
        else:
            return False
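
    # The environment toggles read above, summarized: traces and metrics are
    # on unless explicitly disabled, logs are opt-in.
    #
    #     ENABLE_TRACING=true        -> is_enabled("traces") is True (default)
    #     ENABLE_OTEL_METRICS=true   -> is_enabled("metrics") is True (default)
    #     OTEL_LOGS_EXPORTER=otlp    -> is_enabled("logs") is True (off otherwise)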

    @classmethod
    def get_protocol(cls, signal: str) -> str:
        """
        Get the preferred protocol for a signal.

        Args:
            signal: One of "traces", "metrics", "logs"

        Returns:
            Protocol name ("grpc" or "http")
        """
        protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")

        # Signal-specific overrides
        if signal == "traces":
            return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
        elif signal == "metrics":
            return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
        elif signal == "logs":
            # Logs always use HTTP in our setup
            return "http"

        return protocol
433
shared/monitoring/system_metrics.py
Normal file
@@ -0,0 +1,433 @@
"""
System Metrics Collection for SigNoz
Collects CPU, memory, disk, and process metrics via OpenTelemetry
"""

import os
import psutil
import structlog
from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider

logger = structlog.get_logger()


class SystemMetricsCollector:
    """
    Collects system-level metrics (CPU, memory, disk, network, process info)
    and exports them to SigNoz via OpenTelemetry.

    These metrics help monitor service health and resource utilization.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        self.service_name = service_name
        self.service_version = service_version
        self.process = psutil.Process()

        # Use the provided meter provider or fall back to the global one
        if meter_provider:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # Initialize metric instruments
        self._setup_metrics()

        logger.info(
            "System metrics collector initialized",
            service=service_name,
            pid=os.getpid()
        )

    def _setup_metrics(self):
        """Set up all system metric instruments"""

        # Process CPU metrics
        self.process_cpu_percent = self.meter.create_observable_gauge(
            name="process.cpu.utilization",
            description="Process CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_cpu]
        )

        # Process memory metrics
        self.process_memory_usage = self.meter.create_observable_gauge(
            name="process.memory.usage",
            description="Process memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_process_memory]
        )

        self.process_memory_percent = self.meter.create_observable_gauge(
            name="process.memory.utilization",
            description="Process memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_memory_percent]
        )

        # Process thread count
        self.process_threads = self.meter.create_observable_gauge(
            name="process.threads.count",
            description="Number of threads in the process",
            unit="threads",
            callbacks=[self._observe_process_threads]
        )

        # Process file descriptors (Unix only)
        if hasattr(self.process, 'num_fds'):
            self.process_fds = self.meter.create_observable_gauge(
                name="process.open_file_descriptors",
                description="Number of open file descriptors",
                unit="fds",
                callbacks=[self._observe_process_fds]
            )

        # System-wide CPU metrics
        self.system_cpu_percent = self.meter.create_observable_gauge(
            name="system.cpu.utilization",
            description="System-wide CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_cpu]
        )

        # System-wide memory metrics
        self.system_memory_usage = self.meter.create_observable_gauge(
            name="system.memory.usage",
            description="System memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_system_memory]
        )

        self.system_memory_percent = self.meter.create_observable_gauge(
            name="system.memory.utilization",
            description="System memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_memory_percent]
        )

        # Disk I/O metrics
        self.disk_io_read = self.meter.create_observable_counter(
            name="system.disk.io.read",
            description="Disk bytes read",
            unit="bytes",
            callbacks=[self._observe_disk_io_read]
        )

        self.disk_io_write = self.meter.create_observable_counter(
            name="system.disk.io.write",
            description="Disk bytes written",
            unit="bytes",
            callbacks=[self._observe_disk_io_write]
        )

        # Network I/O metrics
        self.network_io_sent = self.meter.create_observable_counter(
            name="system.network.io.sent",
            description="Network bytes sent",
            unit="bytes",
            callbacks=[self._observe_network_io_sent]
        )

        self.network_io_recv = self.meter.create_observable_counter(
            name="system.network.io.received",
            description="Network bytes received",
            unit="bytes",
            callbacks=[self._observe_network_io_recv]
        )

    # Callback methods for observable instruments

    def _observe_process_cpu(self, options):
        """Observe process CPU usage"""
        try:
            cpu_percent = self.process.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process CPU metrics: {e}")

    def _observe_process_memory(self, options):
        """Observe process memory usage"""
        try:
            mem_info = self.process.memory_info()
            yield metrics.Observation(
                mem_info.rss,  # Resident Set Size
                {"service": self.service_name, "type": "rss"}
            )
            yield metrics.Observation(
                mem_info.vms,  # Virtual Memory Size
                {"service": self.service_name, "type": "vms"}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory metrics: {e}")

    def _observe_process_memory_percent(self, options):
        """Observe process memory percentage"""
        try:
            mem_percent = self.process.memory_percent()
            yield metrics.Observation(
                mem_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory percent: {e}")

    def _observe_process_threads(self, options):
        """Observe process thread count"""
        try:
            num_threads = self.process.num_threads()
            yield metrics.Observation(
                num_threads,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process thread count: {e}")

    def _observe_process_fds(self, options):
        """Observe process file descriptors (Unix only)"""
        try:
            num_fds = self.process.num_fds()
            yield metrics.Observation(
                num_fds,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process FDs: {e}")

    def _observe_system_cpu(self, options):
        """Observe system-wide CPU usage"""
        try:
            cpu_percent = psutil.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system CPU metrics: {e}")

    def _observe_system_memory(self, options):
        """Observe system memory usage"""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.used,
                {"service": self.service_name, "type": "used"}
            )
            yield metrics.Observation(
                mem.available,
                {"service": self.service_name, "type": "available"}
            )
            yield metrics.Observation(
                mem.total,
                {"service": self.service_name, "type": "total"}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory metrics: {e}")

    def _observe_system_memory_percent(self, options):
        """Observe system memory percentage"""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory percent: {e}")

    def _observe_disk_io_read(self, options):
        """Observe disk I/O read bytes"""
        try:
            disk_io = psutil.disk_io_counters()
            if disk_io:
                yield metrics.Observation(
                    disk_io.read_bytes,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O read metrics: {e}")

    def _observe_disk_io_write(self, options):
        """Observe disk I/O write bytes"""
        try:
            disk_io = psutil.disk_io_counters()
            if disk_io:
                yield metrics.Observation(
                    disk_io.write_bytes,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O write metrics: {e}")

    def _observe_network_io_sent(self, options):
        """Observe network bytes sent"""
        try:
            net_io = psutil.net_io_counters()
            yield metrics.Observation(
                net_io.bytes_sent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect network sent metrics: {e}")

    def _observe_network_io_recv(self, options):
        """Observe network bytes received"""
        try:
            net_io = psutil.net_io_counters()
            yield metrics.Observation(
                net_io.bytes_recv,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect network recv metrics: {e}")


class ApplicationMetricsCollector:
    """
    Collects application-level metrics (HTTP requests, database connections, etc.)
    using the OpenTelemetry metrics API only (no Prometheus).
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        self.service_name = service_name

        # Use the provided meter provider or fall back to the global one
        if meter_provider:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # HTTP metrics
        self.http_requests = self.meter.create_counter(
            name="http.server.requests",
            description="Total HTTP requests",
            unit="requests"
        )

        self.http_request_duration = self.meter.create_histogram(
            name="http.server.request.duration",
            description="HTTP request duration",
            unit="ms"
        )

        self.http_active_requests = self.meter.create_up_down_counter(
            name="http.server.active_requests",
            description="Active HTTP requests",
            unit="requests"
        )

        # Database metrics
        self.db_connections = self.meter.create_up_down_counter(
            name="db.client.connections.usage",
            description="Database connections in use",
            unit="connections"
        )

        self.db_query_duration = self.meter.create_histogram(
            name="db.client.operation.duration",
            description="Database query duration",
            unit="ms"
        )

        logger.info(
            "Application metrics collector initialized",
            service=service_name
        )

    def record_http_request(
        self,
        method: str,
        endpoint: str,
        status_code: int,
        duration_ms: float
    ):
        """Record an HTTP request"""
        attributes = {
            "service": self.service_name,
            "http.method": method,
            "http.route": endpoint,
            "http.status_code": status_code
        }

        self.http_requests.add(1, attributes)
        self.http_request_duration.record(duration_ms, attributes)

    def increment_active_requests(self):
        """Increment the active request count"""
        self.http_active_requests.add(1, {"service": self.service_name})

    def decrement_active_requests(self):
        """Decrement the active request count"""
        self.http_active_requests.add(-1, {"service": self.service_name})

    def set_db_connections(self, count: int, state: str = "used"):
        """Set the database connection count"""
        self.db_connections.add(
            count,
            {"service": self.service_name, "state": state}
        )

    def record_db_query(self, operation: str, duration_ms: float, table: str = ""):
        """Record a database query"""
        attributes = {
            "service": self.service_name,
            "db.operation": operation
        }
        if table:
            attributes["db.table"] = table

        self.db_query_duration.record(duration_ms, attributes)
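
    # Sketch of timing a query and recording it (run_query is a placeholder
    # for your actual DB call):
    #
    #     import time
    #
    #     start = time.perf_counter()
    #     rows = run_query("SELECT * FROM users")
    #     elapsed_ms = (time.perf_counter() - start) * 1000.0
    #     app_metrics.record_db_query("select", elapsed_ms, table="users")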


def setup_all_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
    """
    Set up both system and application metrics collection.

    Args:
        service_name: Name of the service
        service_version: Version of the service
        meter_provider: Optional meter provider (the global one is used if not provided)

    Returns:
        Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)

    Example:
        from shared.monitoring.system_metrics import setup_all_metrics

        system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")

        # System metrics are collected automatically.
        # Use app_metrics to record custom application events:
        app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
    """
    system_metrics = SystemMetricsCollector(service_name, service_version, meter_provider)
    app_metrics = ApplicationMetricsCollector(service_name, service_version, meter_provider)

    logger.info(
        "All metrics collectors initialized",
        service=service_name,
        collectors=["system", "application"]
    )

    return system_metrics, app_metrics
271
shared/monitoring/telemetry.py
Normal file
@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup

Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation

All signals are exported to SigNoz via OTLP.
"""

import os
import structlog
from typing import Optional, Dict, Any
from dataclasses import dataclass

from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector

logger = structlog.get_logger()


@dataclass
class TelemetryProviders:
    """
    Container for all OpenTelemetry providers and collectors.

    Attributes:
        tracer_provider: Provider for distributed tracing
        meter_provider: Provider for metrics export
        logging_handler: Handler for structured logs
        system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
        app_metrics: Collector for application-level metrics (HTTP, DB)
    """
    tracer_provider: Optional[Any] = None
    meter_provider: Optional[Any] = None
    logging_handler: Optional[Any] = None
    system_metrics: Optional[SystemMetricsCollector] = None
    app_metrics: Optional[ApplicationMetricsCollector] = None


def setup_telemetry(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    enable_traces: bool = True,
    enable_metrics: bool = True,
    enable_logs: bool = True,
    enable_system_metrics: bool = True,
    metrics_protocol: Optional[str] = None,  # "grpc" or "http", defaults to grpc
    export_interval_millis: int = 60000
) -> TelemetryProviders:
    """
    Set up all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.

    This is the UNIFIED setup function that configures everything:
    - Distributed tracing (gRPC, port 4317)
    - Metrics export (gRPC by default, port 4317)
    - System metrics collection (CPU, memory, disk, network)
    - Application metrics (HTTP requests, DB queries)
    - Structured logs export (HTTP, port 4318)

    All signals use the centralized OTelConfig for endpoint management.

    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        enable_traces: Enable distributed tracing (default: True)
        enable_metrics: Enable metrics export to OTLP (default: True)
        enable_logs: Enable logs export to OTLP (default: True)
        enable_system_metrics: Enable system metrics collection (default: True;
            can also be disabled via the ENABLE_SYSTEM_METRICS env var)
        metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
        export_interval_millis: How often to export metrics, in milliseconds

    Returns:
        TelemetryProviders containing all initialized providers and collectors

    Example:
        from shared.monitoring.telemetry import setup_telemetry

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry(
            app,
            service_name="auth-service",
            service_version="1.0.0"
        )

        # All telemetry is now configured:
        # - Traces automatically captured for HTTP requests
        # - System metrics automatically collected
        # - Application metrics via providers.app_metrics
        # - Logs automatically correlated with traces
    """

    logger.info(
        "Setting up unified OpenTelemetry telemetry",
        service=service_name,
        version=service_version,
        traces=enable_traces,
        metrics=enable_metrics,
        logs=enable_logs,
        system_metrics=enable_system_metrics
    )

    providers = TelemetryProviders()

    # Set up distributed tracing
    if enable_traces and OTelConfig.is_enabled("traces"):
        try:
            providers.tracer_provider = setup_tracing(
                app,
                service_name=service_name,
                service_version=service_version
            )
            if providers.tracer_provider:
                logger.info("✓ Distributed tracing configured", service=service_name)
            else:
                logger.warning("✗ Distributed tracing setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))

    # Set up OTLP metrics export
    if enable_metrics and OTelConfig.is_enabled("metrics"):
        try:
            providers.meter_provider = setup_otel_metrics(
                service_name=service_name,
                service_version=service_version,
                protocol=metrics_protocol,
                export_interval_millis=export_interval_millis
            )
            if providers.meter_provider:
                logger.info("✓ OTLP metrics export configured", service=service_name)

                # Set up the system and application metrics collectors
                if enable_system_metrics:
                    enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
                    if enable_system_env:
                        try:
                            providers.system_metrics, providers.app_metrics = setup_all_metrics(
                                service_name=service_name,
                                service_version=service_version,
                                meter_provider=providers.meter_provider
                            )
                            logger.info(
                                "✓ System and application metrics collectors initialized",
                                service=service_name,
                                system_metrics=["cpu", "memory", "disk", "network"],
                                app_metrics=["http_requests", "db_queries"]
                            )
                        except Exception as e:
                            logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
            else:
                logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))

    # Set up logs export
    if enable_logs and OTelConfig.is_enabled("logs"):
        try:
            providers.logging_handler = setup_otel_logging(
                service_name=service_name,
                service_version=service_version
            )
            if providers.logging_handler:
                logger.info("✓ Structured logs export configured", service=service_name)
            else:
                logger.warning("✗ Logs export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))

    # Log a summary of the endpoint configuration
    try:
        endpoints = OTelConfig.get_endpoints()
        summary = {
            "service": service_name,
            "version": service_version,
            "traces": {
                "enabled": bool(providers.tracer_provider),
                "endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
            },
            "metrics": {
                "enabled": bool(providers.meter_provider),
                "endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
                "system_metrics": bool(providers.system_metrics),
                "app_metrics": bool(providers.app_metrics)
            },
            "logs": {
                "enabled": bool(providers.logging_handler),
                "endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
            }
        }
        logger.info("🎉 Telemetry setup complete", **summary)
    except Exception as e:
        logger.warning("Could not log endpoint summary", error=str(e))

    return providers


def setup_telemetry_simple(
    app,
    service_name: str,
    service_version: str = "1.0.0"
) -> TelemetryProviders:
    """
    Simplified telemetry setup with all defaults.

    Uses:
    - gRPC for traces (port 4317)
    - gRPC for metrics (port 4317)
    - HTTP for logs (port 4318)

    All settings are read from environment variables and OTelConfig.

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        service_version: Version of the service

    Returns:
        TelemetryProviders containing all initialized providers

    Example:
        from shared.monitoring.telemetry import setup_telemetry_simple

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry_simple(app, "auth-service")
    """
    return setup_telemetry(
        app=app,
        service_name=service_name,
        service_version=service_version
    )


def get_telemetry_status() -> Dict[str, Any]:
    """
    Get the current telemetry configuration status.

    Returns:
        Dictionary with telemetry status information

    Example:
        from shared.monitoring.telemetry import get_telemetry_status

        status = get_telemetry_status()
        print(f"Tracing enabled: {status['traces']['enabled']}")
    """
    endpoints = OTelConfig.get_endpoints()

    return {
        "traces": {
            "enabled": OTelConfig.is_enabled("traces"),
            "protocol": "grpc",
            "endpoint": endpoints.traces_grpc
        },
        "metrics": {
            "enabled": OTelConfig.is_enabled("metrics"),
            "protocol": OTelConfig.get_protocol("metrics"),
            "grpc_endpoint": endpoints.metrics_grpc,
            "http_endpoint": endpoints.metrics_http
        },
        "logs": {
            "enabled": OTelConfig.is_enabled("logs"),
            "protocol": "http",
            "endpoint": endpoints.logs_http
        }
    }
227
shared/monitoring/tracing.py
Executable file
@@ -0,0 +1,227 @@
"""
OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""

import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

# Optional instrumentations (may not be installed in all services)
try:
    from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
    HTTPX_AVAILABLE = True
except ImportError:
    HTTPX_AVAILABLE = False

try:
    from opentelemetry.instrumentation.redis import RedisInstrumentor
    REDIS_AVAILABLE = True
except ImportError:
    REDIS_AVAILABLE = False

try:
    from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
    SQLALCHEMY_AVAILABLE = True
except ImportError:
    SQLALCHEMY_AVAILABLE = False

from .otel_config import OTelConfig

logger = structlog.get_logger()


def setup_tracing(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    otel_endpoint: Optional[str] = None
) -> Optional[TracerProvider]:
    """
    Set up OpenTelemetry distributed tracing for a FastAPI service.

    Automatically instruments:
    - FastAPI endpoints
    - HTTPX client requests (inter-service calls)
    - Redis operations
    - PostgreSQL/SQLAlchemy queries

    Uses the gRPC protocol (port 4317) for sending traces to SigNoz.

    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        otel_endpoint: Optional override for the OTLP endpoint (gRPC format: host:port)

    Returns:
        TracerProvider instance if successful, None otherwise

    Example:
        from shared.monitoring.tracing import setup_tracing

        app = FastAPI(title="Auth Service")
        tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
    """

    # Check if tracing is enabled
    if not OTelConfig.is_enabled("traces"):
        logger.info(
            "Distributed tracing disabled",
            service=service_name,
            reason="ENABLE_TRACING not set to 'true'"
        )
        return None

    try:
        # Get endpoints from the centralized config
        endpoints = OTelConfig.get_endpoints()

        # Use the provided endpoint or fall back to the config
        if otel_endpoint:
            # Clean the user-provided endpoint for gRPC
            grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
        else:
            grpc_endpoint = endpoints.traces_grpc

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure the tracer provider
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)

        # Configure the OTLP gRPC exporter for traces
        otlp_exporter = OTLPSpanExporter(
            endpoint=grpc_endpoint,
            insecure=True  # Set insecure=False in production with proper TLS
        )

        # Add a span processor with batching for performance
        span_processor = BatchSpanProcessor(otlp_exporter)
        tracer_provider.add_span_processor(span_processor)

        # Auto-instrument FastAPI
        FastAPIInstrumentor.instrument_app(
            app,
            tracer_provider=tracer_provider,
            excluded_urls="health,metrics"  # Don't trace health/metrics endpoints
        )

        # Auto-instrument HTTPX (inter-service communication) if available
        if HTTPX_AVAILABLE:
            try:
                HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
                logger.debug("HTTPX instrumentation enabled")
            except Exception as e:
                logger.warning(f"Failed to instrument HTTPX: {e}")

        # Auto-instrument Redis if available
        if REDIS_AVAILABLE:
            try:
                RedisInstrumentor().instrument(tracer_provider=tracer_provider)
                logger.debug("Redis instrumentation enabled")
            except Exception as e:
                logger.warning(f"Failed to instrument Redis: {e}")

        # Auto-instrument SQLAlchemy if available
        if SQLALCHEMY_AVAILABLE:
            try:
                SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
                logger.debug("SQLAlchemy instrumentation enabled")
            except Exception as e:
                logger.warning(f"Failed to instrument SQLAlchemy: {e}")

        logger.info(
            "Distributed tracing configured successfully",
            service=service_name,
            grpc_endpoint=grpc_endpoint,
            protocol="grpc"
        )

        return tracer_provider

    except Exception as e:
        logger.error(
            "Failed to setup tracing - continuing without it",
            service=service_name,
            error=str(e)
        )
        return None


def get_current_trace_id() -> Optional[str]:
    """
    Get the current trace ID for correlation with logs.

    Returns:
        Trace ID as a hex string, or None if there is no active trace
    """
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        return format(span.get_span_context().trace_id, '032x')
    return None
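
# Usage sketch: attaching the active trace id to a structlog event so the
# log line can be correlated with its trace in SigNoz:
#
#     logger.info("processing request", trace_id=get_current_trace_id())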


def get_current_span_id() -> Optional[str]:
    """
    Get the current span ID.

    Returns:
        Span ID as a hex string, or None if there is no active span
    """
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        return format(span.get_span_context().span_id, '016x')
    return None


def add_trace_attributes(**attributes):
    """
    Add custom attributes to the current span.

    Example:
        add_trace_attributes(
            user_id="123",
            tenant_id="abc",
            operation="user_registration"
        )
    """
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        for key, value in attributes.items():
            span.set_attribute(key, str(value))


def add_trace_event(name: str, **attributes):
    """
    Add an event to the current span (for important operations).

    Example:
        add_trace_event("user_authenticated", user_id="123", method="jwt")
    """
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        span.add_event(name, attributes)


def record_exception(exception: Exception):
    """
    Record an exception in the current span and mark the span as errored.

    Args:
        exception: The exception to record
    """
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        span.record_exception(exception)
        span.set_status(trace.Status(trace.StatusCode.ERROR, str(exception)))
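
# Usage sketch: recording a failure on the active span before re-raising
# (charge_card is a placeholder for the operation being traced):
#
#     try:
#         charge_card(order)
#     except Exception as exc:
#         record_exception(exc)
#         raise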