Initial commit - production deployment

commit c23d00dd92
2026-01-21 17:17:16 +01:00
2289 changed files with 638440 additions and 0 deletions

shared/monitoring/__init__.py Executable file

@@ -0,0 +1,97 @@
"""
Shared monitoring package for microservices
Provides unified OpenTelemetry-based observability:
- Traces: Distributed tracing
- Metrics: System and application metrics
- Logs: Structured logging
All signals exported to SigNoz via OTLP.
"""
# Core setup - START HERE
from .logging import setup_logging
from .telemetry import (
setup_telemetry,
setup_telemetry_simple,
get_telemetry_status,
TelemetryProviders
)
# Configuration
from .otel_config import OTelConfig, OTelEndpoints
# Individual signal setup (used by telemetry.py)
from .tracing import (
setup_tracing,
get_current_trace_id,
get_current_span_id,
add_trace_attributes,
add_trace_event,
record_exception
)
from .logs_exporter import (
setup_otel_logging,
add_log_context,
get_current_trace_context,
StructlogOTELProcessor
)
from .metrics_exporter import (
setup_otel_metrics,
OTelMetricsCollector,
create_dual_metrics_collector
)
from .system_metrics import (
SystemMetricsCollector,
ApplicationMetricsCollector,
setup_all_metrics
)
# Health checks
from .health_checks import (
HealthCheckManager,
FastAPIHealthChecker,
create_health_manager,
setup_fastapi_health_checks
)
__all__ = [
# CORE - Start with these
'setup_logging',
'setup_telemetry',
'setup_telemetry_simple',
'get_telemetry_status',
'TelemetryProviders',
# Configuration
'OTelConfig',
'OTelEndpoints',
# Tracing
'setup_tracing',
'get_current_trace_id',
'get_current_span_id',
'add_trace_attributes',
'add_trace_event',
'record_exception',
# Logs
'setup_otel_logging',
'add_log_context',
'get_current_trace_context',
'StructlogOTELProcessor',
# Metrics
'setup_otel_metrics',
'OTelMetricsCollector',
'create_dual_metrics_collector',
'SystemMetricsCollector',
'ApplicationMetricsCollector',
'setup_all_metrics',
# Health checks
'HealthCheckManager',
'FastAPIHealthChecker',
'create_health_manager',
'setup_fastapi_health_checks',
]
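
A minimal bootstrap sketch for a consumer of this package; the FastAPI app and the "orders-service" name are illustrative, and only names exported above are used:

from fastapi import FastAPI
from shared.monitoring import setup_logging, setup_fastapi_health_checks

app = FastAPI(title="Orders Service")

# Configure logging first so later setup steps are captured in the logs
setup_logging("orders-service", log_level="INFO")

# Registers the /health, /health/ready, /health/live and /health/database routes
health_manager = setup_fastapi_health_checks(app, service_name="orders-service")
health_manager.set_ready(True)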

shared/monitoring/decorators.py Executable file

@@ -0,0 +1,179 @@
# ================================================================
# shared/monitoring/decorators.py
# ================================================================
"""
Decorators for monitoring and metrics
"""
import asyncio
import functools
import logging
import time
from typing import Callable, Any, Optional
from .metrics import get_metrics_collector
logger = logging.getLogger(__name__)
def track_execution_time(metric_name: str, service_name: str,
labels: Optional[dict] = None):
"""Decorator to track function execution time"""
def decorator(func: Callable) -> Callable:
@functools.wraps(func)
async def async_wrapper(*args, **kwargs) -> Any:
start_time = time.time()
try:
result = await func(*args, **kwargs)
duration = time.time() - start_time
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
metrics_collector.observe_histogram(metric_name, duration, labels)
return result
except Exception as e:
duration = time.time() - start_time
logger.error(f"Function {func.__name__} failed after {duration:.2f}s: {e}")
raise
@functools.wraps(func)
def sync_wrapper(*args, **kwargs) -> Any:
start_time = time.time()
try:
result = func(*args, **kwargs)
duration = time.time() - start_time
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
metrics_collector.observe_histogram(metric_name, duration, labels)
return result
except Exception as e:
duration = time.time() - start_time
logger.error(f"Function {func.__name__} failed after {duration:.2f}s: {e}")
raise
        # Return appropriate wrapper based on function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper
return decorator
def count_calls(metric_name: str, service_name: str,
labels: Optional[dict] = None):
"""Decorator to count function calls"""
def decorator(func: Callable) -> Callable:
@functools.wraps(func)
async def async_wrapper(*args, **kwargs) -> Any:
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
metrics_collector.increment_counter(metric_name, labels=labels)
return await func(*args, **kwargs)
@functools.wraps(func)
def sync_wrapper(*args, **kwargs) -> Any:
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
metrics_collector.increment_counter(metric_name, labels=labels)
return func(*args, **kwargs)
        # Return appropriate wrapper based on function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper
return decorator
def monitor_performance(operation_name: str, labels: Optional[dict] = None):
"""
General purpose performance monitoring decorator
Tracks execution time and call counts for the given operation
"""
def decorator(func: Callable) -> Callable:
@functools.wraps(func)
async def async_wrapper(*args, **kwargs) -> Any:
start_time = time.time()
service_name = "orders-service" # Could be dynamic based on context
try:
# Count the call
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
call_labels = {**(labels or {}), "operation": operation_name}
metrics_collector.increment_counter(f"{service_name}_operations_total", labels=call_labels)
# Execute the function
result = await func(*args, **kwargs)
# Record success timing
duration = time.time() - start_time
if metrics_collector:
timing_labels = {**(labels or {}), "operation": operation_name, "status": "success"}
metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)
return result
except Exception as e:
# Record failure timing
duration = time.time() - start_time
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
timing_labels = {**(labels or {}), "operation": operation_name, "status": "error"}
metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)
error_labels = {**(labels or {}), "operation": operation_name, "error_type": type(e).__name__}
metrics_collector.increment_counter(f"{service_name}_errors_total", labels=error_labels)
logger.error(f"Operation {operation_name} failed after {duration:.2f}s: {e}")
raise
@functools.wraps(func)
def sync_wrapper(*args, **kwargs) -> Any:
start_time = time.time()
service_name = "orders-service" # Could be dynamic based on context
try:
# Count the call
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
call_labels = {**(labels or {}), "operation": operation_name}
metrics_collector.increment_counter(f"{service_name}_operations_total", labels=call_labels)
# Execute the function
result = func(*args, **kwargs)
# Record success timing
duration = time.time() - start_time
if metrics_collector:
timing_labels = {**(labels or {}), "operation": operation_name, "status": "success"}
metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)
return result
except Exception as e:
# Record failure timing
duration = time.time() - start_time
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
timing_labels = {**(labels or {}), "operation": operation_name, "status": "error"}
metrics_collector.observe_histogram(f"{service_name}_operation_duration_seconds", duration, timing_labels)
error_labels = {**(labels or {}), "operation": operation_name, "error_type": type(e).__name__}
metrics_collector.increment_counter(f"{service_name}_errors_total", labels=error_labels)
logger.error(f"Operation {operation_name} failed after {duration:.2f}s: {e}")
raise
        # Return appropriate wrapper based on function type
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper
return decorator
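
A usage sketch for the decorators above; metric, operation, and service names are illustrative, and the counter/histogram instruments are assumed to be registered on the "orders-service" collector beforehand (otherwise the collector logs an error and drops the data point):

from shared.monitoring.decorators import track_execution_time, count_calls, monitor_performance

@track_execution_time("order_lookup_seconds", "orders-service")
async def fetch_order(order_id: str) -> dict:
    # Placeholder body; a real handler would query its database here
    return {"id": order_id}

@count_calls("order_parse_total", "orders-service")
def parse_order(payload: dict) -> dict:
    return payload

@monitor_performance("create_order", labels={"channel": "api"})
async def create_order(payload: dict) -> dict:
    return payload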

shared/monitoring/health.py Executable file

@@ -0,0 +1,176 @@
# ================================================================
# shared/monitoring/health.py
# ================================================================
"""
Health check utilities for microservices
"""
import asyncio
import logging
import time
from typing import Dict, List, Callable, Any, Optional
from dataclasses import dataclass
from enum import Enum
from fastapi import APIRouter
logger = logging.getLogger(__name__)
class HealthStatus(Enum):
HEALTHY = "healthy"
DEGRADED = "degraded"
UNHEALTHY = "unhealthy"
@dataclass
class HealthCheck:
name: str
check_function: Callable[[], Any]
timeout: float = 5.0
critical: bool = True
@dataclass
class HealthResult:
name: str
status: HealthStatus
message: str
duration: float
timestamp: float
class HealthChecker:
"""Health checker for microservices"""
def __init__(self, service_name: str):
self.service_name = service_name
self.checks: List[HealthCheck] = []
self.start_time = time.time()
def add_check(self, name: str, check_function: Callable, timeout: float = 5.0,
critical: bool = True) -> None:
"""Add a health check"""
self.checks.append(HealthCheck(name, check_function, timeout, critical))
async def run_check(self, check: HealthCheck) -> HealthResult:
"""Run a single health check"""
start_time = time.time()
try:
# Run the check with timeout
result = await asyncio.wait_for(
asyncio.create_task(self._execute_check(check.check_function)),
timeout=check.timeout
)
duration = time.time() - start_time
if result is True or (isinstance(result, dict) and result.get('healthy', False)):
return HealthResult(
name=check.name,
status=HealthStatus.HEALTHY,
message="OK",
duration=duration,
timestamp=time.time()
)
else:
message = str(result) if result else "Check failed"
return HealthResult(
name=check.name,
status=HealthStatus.UNHEALTHY,
message=message,
duration=duration,
timestamp=time.time()
)
except asyncio.TimeoutError:
duration = time.time() - start_time
return HealthResult(
name=check.name,
status=HealthStatus.UNHEALTHY,
message=f"Timeout after {check.timeout}s",
duration=duration,
timestamp=time.time()
)
except Exception as e:
duration = time.time() - start_time
return HealthResult(
name=check.name,
status=HealthStatus.UNHEALTHY,
message=f"Error: {str(e)}",
duration=duration,
timestamp=time.time()
)
async def _execute_check(self, check_function: Callable) -> Any:
"""Execute a check function (handles both sync and async)"""
if asyncio.iscoroutinefunction(check_function):
return await check_function()
else:
return check_function()
async def check_health(self) -> Dict[str, Any]:
"""Run all health checks and return status"""
if not self.checks:
return {
"service": self.service_name,
"status": HealthStatus.HEALTHY.value,
"uptime": time.time() - self.start_time,
"timestamp": time.time(),
"checks": {}
}
# Run all checks concurrently
results = await asyncio.gather(
*[self.run_check(check) for check in self.checks],
return_exceptions=True
)
# Process results
check_results = {}
overall_status = HealthStatus.HEALTHY
for i, result in enumerate(results):
check = self.checks[i]
if isinstance(result, Exception):
check_result = HealthResult(
name=check.name,
status=HealthStatus.UNHEALTHY,
message=f"Exception: {str(result)}",
duration=0.0,
timestamp=time.time()
)
else:
check_result = result
check_results[check.name] = {
"status": check_result.status.value,
"message": check_result.message,
"duration": check_result.duration,
"timestamp": check_result.timestamp
}
# Determine overall status
if check.critical and check_result.status == HealthStatus.UNHEALTHY:
overall_status = HealthStatus.UNHEALTHY
elif check_result.status == HealthStatus.DEGRADED and overall_status == HealthStatus.HEALTHY:
overall_status = HealthStatus.DEGRADED
return {
"service": self.service_name,
"status": overall_status.value,
"uptime": time.time() - self.start_time,
"timestamp": time.time(),
"checks": check_results
}
# Module-level FastAPI router exposing a basic health endpoint
router = APIRouter()

@router.get("/")
async def health_check():
    """Basic health check endpoint"""
    return {
        "service": "service",  # static placeholder; use HealthChecker for per-service detail
        "status": "healthy",
        "timestamp": time.time()
    }
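
A minimal sketch of the HealthChecker above; the service name and check bodies are illustrative placeholders:

import asyncio
from shared.monitoring.health import HealthChecker

async def check_database() -> bool:
    return True  # replace with a real connectivity probe

def check_disk() -> dict:
    return {"healthy": True}  # dict results with a "healthy" key are also accepted

checker = HealthChecker("orders-service")
checker.add_check("database", check_database, timeout=2.0, critical=True)
checker.add_check("disk", check_disk, critical=False)

report = asyncio.run(checker.check_health())
print(report["status"], list(report["checks"]))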

shared/monitoring/health_checks.py

@@ -0,0 +1,439 @@
"""
Enhanced Health Check System for Microservices
Provides unified health check endpoints and database verification based on
the comprehensive implementation from the training service.
"""
from typing import Dict, Any, List, Optional, Callable
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import text, inspect
from fastapi import HTTPException
from fastapi.responses import JSONResponse
import structlog
import time
import datetime
from ..database.base import DatabaseManager
from ..database.exceptions import DatabaseError, HealthCheckError
logger = structlog.get_logger()
class HealthCheckManager:
"""
Unified health check manager for microservices
Provides standardized health check endpoints:
- /health - Basic service health
- /health/ready - Kubernetes readiness probe with comprehensive checks
- /health/live - Kubernetes liveness probe
- /health/database - Detailed database health information
"""
def __init__(
self,
service_name: str,
version: str = "1.0.0",
database_manager: Optional[DatabaseManager] = None,
expected_tables: Optional[List[str]] = None,
custom_checks: Optional[Dict[str, Callable]] = None
):
self.service_name = service_name
self.version = version
self.database_manager = database_manager
self.expected_tables = expected_tables or []
self.custom_checks = custom_checks or {}
self.ready_state = False
def set_ready(self, ready: bool = True):
"""Set service ready state"""
self.ready_state = ready
logger.info(f"Service ready state changed",
service=self.service_name, ready=ready)
async def basic_health_check(self, app_state=None) -> Dict[str, Any]:
"""Basic health check endpoint (/health)"""
# Check app state for ready status if available
ready = self.ready_state
if app_state and hasattr(app_state, 'ready'):
ready = app_state.ready
return {
"status": "healthy" if ready else "starting",
"service": self.service_name,
"version": self.version,
"timestamp": datetime.datetime.utcnow().isoformat()
}
async def readiness_check(self, app_state=None) -> Dict[str, Any]:
"""
Kubernetes readiness probe endpoint (/health/ready)
Returns 200 if ready, 503 if not ready
"""
try:
# Check app state for ready status if available
ready = self.ready_state
if app_state and hasattr(app_state, 'ready'):
ready = app_state.ready
checks = {
"application": ready
}
database_details = {}
# Database connectivity and table verification
if self.database_manager:
db_health = await self._get_comprehensive_db_health()
checks["database_connectivity"] = db_health["connectivity"]
checks["database_tables"] = db_health["tables_exist"]
database_details = {
"status": db_health["status"],
"tables_verified": db_health["tables_verified"],
"missing_tables": db_health["missing_tables"],
"errors": db_health["errors"]
}
# Execute custom checks
for check_name, check_func in self.custom_checks.items():
try:
checks[check_name] = await check_func()
except Exception as e:
checks[check_name] = False
logger.error(f"Custom check '{check_name}' failed", error=str(e))
# Service is ready only if all checks pass
all_ready = all(checks.values())
if self.database_manager:
all_ready = all_ready and database_details.get("status") == "healthy"
response_data = {
"status": "ready" if all_ready else "not ready",
"checks": checks
}
if database_details:
response_data["database"] = database_details
if all_ready:
return response_data
else:
raise HTTPException(status_code=503, detail=response_data)
except HTTPException:
raise
except Exception as e:
logger.error("Readiness check failed", error=str(e))
raise HTTPException(
status_code=503,
detail={
"status": "not ready",
"error": f"Health check failed: {str(e)}"
}
)
async def liveness_check(self) -> Dict[str, Any]:
"""Kubernetes liveness probe endpoint (/health/live)"""
return {"status": "alive"}
async def database_health_check(self) -> Dict[str, Any]:
"""
Detailed database health endpoint (/health/database)
Returns 200 if healthy, 503 if unhealthy
"""
if not self.database_manager:
raise HTTPException(
status_code=404,
detail={"error": "Database health check not available"}
)
try:
db_health = await self._get_comprehensive_db_health()
status_code = 200 if db_health["status"] == "healthy" else 503
if status_code == 503:
raise HTTPException(status_code=503, detail=db_health)
return db_health
except HTTPException:
raise
except Exception as e:
logger.error("Database health check failed", error=str(e))
raise HTTPException(
status_code=503,
detail={
"status": "unhealthy",
"error": f"Health check failed: {str(e)}"
}
)
async def _get_comprehensive_db_health(self) -> Dict[str, Any]:
"""
Comprehensive database health check with table verification
Based on training service implementation
"""
health_status = {
"status": "healthy",
"connectivity": False,
"tables_exist": False,
"tables_verified": [],
"missing_tables": [],
"errors": [],
"connection_info": {},
"response_time_ms": 0
}
if not self.database_manager:
health_status["status"] = "unhealthy"
health_status["errors"].append("Database manager not configured")
return health_status
try:
# Test basic connectivity with timing
start_time = time.time()
health_status["connectivity"] = await self.database_manager.test_connection()
response_time = (time.time() - start_time) * 1000
health_status["response_time_ms"] = round(response_time, 2)
if not health_status["connectivity"]:
health_status["status"] = "unhealthy"
health_status["errors"].append("Database connectivity failed")
return health_status
# Get connection pool information
health_status["connection_info"] = await self.database_manager.get_connection_info()
# Check migration status
migration_status = await self._check_migration_status()
health_status.update(migration_status)
# Test table existence if expected tables are configured
if self.expected_tables:
tables_verified = await self._verify_tables_exist()
health_status["tables_exist"] = tables_verified
if tables_verified:
health_status["tables_verified"] = self.expected_tables.copy()
else:
health_status["status"] = "unhealthy"
health_status["errors"].append("Required tables missing or inaccessible")
# Identify which specific tables are missing
await self._identify_missing_tables(health_status)
else:
# If no expected tables configured, just mark as verified
health_status["tables_exist"] = True
logger.debug("Comprehensive database health check completed",
service=self.service_name,
status=health_status["status"],
connectivity=health_status["connectivity"],
tables_exist=health_status["tables_exist"])
except Exception as e:
health_status["status"] = "unhealthy"
health_status["errors"].append(f"Health check failed: {str(e)}")
logger.error("Comprehensive database health check failed",
service=self.service_name, error=str(e))
return health_status
async def _verify_tables_exist(self) -> bool:
"""Verify that all expected tables exist and are accessible"""
try:
async with self.database_manager.get_session() as session:
for table_name in self.expected_tables:
try:
await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
except Exception:
return False
return True
except Exception as e:
logger.error("Table verification failed", error=str(e))
return False
async def _identify_missing_tables(self, health_status: Dict[str, Any]):
"""Identify which specific tables are missing"""
try:
async with self.database_manager.get_session() as session:
for table_name in self.expected_tables:
try:
await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
health_status["tables_verified"].append(table_name)
except Exception:
health_status["missing_tables"].append(table_name)
except Exception as e:
health_status["errors"].append(f"Error checking individual tables: {str(e)}")
async def _check_migration_status(self) -> Dict[str, Any]:
"""Check database migration status"""
migration_info = {
"migration_version": None,
"migration_status": "unknown",
"migration_errors": []
}
try:
async with self.database_manager.get_session() as session:
# Check if alembic_version table exists
result = await session.execute(
text("SELECT version_num FROM alembic_version LIMIT 1")
)
version = result.scalar()
if version:
migration_info["migration_version"] = version
migration_info["migration_status"] = "healthy"
logger.debug(f"Migration version found: {version}", service=self.service_name)
else:
migration_info["migration_status"] = "no_version"
migration_info["migration_errors"].append("No migration version found in alembic_version table")
except Exception as e:
migration_info["migration_status"] = "error"
migration_info["migration_errors"].append(f"Migration check failed: {str(e)}")
logger.error("Migration status check failed", service=self.service_name, error=str(e))
return migration_info
class FastAPIHealthChecker:
"""
FastAPI integration for health checks
Provides router setup and endpoint registration
"""
def __init__(self, health_manager: HealthCheckManager):
self.health_manager = health_manager
def setup_health_routes(self, app):
"""Setup health check routes on FastAPI app"""
@app.get("/health")
async def health_check():
"""Basic health check endpoint"""
return await self.health_manager.basic_health_check(app.state)
@app.get("/health/ready")
async def readiness_check():
"""Kubernetes readiness probe endpoint"""
try:
return await self.health_manager.readiness_check(app.state)
except HTTPException as e:
return JSONResponse(
status_code=e.status_code,
content=e.detail
)
@app.get("/health/live")
async def liveness_check():
"""Kubernetes liveness probe endpoint"""
return await self.health_manager.liveness_check()
@app.get("/health/database")
async def database_health_check():
"""Detailed database health endpoint"""
try:
return await self.health_manager.database_health_check()
except HTTPException as e:
return JSONResponse(
status_code=e.status_code,
content=e.detail
)
# Convenience functions for easy integration
async def check_database_health(db_manager: DatabaseManager) -> Dict[str, Any]:
"""
Enhanced database health check with migration status
Args:
db_manager: DatabaseManager instance
Returns:
Dict containing database health status including migration version
"""
try:
async with db_manager.get_session() as session:
# Basic connectivity test
await session.execute(text("SELECT 1"))
# Get migration status
migration_status = await session.execute(text("SELECT version_num FROM alembic_version"))
version = migration_status.scalar()
return {
"database": "healthy",
"migration_version": version,
"connectivity": True
}
except Exception as e:
logger.error("Database health check failed", error=str(e))
return {
"database": "unhealthy",
"error": str(e),
"connectivity": False,
"migration_version": None
}
def create_health_manager(
service_name: str,
version: str = "1.0.0",
database_manager: Optional[DatabaseManager] = None,
expected_tables: Optional[List[str]] = None,
custom_checks: Optional[Dict[str, Callable]] = None
) -> HealthCheckManager:
"""Factory function to create a HealthCheckManager"""
return HealthCheckManager(
service_name=service_name,
version=version,
database_manager=database_manager,
expected_tables=expected_tables,
custom_checks=custom_checks
)
def setup_fastapi_health_checks(
app,
service_name: str,
version: str = "1.0.0",
database_manager: Optional[DatabaseManager] = None,
expected_tables: Optional[List[str]] = None,
custom_checks: Optional[Dict[str, Callable]] = None
) -> HealthCheckManager:
"""
Convenience function to setup health checks on a FastAPI app
Args:
app: FastAPI application instance
service_name: Name of the service
version: Service version
database_manager: Database manager instance
expected_tables: List of tables that should exist
custom_checks: Dict of custom check functions
Returns:
HealthCheckManager instance for further configuration
"""
health_manager = create_health_manager(
service_name=service_name,
version=version,
database_manager=database_manager,
expected_tables=expected_tables,
custom_checks=custom_checks
)
fastapi_checker = FastAPIHealthChecker(health_manager)
fastapi_checker.setup_health_routes(app)
return health_manager
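
A fuller integration sketch; the table names and cache probe are illustrative, and note that expected_tables only takes effect when a database_manager is supplied:

from fastapi import FastAPI
from shared.monitoring.health_checks import setup_fastapi_health_checks

app = FastAPI()

async def cache_ping() -> bool:
    return True  # replace with a real cache probe

# Pass your DatabaseManager instance as database_manager= to enable the
# connectivity, table, and migration checks implemented above
health_manager = setup_fastapi_health_checks(
    app,
    service_name="orders-service",
    version="1.2.0",
    expected_tables=["orders", "order_items"],
    custom_checks={"cache": cache_ping},
)

# Flip to ready once startup work (migrations, warmup) has finished
health_manager.set_ready(True)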

shared/monitoring/logging.py Executable file

@@ -0,0 +1,197 @@
# ================================================================
# shared/monitoring/logging.py
# ================================================================
"""
Centralized logging configuration for microservices
"""
import logging
import logging.config
import os
import sys
from typing import Dict, Any

try:
    import resource  # Unix-only; unavailable on Windows
except ImportError:
    resource = None
def setup_logging(service_name: str, log_level: str = "INFO",
enable_json: bool = False, enable_file: bool = True) -> None:
"""
Set up logging configuration for a microservice with improved error handling.
Args:
service_name: Name of the service for log identification
log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
enable_json: Whether to use JSON formatting
enable_file: Whether to enable file logging
"""
    # Check file descriptor limits (resource is unavailable on some platforms)
    if resource is not None:
        try:
            soft_limit, _hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
            if soft_limit < 1024:
                print(f"Warning: Low file descriptor limit ({soft_limit}). Consider increasing with 'ulimit -n'")
            if soft_limit < 256:
                print("Critical: File descriptor limit is very low. File logging may fail.")
                enable_file = False
        except Exception:
            pass
# Create logs directory if it doesn't exist and file logging is enabled
log_dir = "/var/log"
if enable_file:
try:
# First try to create/write to /var/log
test_file = os.path.join(log_dir, f".{service_name}_test")
with open(test_file, 'w') as f:
f.write("test")
os.remove(test_file)
except (PermissionError, OSError):
# Fallback to local directory if can't write to /var/log
log_dir = "./logs"
print(f"Warning: Could not write to /var/log, using {log_dir}")
try:
os.makedirs(log_dir, exist_ok=True)
except Exception as e:
print(f"Warning: Could not create log directory {log_dir}: {e}")
enable_file = False # Disable file logging if we can't create directory
# Define formatters
formatters = {
"standard": {
"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
"datefmt": "%Y-%m-%d %H:%M:%S"
},
"detailed": {
"format": "%(asctime)s [%(levelname)s] %(name)s [%(filename)s:%(lineno)d] %(funcName)s(): %(message)s",
"datefmt": "%Y-%m-%d %H:%M:%S"
}
}
# Add JSON formatter if requested and available
if enable_json:
try:
import pythonjsonlogger.jsonlogger
formatters["json"] = {
"()": "pythonjsonlogger.jsonlogger.JsonFormatter",
"format": "%(asctime)s %(name)s %(levelname)s %(message)s %(filename)s %(lineno)d"
}
except ImportError:
print("Warning: pythonjsonlogger not available, falling back to standard formatting")
enable_json = False
# Define handlers
handlers = {
"console": {
"class": "logging.StreamHandler",
"level": log_level,
"formatter": "json" if enable_json else "standard",
"stream": "ext://sys.stdout"
}
}
# Add file handler if enabled
if enable_file:
try:
# Test if we can actually write to the log file location
test_filename = f"{log_dir}/{service_name}.log"
test_dir = os.path.dirname(test_filename)
if not os.access(test_dir, os.W_OK):
print(f"Warning: Cannot write to log directory {test_dir}, disabling file logging")
enable_file = False
else:
handlers["file"] = {
"class": "logging.FileHandler",
"level": log_level,
"formatter": "detailed",
"filename": test_filename,
"mode": "a",
"encoding": "utf-8"
}
except Exception as e:
print(f"Warning: Could not configure file handler: {e}")
enable_file = False
# Add logstash handler if in production
logstash_host = os.getenv("LOGSTASH_HOST")
if logstash_host and os.getenv("ENVIRONMENT") == "production":
try:
handlers["logstash"] = {
"class": "logstash.TCPLogstashHandler",
"host": logstash_host,
"port": int(os.getenv("LOGSTASH_PORT", "5000")),
"version": 1,
"message_type": "logstash",
"fqdn": False,
"tags": [service_name]
}
except Exception as e:
print(f"Warning: Could not setup logstash handler: {e}")
# Define root logger configuration
root_handlers = ["console"]
if enable_file:
root_handlers.append("file")
if "logstash" in handlers:
root_handlers.append("logstash")
# Complete logging configuration
config: Dict[str, Any] = {
"version": 1,
"disable_existing_loggers": False,
"formatters": formatters,
"handlers": handlers,
"loggers": {
"": { # Root logger
"handlers": root_handlers,
"level": log_level,
"propagate": False
},
"uvicorn": {
"handlers": ["console"],
"level": log_level,
"propagate": False
},
"uvicorn.access": {
"handlers": ["console"],
"level": log_level,
"propagate": False
},
"sqlalchemy": {
"handlers": ["console"],
"level": "WARNING", # Reduce SQL logging noise
"propagate": False
},
"httpx": {
"handlers": ["console"],
"level": "WARNING", # Reduce HTTP client logging
"propagate": False
}
}
}
try:
logging.config.dictConfig(config)
logger = logging.getLogger(__name__)
logger.info(f"Logging configured for {service_name} at level {log_level}")
if enable_file:
logger.info(f"File logging enabled at {log_dir}/{service_name}.log")
else:
logger.info("File logging disabled")
except Exception as e:
# Fallback to basic logging if configuration fails
logging.basicConfig(
level=getattr(logging, log_level.upper()),
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
logger.error(f"Failed to configure advanced logging for {service_name}: {e}")
logger.info(f"Using basic logging configuration for {service_name}")
# Additional debugging for file handler issues
if "file" in str(e).lower() or "handler" in str(e).lower():
logger.error(f"File handler configuration failed. Check permissions for {log_dir}")
logger.error(f"Current working directory: {os.getcwd()}")
logger.error(f"Attempting to write to: {log_dir}/{service_name}.log")

shared/monitoring/logs_exporter.py

@@ -0,0 +1,221 @@
"""
OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
"""
import os
import logging
import structlog
from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource
# Try to import HTTP log exporter (logs always use HTTP)
try:
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
try:
from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
OTLPLogExporter = None
HTTP_LOG_EXPORTER_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
def setup_otel_logging(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
enable_console: bool = True
) -> Optional[LoggingHandler]:
"""
Setup OpenTelemetry logging to export logs to SigNoz.
Uses HTTP protocol (port 4318) for sending logs to SigNoz.
Integrates with Python's standard logging to automatically export
all log records to SigNoz via the OTLP HTTP protocol.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
enable_console: Whether to also log to console (default: True)
Returns:
LoggingHandler instance if successful, None otherwise
Example:
from shared.monitoring.logs_exporter import setup_otel_logging
# Setup during service initialization
handler = setup_otel_logging("auth-service", "1.0.0")
# Now all standard logging calls will be exported to SigNoz
import logging
logger = logging.getLogger(__name__)
logger.info("This will appear in SigNoz!")
"""
# Check if logging export is enabled
if not OTelConfig.is_enabled("logs"):
logger.info(
"OpenTelemetry logs export disabled",
service=service_name,
reason="OTEL_LOGS_EXPORTER not set to 'otlp'"
)
return None
# Check if HTTP log exporter is available
if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
logger.warning(
"OpenTelemetry HTTP log exporter not available",
service=service_name,
reason="opentelemetry-exporter-otlp-proto-http package not installed"
)
return None
try:
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Use provided endpoint or get from config
if otel_endpoint:
http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
else:
http_endpoint = endpoints.logs_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure logger provider
logger_provider = LoggerProvider(resource=resource)
set_logger_provider(logger_provider)
# Configure OTLP HTTP exporter for logs
otlp_exporter = OTLPLogExporter(
endpoint=http_endpoint,
timeout=10
)
# Add log record processor with batching
log_processor = BatchLogRecordProcessor(otlp_exporter)
logger_provider.add_log_record_processor(log_processor)
# Create logging handler that bridges standard logging to OpenTelemetry
otel_handler = LoggingHandler(
level=logging.NOTSET, # Capture all levels
logger_provider=logger_provider
)
# Add handler to root logger
root_logger = logging.getLogger()
root_logger.addHandler(otel_handler)
logger.info(
"OpenTelemetry logs export configured successfully",
service=service_name,
http_endpoint=http_endpoint,
protocol="http",
console_logging=enable_console
)
return otel_handler
except Exception as e:
logger.error(
"Failed to setup OpenTelemetry logs export",
service=service_name,
error=str(e)
)
return None
def add_log_context(**context):
"""
Add contextual information to logs that will be sent to SigNoz.
This is useful for adding request IDs, user IDs, tenant IDs, etc.
that help with filtering and correlation in SigNoz.
Args:
**context: Key-value pairs to add to log context
Example:
from shared.monitoring.logs_exporter import add_log_context
# Add context for current request
add_log_context(
request_id="req_123",
user_id="user_456",
tenant_id="tenant_789"
)
# Now all logs will include this context
logger.info("Processing order") # Will include request_id, user_id, tenant_id
"""
# This works with structlog's context binding
bound_logger = structlog.get_logger()
return bound_logger.bind(**context)
def get_current_trace_context() -> dict:
"""
Get current trace context for log correlation.
Returns a dict with trace_id and span_id if available,
which can be added to log records for correlation with traces.
Returns:
Dict with trace_id and span_id, or empty dict if no active trace
Example:
from shared.monitoring.logs_exporter import get_current_trace_context
# Get trace context and add to logs
trace_ctx = get_current_trace_context()
logger.info("Processing request", **trace_ctx)
"""
from opentelemetry import trace
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
return {
"trace_id": format(span.get_span_context().trace_id, '032x'),
"span_id": format(span.get_span_context().span_id, '016x'),
}
return {}
class StructlogOTELProcessor:
"""
Structlog processor that adds OpenTelemetry trace context to logs.
This automatically adds trace_id and span_id to all log records,
enabling correlation between logs and traces in SigNoz.
Usage:
import structlog
from shared.monitoring.logs_exporter import StructlogOTELProcessor
structlog.configure(
processors=[
StructlogOTELProcessor(),
# ... other processors
]
)
"""
def __call__(self, logger, method_name, event_dict):
"""Add trace context to log event"""
trace_ctx = get_current_trace_context()
if trace_ctx:
event_dict.update(trace_ctx)
return event_dict
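
A fuller pipeline sketch combining the pieces above; the stock structlog processors are a typical-configuration assumption, and only setup_otel_logging and StructlogOTELProcessor come from this module:

import structlog
from shared.monitoring.logs_exporter import StructlogOTELProcessor, setup_otel_logging

# Bridge standard logging records to the OTLP exporter
setup_otel_logging("auth-service", "1.0.0")

structlog.configure(
    processors=[
        StructlogOTELProcessor(),                     # inject trace_id / span_id
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)

structlog.get_logger().info("order_created", order_id="ord_123")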

shared/monitoring/metrics.py Executable file

@@ -0,0 +1,400 @@
"""
OpenTelemetry Metrics Collection for Microservices
Replaces Prometheus with native OpenTelemetry metrics export to SigNoz
"""
import time
from threading import Lock
from typing import Dict, Any, Optional

import structlog
from fastapi import Request
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
logger = structlog.get_logger()
# Global registry for metrics collectors
_metrics_registry: Dict[str, 'MetricsCollector'] = {}
_registry_lock = Lock()
class MetricsCollector:
"""
OpenTelemetry-based metrics collector for microservices.
Exports metrics directly to SigNoz via OTLP (no Prometheus).
"""
def __init__(
self,
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
):
self.service_name = service_name
self.service_version = service_version
self.start_time = time.time()
# Use provided meter provider or get global
if meter_provider:
self.meter = meter_provider.get_meter(__name__)
else:
self.meter = metrics.get_meter(__name__)
        # Store created instruments and the last value seen per gauge key
        self._counters: Dict[str, Any] = {}
        self._histograms: Dict[str, Any] = {}
        self._up_down_counters: Dict[str, Any] = {}
        self._gauge_values: Dict[str, float] = {}
        self._lock = Lock()
# Register in global registry
with _registry_lock:
_metrics_registry[service_name] = self
# Create default HTTP metrics
self._setup_default_metrics()
logger.info(
"OpenTelemetry metrics collector initialized",
service=service_name
)
def _setup_default_metrics(self):
"""Setup default HTTP metrics"""
self._counters["http_requests_total"] = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_http_requests_total",
description="Total HTTP requests",
unit="requests"
)
self._histograms["http_request_duration"] = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_http_request_duration_seconds",
description="HTTP request duration in seconds",
unit="s"
)
self._up_down_counters["active_requests"] = self.meter.create_up_down_counter(
name=f"{self.service_name.replace('-', '_')}_active_requests",
description="Number of active HTTP requests",
unit="requests"
)
def register_counter(self, name: str, documentation: str, labels: list = None) -> Any:
"""Register a custom Counter metric"""
with self._lock:
if name in self._counters:
logger.warning(f"Counter '{name}' already registered for {self.service_name}")
return self._counters[name]
try:
counter = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._counters[name] = counter
logger.info(f"Registered counter: {name} for {self.service_name}")
return counter
except Exception as e:
logger.error(f"Failed to register counter {name} for {self.service_name}: {e}")
raise
def register_histogram(
self,
name: str,
documentation: str,
labels: list = None,
buckets: tuple = None
) -> Any:
"""Register a custom Histogram metric"""
with self._lock:
if name in self._histograms:
logger.warning(f"Histogram '{name}' already registered for {self.service_name}")
return self._histograms[name]
try:
histogram = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._histograms[name] = histogram
logger.info(f"Registered histogram: {name} for {self.service_name}")
return histogram
except Exception as e:
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
raise
def register_gauge(self, name: str, documentation: str, labels: list = None) -> Any:
"""Register a custom Gauge metric (using UpDownCounter)"""
with self._lock:
if name in self._up_down_counters:
logger.warning(f"Gauge '{name}' already registered for {self.service_name}")
return self._up_down_counters[name]
try:
gauge = self.meter.create_up_down_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._up_down_counters[name] = gauge
logger.info(f"Registered gauge: {name} for {self.service_name}")
return gauge
except Exception as e:
logger.error(f"Failed to register gauge {name} for {self.service_name}: {e}")
raise
def increment_counter(self, name: str, value: int = 1, labels: Dict[str, str] = None):
"""Increment a counter metric"""
if name not in self._counters:
logger.error(f"Counter '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._counters[name].add(value, labels)
except Exception as e:
logger.error(f"Failed to increment counter {name} for {self.service_name}: {e}")
def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
"""Observe a histogram metric"""
if name not in self._histograms:
logger.error(f"Histogram '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._histograms[name].record(value, labels)
except Exception as e:
logger.error(f"Failed to observe histogram {name} for {self.service_name}: {e}")
def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
"""Set a gauge metric (using add for UpDownCounter)"""
if name not in self._up_down_counters:
logger.error(f"Gauge '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
        try:
            # UpDownCounters are cumulative, so emit the delta between the new
            # value and the last value recorded for this label combination
            key = f"{name}_{str(sorted(labels.items()))}"
            old_value = self._gauge_values.get(key, 0)
            delta = value - old_value
            self._gauge_values[key] = value
            self._up_down_counters[name].add(delta, labels)
except Exception as e:
logger.error(f"Failed to set gauge {name} for {self.service_name}: {e}")
def record_request(self, method: str, endpoint: str, status_code: int, duration: float):
"""Record HTTP request metrics"""
try:
attributes = {
"service": self.service_name,
"http.method": method,
"http.route": endpoint,
"http.status_code": str(status_code)
}
self._counters["http_requests_total"].add(1, attributes)
self._histograms["http_request_duration"].record(duration, attributes)
except Exception as e:
logger.error(f"Failed to record request metrics for {self.service_name}: {e}")
def increment_active_requests(self):
"""Increment active request counter"""
try:
self._up_down_counters["active_requests"].add(1, {"service": self.service_name})
except Exception as e:
logger.error(f"Failed to increment active requests: {e}")
def decrement_active_requests(self):
"""Decrement active request counter"""
try:
self._up_down_counters["active_requests"].add(-1, {"service": self.service_name})
except Exception as e:
logger.error(f"Failed to decrement active requests: {e}")
def set_active_connections(self, count: int):
"""Set active database connections"""
self.set_gauge("active_connections", count)
def get_metrics_collector(service_name: str) -> Optional[MetricsCollector]:
"""Get metrics collector by service name from global registry"""
with _registry_lock:
return _metrics_registry.get(service_name)
def create_metrics_collector(
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
"""
Create metrics collector.
This should be called BEFORE app startup, not during lifespan.
"""
# Get existing or create new
existing = get_metrics_collector(service_name)
if existing:
return existing
return MetricsCollector(service_name, service_version, meter_provider)
def add_metrics_middleware(app, metrics_collector: MetricsCollector):
"""
Add metrics middleware to app. Must be called BEFORE app startup.
"""
@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
# Increment active requests
metrics_collector.increment_active_requests()
start_time = time.time()
try:
response = await call_next(request)
duration = time.time() - start_time
# Record request metrics
metrics_collector.record_request(
method=request.method,
endpoint=request.url.path,
status_code=response.status_code,
duration=duration
)
# Decrement active requests
metrics_collector.decrement_active_requests()
return response
except Exception as e:
duration = time.time() - start_time
# Record failed request
metrics_collector.record_request(
method=request.method,
endpoint=request.url.path,
status_code=500,
duration=duration
)
# Decrement active requests
metrics_collector.decrement_active_requests()
raise
return metrics_collector
def track_user_activity(user_id: str, action: str, service_name: str = "unknown-service", metadata: dict = None):
"""Track user activity metrics using the appropriate metrics collector"""
if metadata is None:
metadata = {}
# Add user-specific attributes
attributes = {
"user.id": user_id,
"action": action,
**metadata
}
# Get the metrics collector for the specified service
metrics_collector = get_metrics_collector(service_name)
if metrics_collector:
# Use the collector's counter registration system
counter_name = "user_activity_total"
# Check if counter already exists, if not register it
if counter_name not in metrics_collector._counters:
metrics_collector.register_counter(
name=counter_name,
documentation="Total user activity events"
)
# Increment the counter with attributes
metrics_collector.increment_counter(counter_name, value=1, labels=attributes)
else:
# Fallback: create a temporary counter if no collector exists
from opentelemetry import metrics
meter = metrics.get_meter(__name__)
user_activity_counter = meter.create_counter(
name="user_activity_total",
description="User activity events",
unit="events"
)
user_activity_counter.add(1, attributes)
def setup_metrics_early(
app,
service_name: str = None,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
"""
Setup metrics collection BEFORE app startup.
This must be called before adding any middleware or starting the app.
Note: No Prometheus endpoint is created - all metrics go to SigNoz via OTLP
"""
if service_name is None:
service_name = getattr(app, 'title', 'unknown-service').lower().replace(' ', '-').replace('.', '_')
# Create metrics collector
metrics_collector = create_metrics_collector(service_name, service_version, meter_provider)
# Add middleware (must be before app starts)
add_metrics_middleware(app, metrics_collector)
# Store in app state for access from routes
app.state.metrics_collector = metrics_collector
logger.info(f"OpenTelemetry metrics setup completed for service: {service_name}")
return metrics_collector
# Helper function for endpoint tracking (kept for backward compatibility)
def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
"""Decorator for tracking endpoint metrics - metrics handled by middleware"""
def decorator(func):
import asyncio
from functools import wraps
@wraps(func)
async def async_wrapper(*args, **kwargs):
return await func(*args, **kwargs)
@wraps(func)
def sync_wrapper(*args, **kwargs):
return func(*args, **kwargs)
# Return appropriate wrapper based on function type
if asyncio.iscoroutinefunction(func):
return async_wrapper
else:
return sync_wrapper
return decorator
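
A wiring sketch for the collector above; the app, service, and metric names are illustrative:

from fastapi import FastAPI
from shared.monitoring.metrics import setup_metrics_early

app = FastAPI(title="Orders Service")

# Must run before the app starts so the middleware is registered in time
collector = setup_metrics_early(app, service_name="orders-service")
collector.register_counter("orders_created_total", "Total orders created")

@app.post("/orders")
async def create_order():
    collector.increment_counter("orders_created_total", labels={"channel": "api"})
    return {"status": "created"}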

shared/monitoring/metrics_exporter.py

@@ -0,0 +1,304 @@
"""
OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
"""
import os
import structlog
from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource
# Import both gRPC and HTTP exporters
try:
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
GRPC_AVAILABLE = True
except ImportError:
GRPC_AVAILABLE = False
GrpcMetricExporter = None
try:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
HTTP_AVAILABLE = True
except ImportError:
HTTP_AVAILABLE = False
HttpMetricExporter = None
from .otel_config import OTelConfig
logger = structlog.get_logger()
def setup_otel_metrics(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
export_interval_millis: int = 60000, # Export every 60 seconds
protocol: Optional[str] = None # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
"""
Setup OpenTelemetry metrics to export to SigNoz.
Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
Default protocol is gRPC for better performance.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: Optional override for OTLP endpoint
export_interval_millis: How often to push metrics in milliseconds (default 60s)
protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"
Returns:
MeterProvider instance if successful, None otherwise
Example:
from shared.monitoring.metrics_exporter import setup_otel_metrics
# Setup with gRPC (default)
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
# Or with HTTP
meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")
# Create meters for your metrics
meter = meter_provider.get_meter(__name__)
request_counter = meter.create_counter(
"http.server.requests",
description="Total HTTP requests",
unit="1"
)
# Record metrics
request_counter.add(1, {"method": "GET", "status": "200"})
"""
# Check if metrics export is enabled
if not OTelConfig.is_enabled("metrics"):
logger.info(
"OpenTelemetry metrics export disabled",
service=service_name,
reason="ENABLE_OTEL_METRICS not set to 'true'"
)
return None
# Determine protocol to use
if protocol is None:
protocol = OTelConfig.get_protocol("metrics")
# Validate protocol is available
if protocol == "grpc" and not GRPC_AVAILABLE:
logger.warning(
"gRPC exporter not available, falling back to HTTP",
service=service_name
)
protocol = "http"
elif protocol == "http" and not HTTP_AVAILABLE:
logger.warning(
"HTTP exporter not available, falling back to gRPC",
service=service_name
)
protocol = "grpc"
if protocol not in ["grpc", "http"]:
logger.error(
"Invalid protocol specified",
service=service_name,
protocol=protocol
)
return None
try:
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Determine which endpoint to use
if otel_endpoint:
# User provided override
if protocol == "grpc":
endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
else:
# Use config-determined endpoint
if protocol == "grpc":
endpoint = endpoints.metrics_grpc
else:
endpoint = endpoints.metrics_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure OTLP exporter based on protocol
if protocol == "grpc":
otlp_exporter = GrpcMetricExporter(
endpoint=endpoint,
insecure=True, # Use secure=False in production with proper TLS
timeout=10
)
else: # http
otlp_exporter = HttpMetricExporter(
endpoint=endpoint,
timeout=10
)
# Create periodic metric reader
metric_reader = PeriodicExportingMetricReader(
exporter=otlp_exporter,
export_interval_millis=export_interval_millis
)
# Configure meter provider
meter_provider = MeterProvider(
resource=resource,
metric_readers=[metric_reader]
)
# Set global meter provider
metrics.set_meter_provider(meter_provider)
logger.info(
"OpenTelemetry metrics export configured successfully",
service=service_name,
endpoint=endpoint,
protocol=protocol,
export_interval_seconds=export_interval_millis / 1000
)
return meter_provider
except Exception as e:
logger.error(
"Failed to setup OpenTelemetry metrics export",
service=service_name,
error=str(e),
protocol=protocol
)
return None
class OTelMetricsCollector:
    """
    Wrapper for OpenTelemetry metrics that mirrors the interface of the
    registry-based MetricsCollector in shared.monitoring.metrics, so
    services can emit the same metric names through either collector.
    """
def __init__(self, service_name: str, meter_provider: MeterProvider):
self.service_name = service_name
self.meter_provider = meter_provider
self.meter = meter_provider.get_meter(__name__)
# Store created instruments
self._counters = {}
self._histograms = {}
self._gauges = {}
def create_counter(self, name: str, description: str = "", unit: str = "1"):
"""Create or get an OpenTelemetry Counter"""
if name not in self._counters:
self._counters[name] = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=description,
unit=unit
)
return self._counters[name]
def create_histogram(self, name: str, description: str = "", unit: str = "1"):
"""Create or get an OpenTelemetry Histogram"""
if name not in self._histograms:
self._histograms[name] = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=description,
unit=unit
)
return self._histograms[name]
    def create_gauge(self, name: str, description: str = "", unit: str = "1"):
        """
        Create or get an OpenTelemetry observable gauge.
        Note: Gauges in OTEL require a callback; the one registered here
        reports the most recent value stored via set_gauge().
        """
        if name not in self._gauges:
            state = {"value": 0, "attributes": {"service": self.service_name}}

            def _observe(options):
                # Called by the SDK at each export interval
                from opentelemetry.metrics import Observation
                return [Observation(state["value"], state["attributes"])]

            self.meter.create_observable_gauge(
                name=f"{self.service_name.replace('-', '_')}_{name}",
                callbacks=[_observe],
                description=description,
                unit=unit,
            )
            self._gauges[name] = state
        return self._gauges[name]
def increment_counter(self, name: str, value: int = 1, attributes: dict = None):
"""Increment a counter with optional attributes"""
if name in self._counters:
if attributes is None:
attributes = {"service": self.service_name}
elif "service" not in attributes:
attributes["service"] = self.service_name
self._counters[name].add(value, attributes)
def observe_histogram(self, name: str, value: float, attributes: dict = None):
"""Record a histogram observation with optional attributes"""
if name in self._histograms:
if attributes is None:
attributes = {"service": self.service_name}
elif "service" not in attributes:
attributes["service"] = self.service_name
self._histograms[name].record(value, attributes)
def set_gauge(self, name: str, value: float, attributes: dict = None):
"""Set a gauge value (stores for next callback)"""
if name in self._gauges:
if attributes is None:
attributes = {"service": self.service_name}
elif "service" not in attributes:
attributes["service"] = self.service_name
self._gauges[name]["value"] = value
self._gauges[name]["attributes"] = attributes
def create_dual_metrics_collector(service_name: str, service_version: str = "1.0.0"):
    """
    Create two metrics collectors that both export to SigNoz via OTLP.
    This function sets up both collection strategies:
    1. The registry-based MetricsCollector from shared.monitoring.metrics
       (Prometheus-style interface, exported over OTLP)
    2. OTelMetricsCollector, a thin wrapper over OpenTelemetry instruments
    Returns a tuple: (prom_collector, otel_collector)
    Both collectors can be used independently or together.
Example:
from shared.monitoring.metrics_exporter import create_dual_metrics_collector
prom_collector, otel_collector = create_dual_metrics_collector("auth-service")
        # Registry-based counter (Prometheus-style interface)
prom_collector.register_counter("requests_total", "Total requests")
prom_collector.increment_counter("requests_total", labels={"status": "200"})
# OpenTelemetry counter (pushed to SigNoz)
counter = otel_collector.create_counter("requests_total", "Total requests")
counter.add(1, {"status": "200"})
"""
    from shared.monitoring.metrics import MetricsCollector
    # Create the registry-based collector (OTLP-backed despite the prom_ name)
    prom_collector = MetricsCollector(service_name)
# Create OpenTelemetry collector
meter_provider = setup_otel_metrics(service_name, service_version)
otel_collector = None
if meter_provider:
otel_collector = OTelMetricsCollector(service_name, meter_provider)
return prom_collector, otel_collector
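
A short gauge sketch, assuming metrics export is enabled so setup_otel_metrics returns a provider; the "queue_depth" metric is illustrative:

from shared.monitoring.metrics_exporter import setup_otel_metrics, OTelMetricsCollector

meter_provider = setup_otel_metrics("auth-service", "1.0.0")
if meter_provider:
    otel = OTelMetricsCollector("auth-service", meter_provider)
    otel.create_gauge("queue_depth", "Current queue depth")
    otel.set_gauge("queue_depth", 42)  # reported at the next export interval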

shared/monitoring/otel_config.py

@@ -0,0 +1,293 @@
"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""
import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog
logger = structlog.get_logger()
@dataclass
class OTelEndpoints:
"""
Container for OpenTelemetry endpoints.
SigNoz uses different protocols for different signals:
- Traces: gRPC (port 4317)
- Metrics: gRPC (port 4317) or HTTP (port 4318)
- Logs: HTTP (port 4318)
"""
traces_grpc: str # gRPC endpoint for traces (e.g., "host:4317")
metrics_grpc: str # gRPC endpoint for metrics (e.g., "host:4317")
metrics_http: str # HTTP endpoint for metrics (e.g., "http://host:4318/v1/metrics")
logs_http: str # HTTP endpoint for logs (e.g., "http://host:4318/v1/logs")
class OTelConfig:
"""
Centralized configuration for OpenTelemetry exporters.
This class manages endpoint URLs and ensures proper protocol usage:
- gRPC endpoints: host:port (no protocol prefix)
- HTTP endpoints: http://host:port/path (with protocol and path)
"""
# Default base endpoint (can be overridden by environment variables)
DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
DEFAULT_GRPC_PORT = 4317
DEFAULT_HTTP_PORT = 4318
@classmethod
def get_endpoints(cls) -> OTelEndpoints:
"""
Get OpenTelemetry endpoints from environment variables with proper fallbacks.
Environment variables (in order of precedence):
1. OTEL_EXPORTER_OTLP_ENDPOINT - Base endpoint (gRPC format: host:port)
2. OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - Specific traces endpoint
3. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - Specific metrics endpoint
4. OTEL_EXPORTER_OTLP_LOGS_ENDPOINT - Specific logs endpoint
5. OTEL_COLLECTOR_ENDPOINT - Legacy variable (HTTP format)
Returns:
OTelEndpoints with all configured endpoints
"""
# Get base endpoint from environment
base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
if base_endpoint:
# Clean and parse base endpoint
base_grpc = cls._clean_grpc_endpoint(base_endpoint)
base_http_host = cls._extract_host(base_endpoint)
# Validate that the endpoint doesn't contain secret references or malformed data
if cls._contains_secret_reference(base_grpc):
logger.error("OTEL endpoint contains secret reference, falling back to default",
malformed_endpoint=base_endpoint)
base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"
else:
# Use default collector
base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"
# Get signal-specific endpoints (or use base endpoint)
traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")
# Validate and clean signal-specific endpoints
traces_grpc = cls._clean_and_validate_grpc_endpoint(traces_endpoint)
metrics_grpc = cls._clean_and_validate_grpc_endpoint(metrics_endpoint)
# For metrics HTTP, convert gRPC endpoint to HTTP if needed
metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")
# For logs, use HTTP endpoint
if logs_endpoint:
logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
else:
logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")
endpoints = OTelEndpoints(
traces_grpc=traces_grpc,
metrics_grpc=metrics_grpc,
metrics_http=metrics_http,
logs_http=logs_http
)
logger.info(
"OpenTelemetry endpoints configured",
traces_grpc=endpoints.traces_grpc,
metrics_grpc=endpoints.metrics_grpc,
metrics_http=endpoints.metrics_http,
logs_http=endpoints.logs_http
)
return endpoints
@staticmethod
def _clean_grpc_endpoint(endpoint: str) -> str:
"""
Clean endpoint for gRPC usage (remove protocol, paths).
Args:
endpoint: Raw endpoint string
Returns:
Cleaned endpoint in format "host:port"
"""
# Remove protocol prefixes
endpoint = endpoint.replace("http://", "").replace("https://", "")
# Remove paths (gRPC doesn't use paths)
if "/" in endpoint:
endpoint = endpoint.split("/")[0]
# Ensure it has a port
if ":" not in endpoint:
endpoint = f"{endpoint}:4317"
return endpoint
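    # NOTE: _clean_and_validate_grpc_endpoint() and _contains_secret_reference()
    # are called in get_endpoints() but are not shown in this excerpt.
    # Minimal sketches of the assumed behavior (hypothetical, for reference):
    @classmethod
    def _clean_and_validate_grpc_endpoint(cls, endpoint: str) -> str:
        """Clean a gRPC endpoint, falling back to the default on bad input."""
        cleaned = cls._clean_grpc_endpoint(endpoint)
        if cls._contains_secret_reference(cleaned):
            return f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
        return cleaned
    @staticmethod
    def _contains_secret_reference(endpoint: str) -> bool:
        """Detect unresolved secret/template placeholders (assumed markers)."""
        return any(marker in endpoint for marker in ("${", "$(", "{{"))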
@staticmethod
def _extract_host(endpoint: str) -> str:
"""
Extract host and convert to HTTP endpoint.
Args:
endpoint: Raw endpoint string
Returns:
HTTP endpoint without path (e.g., "http://host:4318")
"""
# Remove protocol if present
clean = endpoint.replace("http://", "").replace("https://", "")
# Remove path if present
if "/" in clean:
clean = clean.split("/")[0]
# Extract host without port
if ":" in clean:
host = clean.split(":")[0]
else:
host = clean
return f"http://{host}:4318"
@staticmethod
def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
"""
Convert gRPC endpoint to HTTP endpoint with path.
Args:
grpc_endpoint: gRPC endpoint (e.g., "host:4317")
path: HTTP path (e.g., "/v1/metrics")
Returns:
HTTP endpoint (e.g., "http://host:4318/v1/metrics")
"""
# Extract host from gRPC endpoint
if ":" in grpc_endpoint:
host = grpc_endpoint.split(":")[0]
else:
host = grpc_endpoint
# Build HTTP endpoint with port 4318
return f"http://{host}:4318{path}"
    @staticmethod
    def _ensure_http_endpoint(endpoint: str, path: str) -> str:
        """
        Ensure endpoint is in HTTP format with the correct port and path.
        Args:
            endpoint: Raw endpoint string
            path: Required path (e.g., "/v1/logs")
        Returns:
            HTTP endpoint with protocol, port 4318, and path
        """
        # Add protocol if missing
        if not endpoint.startswith(("http://", "https://")):
            endpoint = f"http://{endpoint}"
        # Normalize the port on the host portion only, so the colon in
        # the protocol prefix is never mistaken for a port separator
        protocol, _, rest = endpoint.partition("://")
        host = rest.split("/")[0].split(":")[0]
        # OTLP/HTTP uses port 4318 in this setup; drop any path the
        # input carried and append the signal-specific one
        return f"{protocol}://{host}:4318{path}"
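    # Worked examples for the normalizers above (illustrative):
    #   _clean_grpc_endpoint("http://collector/v1/traces")      -> "collector:4317"
    #   _grpc_to_http_endpoint("collector:4317", "/v1/metrics") -> "http://collector:4318/v1/metrics"
    #   _ensure_http_endpoint("collector:4317", "/v1/logs")     -> "http://collector:4318/v1/logs"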
@classmethod
def get_resource_attributes(
cls,
service_name: str,
service_version: str = "1.0.0"
) -> dict:
"""
Get common resource attributes for all OTEL signals.
Args:
service_name: Name of the service
service_version: Version of the service
Returns:
Dictionary of resource attributes
"""
return {
"service.name": service_name,
"service.version": service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
"k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
}
@classmethod
def is_enabled(cls, signal: str) -> bool:
"""
Check if a specific telemetry signal is enabled.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
True if signal is enabled, False otherwise
"""
signal = signal.lower()
if signal == "traces":
return os.getenv("ENABLE_TRACING", "true").lower() == "true"
elif signal == "metrics":
return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
elif signal == "logs":
return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
else:
return False
@classmethod
def get_protocol(cls, signal: str) -> str:
"""
Get the preferred protocol for a signal.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
Protocol name ("grpc" or "http")
"""
protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")
# Signal-specific overrides
if signal == "traces":
return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
elif signal == "metrics":
return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
elif signal == "logs":
# Logs always use HTTP in our setup
return "http"
return protocol
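# Minimal runnable sketch (assumed usage): print the resolved endpoint
# configuration, e.g. when debugging OTEL wiring inside a pod.
if __name__ == "__main__":
    eps = OTelConfig.get_endpoints()
    print("traces :", OTelConfig.is_enabled("traces"), eps.traces_grpc)
    print("metrics:", OTelConfig.is_enabled("metrics"), eps.metrics_grpc)
    print("logs   :", OTelConfig.is_enabled("logs"), eps.logs_http)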

433
shared/monitoring/system_metrics.py Executable file
View File

@@ -0,0 +1,433 @@
"""
System Metrics Collection for SigNoz
Collects CPU, memory, disk, and process metrics via OpenTelemetry
"""
import os
import psutil
import structlog
from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
logger = structlog.get_logger()
class SystemMetricsCollector:
"""
Collects system-level metrics (CPU, memory, disk, network, process info)
and exports them to SigNoz via OpenTelemetry.
These metrics help monitor service health and resource utilization.
"""
def __init__(
self,
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
):
self.service_name = service_name
self.service_version = service_version
self.process = psutil.Process()
# Use provided meter provider or get global
if meter_provider:
self.meter = meter_provider.get_meter(__name__)
else:
self.meter = metrics.get_meter(__name__)
# Initialize metric instruments
self._setup_metrics()
logger.info(
"System metrics collector initialized",
service=service_name,
pid=os.getpid()
)
def _setup_metrics(self):
"""Setup all system metric instruments"""
# Process CPU metrics
self.process_cpu_percent = self.meter.create_observable_gauge(
name="process.cpu.utilization",
description="Process CPU utilization percentage",
unit="percent",
callbacks=[self._observe_process_cpu]
)
# Process memory metrics
self.process_memory_usage = self.meter.create_observable_gauge(
name="process.memory.usage",
description="Process memory usage in bytes",
unit="bytes",
callbacks=[self._observe_process_memory]
)
self.process_memory_percent = self.meter.create_observable_gauge(
name="process.memory.utilization",
description="Process memory utilization percentage",
unit="percent",
callbacks=[self._observe_process_memory_percent]
)
# Process thread count
self.process_threads = self.meter.create_observable_gauge(
name="process.threads.count",
description="Number of threads in the process",
unit="threads",
callbacks=[self._observe_process_threads]
)
# Process file descriptors (Unix only)
if hasattr(self.process, 'num_fds'):
self.process_fds = self.meter.create_observable_gauge(
name="process.open_file_descriptors",
description="Number of open file descriptors",
unit="fds",
callbacks=[self._observe_process_fds]
)
# System-wide CPU metrics
self.system_cpu_percent = self.meter.create_observable_gauge(
name="system.cpu.utilization",
description="System-wide CPU utilization percentage",
unit="percent",
callbacks=[self._observe_system_cpu]
)
# System-wide memory metrics
self.system_memory_usage = self.meter.create_observable_gauge(
name="system.memory.usage",
description="System memory usage in bytes",
unit="bytes",
callbacks=[self._observe_system_memory]
)
self.system_memory_percent = self.meter.create_observable_gauge(
name="system.memory.utilization",
description="System memory utilization percentage",
unit="percent",
callbacks=[self._observe_system_memory_percent]
)
# Disk I/O metrics
self.disk_io_read = self.meter.create_observable_counter(
name="system.disk.io.read",
description="Disk bytes read",
unit="bytes",
callbacks=[self._observe_disk_io_read]
)
self.disk_io_write = self.meter.create_observable_counter(
name="system.disk.io.write",
description="Disk bytes written",
unit="bytes",
callbacks=[self._observe_disk_io_write]
)
# Network I/O metrics
self.network_io_sent = self.meter.create_observable_counter(
name="system.network.io.sent",
description="Network bytes sent",
unit="bytes",
callbacks=[self._observe_network_io_sent]
)
self.network_io_recv = self.meter.create_observable_counter(
name="system.network.io.received",
description="Network bytes received",
unit="bytes",
callbacks=[self._observe_network_io_recv]
)
# Callback methods for observable instruments
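    # These generator callbacks are pulled by the metric reader at each export
    # interval; yielding Observation(value, attributes) points is the standard
    # OpenTelemetry pattern for asynchronous instruments.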
def _observe_process_cpu(self, options):
"""Observe process CPU usage"""
try:
cpu_percent = self.process.cpu_percent(interval=None)
yield metrics.Observation(
cpu_percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process CPU metrics: {e}")
def _observe_process_memory(self, options):
"""Observe process memory usage"""
try:
mem_info = self.process.memory_info()
yield metrics.Observation(
mem_info.rss, # Resident Set Size
{"service": self.service_name, "type": "rss"}
)
yield metrics.Observation(
mem_info.vms, # Virtual Memory Size
{"service": self.service_name, "type": "vms"}
)
except Exception as e:
logger.warning(f"Failed to collect process memory metrics: {e}")
def _observe_process_memory_percent(self, options):
"""Observe process memory percentage"""
try:
mem_percent = self.process.memory_percent()
yield metrics.Observation(
mem_percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process memory percent: {e}")
def _observe_process_threads(self, options):
"""Observe process thread count"""
try:
num_threads = self.process.num_threads()
yield metrics.Observation(
num_threads,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process thread count: {e}")
def _observe_process_fds(self, options):
"""Observe process file descriptors (Unix only)"""
try:
num_fds = self.process.num_fds()
yield metrics.Observation(
num_fds,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process FDs: {e}")
def _observe_system_cpu(self, options):
"""Observe system-wide CPU usage"""
try:
cpu_percent = psutil.cpu_percent(interval=None)
yield metrics.Observation(
cpu_percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect system CPU metrics: {e}")
def _observe_system_memory(self, options):
"""Observe system memory usage"""
try:
mem = psutil.virtual_memory()
yield metrics.Observation(
mem.used,
{"service": self.service_name, "type": "used"}
)
yield metrics.Observation(
mem.available,
{"service": self.service_name, "type": "available"}
)
yield metrics.Observation(
mem.total,
{"service": self.service_name, "type": "total"}
)
except Exception as e:
logger.warning(f"Failed to collect system memory metrics: {e}")
def _observe_system_memory_percent(self, options):
"""Observe system memory percentage"""
try:
mem = psutil.virtual_memory()
yield metrics.Observation(
mem.percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect system memory percent: {e}")
def _observe_disk_io_read(self, options):
"""Observe disk I/O read bytes"""
try:
disk_io = psutil.disk_io_counters()
if disk_io:
yield metrics.Observation(
disk_io.read_bytes,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect disk I/O read metrics: {e}")
def _observe_disk_io_write(self, options):
"""Observe disk I/O write bytes"""
try:
disk_io = psutil.disk_io_counters()
if disk_io:
yield metrics.Observation(
disk_io.write_bytes,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect disk I/O write metrics: {e}")
def _observe_network_io_sent(self, options):
"""Observe network bytes sent"""
try:
net_io = psutil.net_io_counters()
yield metrics.Observation(
net_io.bytes_sent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect network sent metrics: {e}")
def _observe_network_io_recv(self, options):
"""Observe network bytes received"""
try:
net_io = psutil.net_io_counters()
yield metrics.Observation(
net_io.bytes_recv,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect network recv metrics: {e}")
class ApplicationMetricsCollector:
"""
Collects application-level metrics (HTTP requests, database connections, etc.)
using OpenTelemetry metrics API only (no Prometheus).
"""
def __init__(
self,
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
):
self.service_name = service_name
# Use provided meter provider or get global
if meter_provider:
self.meter = meter_provider.get_meter(__name__)
else:
self.meter = metrics.get_meter(__name__)
# HTTP metrics
self.http_requests = self.meter.create_counter(
name="http.server.requests",
description="Total HTTP requests",
unit="requests"
)
self.http_request_duration = self.meter.create_histogram(
name="http.server.request.duration",
description="HTTP request duration",
unit="ms"
)
self.http_active_requests = self.meter.create_up_down_counter(
name="http.server.active_requests",
description="Active HTTP requests",
unit="requests"
)
# Database metrics
self.db_connections = self.meter.create_up_down_counter(
name="db.client.connections.usage",
description="Database connections in use",
unit="connections"
)
self.db_query_duration = self.meter.create_histogram(
name="db.client.operation.duration",
description="Database query duration",
unit="ms"
)
logger.info(
"Application metrics collector initialized",
service=service_name
)
def record_http_request(
self,
method: str,
endpoint: str,
status_code: int,
duration_ms: float
):
"""Record an HTTP request"""
attributes = {
"service": self.service_name,
"http.method": method,
"http.route": endpoint,
"http.status_code": status_code
}
self.http_requests.add(1, attributes)
self.http_request_duration.record(duration_ms, attributes)
def increment_active_requests(self):
"""Increment active request count"""
self.http_active_requests.add(1, {"service": self.service_name})
def decrement_active_requests(self):
"""Decrement active request count"""
self.http_active_requests.add(-1, {"service": self.service_name})
def set_db_connections(self, count: int, state: str = "used"):
"""Set database connection count"""
self.db_connections.add(
count,
{"service": self.service_name, "state": state}
)
def record_db_query(self, operation: str, duration_ms: float, table: str = ""):
"""Record a database query"""
attributes = {
"service": self.service_name,
"db.operation": operation
}
if table:
attributes["db.table"] = table
self.db_query_duration.record(duration_ms, attributes)
def setup_all_metrics(
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
"""
Setup both system and application metrics collection.
Args:
service_name: Name of the service
service_version: Version of the service
meter_provider: Optional meter provider (will use global if not provided)
Returns:
Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)
Example:
from shared.monitoring.system_metrics import setup_all_metrics
system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")
# Metrics are automatically collected
# Use app_metrics to record custom application events:
app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
"""
system_metrics = SystemMetricsCollector(service_name, service_version, meter_provider)
app_metrics = ApplicationMetricsCollector(service_name, service_version, meter_provider)
logger.info(
"All metrics collectors initialized",
service=service_name,
collectors=["system", "application"]
)
return system_metrics, app_metrics
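# ----------------------------------------------------------------
# Illustrative wiring sketch (hypothetical app and variable names;
# record_http_request/increment/decrement are the real methods above):
#
#     import time
#     from fastapi import FastAPI, Request
#
#     app = FastAPI()
#     system_metrics, app_metrics = setup_all_metrics("auth-service")
#
#     @app.middleware("http")
#     async def track_requests(request: Request, call_next):
#         app_metrics.increment_active_requests()
#         start = time.time()
#         try:
#             response = await call_next(request)
#             app_metrics.record_http_request(
#                 request.method, request.url.path,
#                 response.status_code, (time.time() - start) * 1000
#             )
#             return response
#         finally:
#             app_metrics.decrement_active_requests()
# ----------------------------------------------------------------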

271
shared/monitoring/telemetry.py Executable file
View File

@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup
Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation
All signals are exported to SigNoz via OTLP.
"""
import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass
from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
logger = structlog.get_logger()
@dataclass
class TelemetryProviders:
"""
Container for all OpenTelemetry providers and collectors.
Attributes:
tracer_provider: Provider for distributed tracing
meter_provider: Provider for metrics export
logging_handler: Handler for structured logs
system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
app_metrics: Collector for application-level metrics (HTTP, DB)
"""
tracer_provider: Optional[Any] = None
meter_provider: Optional[Any] = None
logging_handler: Optional[Any] = None
system_metrics: Optional[SystemMetricsCollector] = None
app_metrics: Optional[ApplicationMetricsCollector] = None
def setup_telemetry(
app,
service_name: str,
service_version: str = "1.0.0",
enable_traces: bool = True,
enable_metrics: bool = True,
enable_logs: bool = True,
enable_system_metrics: bool = True,
metrics_protocol: Optional[str] = None, # "grpc" or "http", defaults to grpc
export_interval_millis: int = 60000
) -> TelemetryProviders:
"""
Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.
This is the UNIFIED setup function that configures everything:
- Distributed tracing (gRPC, port 4317)
- Metrics export (gRPC by default, port 4317)
- System metrics collection (CPU, memory, disk, network)
- Application metrics (HTTP requests, DB queries)
- Structured logs export (HTTP, port 4318)
All signals use the centralized OTelConfig for endpoint management.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
enable_traces: Enable distributed tracing (default: True)
enable_metrics: Enable metrics export to OTLP (default: True)
enable_logs: Enable logs export to OTLP (default: True)
enable_system_metrics: Enable system metrics collection (default: True, can be disabled via ENABLE_SYSTEM_METRICS env)
metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
export_interval_millis: How often to export metrics in milliseconds
Returns:
TelemetryProviders containing all initialized providers and collectors
Example:
from shared.monitoring.telemetry import setup_telemetry
app = FastAPI(title="Auth Service")
providers = setup_telemetry(
app,
service_name="auth-service",
service_version="1.0.0"
)
# All telemetry is now configured:
# - Traces automatically captured for HTTP requests
# - System metrics automatically collected
# - Application metrics via providers.app_metrics
# - Logs automatically correlated with traces
"""
logger.info(
"Setting up unified OpenTelemetry telemetry",
service=service_name,
version=service_version,
traces=enable_traces,
metrics=enable_metrics,
logs=enable_logs,
system_metrics=enable_system_metrics
)
providers = TelemetryProviders()
# Setup distributed tracing
if enable_traces and OTelConfig.is_enabled("traces"):
try:
providers.tracer_provider = setup_tracing(
app,
service_name=service_name,
service_version=service_version
)
if providers.tracer_provider:
logger.info("✓ Distributed tracing configured", service=service_name)
else:
logger.warning("✗ Distributed tracing setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))
# Setup OTLP metrics export
if enable_metrics and OTelConfig.is_enabled("metrics"):
try:
providers.meter_provider = setup_otel_metrics(
service_name=service_name,
service_version=service_version,
protocol=metrics_protocol,
export_interval_millis=export_interval_millis
)
if providers.meter_provider:
logger.info("✓ OTLP metrics export configured", service=service_name)
# Setup system and application metrics collectors
if enable_system_metrics:
enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_env:
try:
providers.system_metrics, providers.app_metrics = setup_all_metrics(
service_name=service_name,
service_version=service_version,
meter_provider=providers.meter_provider
)
logger.info(
"✓ System and application metrics collectors initialized",
service=service_name,
system_metrics=["cpu", "memory", "disk", "network"],
app_metrics=["http_requests", "db_queries"]
)
except Exception as e:
logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
else:
logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))
# Setup logs export
if enable_logs and OTelConfig.is_enabled("logs"):
try:
providers.logging_handler = setup_otel_logging(
service_name=service_name,
service_version=service_version
)
if providers.logging_handler:
logger.info("✓ Structured logs export configured", service=service_name)
else:
logger.warning("✗ Logs export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))
# Log endpoint configuration summary
try:
endpoints = OTelConfig.get_endpoints()
summary = {
"service": service_name,
"version": service_version,
"traces": {
"enabled": bool(providers.tracer_provider),
"endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
},
"metrics": {
"enabled": bool(providers.meter_provider),
"endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
"system_metrics": bool(providers.system_metrics),
"app_metrics": bool(providers.app_metrics)
},
"logs": {
"enabled": bool(providers.logging_handler),
"endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
}
}
logger.info("🎉 Telemetry setup complete", **summary)
except Exception as e:
logger.warning("Could not log endpoint summary", error=str(e))
return providers
def setup_telemetry_simple(
app,
service_name: str,
service_version: str = "1.0.0"
) -> TelemetryProviders:
"""
Simplified telemetry setup with all defaults.
Uses:
- gRPC for traces (port 4317)
- gRPC for metrics (port 4317)
- HTTP for logs (port 4318)
All settings are read from environment variables and OTelConfig.
Args:
app: FastAPI application instance
service_name: Name of the service
service_version: Version of the service
Returns:
TelemetryProviders containing all initialized providers
Example:
from shared.monitoring.telemetry import setup_telemetry_simple
app = FastAPI(title="Auth Service")
providers = setup_telemetry_simple(app, "auth-service")
"""
return setup_telemetry(
app=app,
service_name=service_name,
service_version=service_version
)
def get_telemetry_status() -> Dict[str, Any]:
"""
Get current telemetry configuration status.
Returns:
Dictionary with telemetry status information
Example:
from shared.monitoring.telemetry import get_telemetry_status
status = get_telemetry_status()
print(f"Tracing enabled: {status['traces']['enabled']}")
"""
endpoints = OTelConfig.get_endpoints()
return {
"traces": {
"enabled": OTelConfig.is_enabled("traces"),
"protocol": "grpc",
"endpoint": endpoints.traces_grpc
},
"metrics": {
"enabled": OTelConfig.is_enabled("metrics"),
"protocol": OTelConfig.get_protocol("metrics"),
"grpc_endpoint": endpoints.metrics_grpc,
"http_endpoint": endpoints.metrics_http
},
"logs": {
"enabled": OTelConfig.is_enabled("logs"),
"protocol": "http",
"endpoint": endpoints.logs_http
}
}
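# Minimal runnable sketch (assumed usage): dump the resolved telemetry
# status, e.g. when debugging endpoint configuration inside a pod.
if __name__ == "__main__":
    import json
    print(json.dumps(get_telemetry_status(), indent=2))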

227
shared/monitoring/tracing.py Executable file
View File

@@ -0,0 +1,227 @@
"""
OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""
import os
import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
# Optional instrumentations (may not be installed in all services)
try:
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
try:
from opentelemetry.instrumentation.redis import RedisInstrumentor
REDIS_AVAILABLE = True
except ImportError:
REDIS_AVAILABLE = False
try:
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
SQLALCHEMY_AVAILABLE = True
except ImportError:
SQLALCHEMY_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
def setup_tracing(
app,
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None
) -> Optional[TracerProvider]:
"""
Setup OpenTelemetry distributed tracing for a FastAPI service.
Automatically instruments:
- FastAPI endpoints
- HTTPX client requests (inter-service calls)
- Redis operations
- PostgreSQL/SQLAlchemy queries
Uses gRPC protocol (port 4317) for sending traces to SigNoz.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
Returns:
TracerProvider instance if successful, None otherwise
Example:
from shared.monitoring.tracing import setup_tracing
app = FastAPI(title="Auth Service")
tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
"""
# Check if tracing is enabled
if not OTelConfig.is_enabled("traces"):
logger.info(
"Distributed tracing disabled",
service=service_name,
reason="ENABLE_TRACING not set to 'true'"
)
return None
try:
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Use provided endpoint or get from config
if otel_endpoint:
# Clean user-provided endpoint for gRPC
grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
grpc_endpoint = endpoints.traces_grpc
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure tracer provider
tracer_provider = TracerProvider(resource=resource)
trace.set_tracer_provider(tracer_provider)
# Configure OTLP gRPC exporter for traces
otlp_exporter = OTLPSpanExporter(
endpoint=grpc_endpoint,
            insecure=True  # Plaintext for in-cluster traffic; set insecure=False and supply TLS credentials in production
)
# Add span processor with batching for performance
span_processor = BatchSpanProcessor(otlp_exporter)
tracer_provider.add_span_processor(span_processor)
# Auto-instrument FastAPI
FastAPIInstrumentor.instrument_app(
app,
tracer_provider=tracer_provider,
excluded_urls="health,metrics" # Don't trace health/metrics endpoints
)
# Auto-instrument HTTPX (inter-service communication) if available
if HTTPX_AVAILABLE:
try:
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("HTTPX instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument HTTPX: {e}")
# Auto-instrument Redis if available
if REDIS_AVAILABLE:
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("Redis instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument SQLAlchemy if available
if SQLALCHEMY_AVAILABLE:
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("SQLAlchemy instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
logger.info(
"Distributed tracing configured successfully",
service=service_name,
grpc_endpoint=grpc_endpoint,
protocol="grpc"
)
return tracer_provider
except Exception as e:
logger.error(
"Failed to setup tracing - continuing without it",
service=service_name,
error=str(e)
)
return None
def get_current_trace_id() -> Optional[str]:
"""
Get the current trace ID for correlation with logs.
Returns:
Trace ID as hex string, or None if no active trace
"""
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
return format(span.get_span_context().trace_id, '032x')
return None
def get_current_span_id() -> Optional[str]:
"""
Get the current span ID.
Returns:
Span ID as hex string, or None if no active span
"""
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
return format(span.get_span_context().span_id, '016x')
return None
def add_trace_attributes(**attributes):
"""
Add custom attributes to the current span.
Example:
add_trace_attributes(
user_id="123",
tenant_id="abc",
operation="user_registration"
)
"""
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
for key, value in attributes.items():
span.set_attribute(key, str(value))
def add_trace_event(name: str, **attributes):
"""
Add an event to the current span (for important operations).
Example:
add_trace_event("user_authenticated", user_id="123", method="jwt")
"""
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
span.add_event(name, attributes)
def record_exception(exception: Exception):
"""
Record an exception in the current span.
Args:
exception: The exception to record
"""
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
span.record_exception(exception)
span.set_status(trace.Status(trace.StatusCode.ERROR, str(exception)))
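# ----------------------------------------------------------------
# Illustrative usage inside a request handler (hypothetical endpoint
# and helpers; only the add_*/record_* functions above are real):
#
#     @app.post("/login")
#     async def login(credentials: LoginRequest):
#         add_trace_attributes(tenant_id=credentials.tenant_id, operation="login")
#         try:
#             user = await authenticate(credentials)
#             add_trace_event("user_authenticated", user_id=str(user.id), method="jwt")
#             return user
#         except AuthError as exc:
#             record_exception(exc)  # marks the current span as ERROR
#             raise
# ----------------------------------------------------------------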