Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup
Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation
All signals are exported to SigNoz via OTLP.
"""
import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass
from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
logger = structlog.get_logger()
@dataclass
class TelemetryProviders:
    """
    Bundle of the OpenTelemetry objects produced by ``setup_telemetry``.

    Every field defaults to ``None``; a field is only populated when the
    corresponding signal was requested and its setup call returned a value.

    Attributes:
        tracer_provider: Provider for distributed tracing
        meter_provider: Provider for metrics export
        logging_handler: Handler for structured logs
        system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
        app_metrics: Collector for application-level metrics (HTTP, DB)
    """

    # Tracing signal
    tracer_provider: Optional[Any] = None

    # Metrics signal
    meter_provider: Optional[Any] = None

    # Logs signal
    logging_handler: Optional[Any] = None

    # Collectors that are attached to ``meter_provider`` by ``setup_telemetry``
    system_metrics: Optional[SystemMetricsCollector] = None
    app_metrics: Optional[ApplicationMetricsCollector] = None
# Values of a boolean environment flag (compared case-insensitively, after
# trimming whitespace) that are treated as "enabled".
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_flag(name: str, default: bool = True) -> bool:
    """Read a boolean flag from the environment; unset means ``default``."""
    raw = os.getenv(name)
    if raw is None:
        return default
    # Trim before comparing so values such as "true " or "True\n" coming from
    # .env files / config maps do not silently disable the feature.
    return raw.strip().lower() in _TRUTHY_ENV_VALUES


def _setup_traces(app, service_name: str, service_version: str) -> Optional[Any]:
    """Configure distributed tracing and return whatever ``setup_tracing`` yields (None on error)."""
    try:
        tracer_provider = setup_tracing(
            app,
            service_name=service_name,
            service_version=service_version,
        )
    except Exception as e:
        # Best effort: a broken tracing backend must not prevent service start-up.
        logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))
        return None

    if tracer_provider:
        logger.info("✓ Distributed tracing configured", service=service_name)
    else:
        logger.warning("✗ Distributed tracing setup returned None", service=service_name)
    return tracer_provider


def _setup_metrics(
    service_name: str,
    service_version: str,
    metrics_protocol: Optional[str],
    export_interval_millis: int,
) -> Optional[Any]:
    """Configure OTLP metrics export and return the meter provider (None on error)."""
    try:
        meter_provider = setup_otel_metrics(
            service_name=service_name,
            service_version=service_version,
            protocol=metrics_protocol,
            export_interval_millis=export_interval_millis,
        )
    except Exception as e:
        logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))
        return None

    if meter_provider:
        logger.info("✓ OTLP metrics export configured", service=service_name)
    else:
        logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
    return meter_provider


def _setup_metric_collectors(
    service_name: str,
    service_version: str,
    meter_provider: Any,
) -> Tuple[Optional[Any], Optional[Any]]:
    """Create the system and application collectors; return ``(None, None)`` on error."""
    try:
        system_metrics, app_metrics = setup_all_metrics(
            service_name=service_name,
            service_version=service_version,
            meter_provider=meter_provider,
        )
    except Exception as e:
        # Logged as a warning (not an error): metrics export itself still works
        # without the collectors.
        logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
        return None, None

    logger.info(
        "✓ System and application metrics collectors initialized",
        service=service_name,
        system_metrics=["cpu", "memory", "disk", "network"],
        app_metrics=["http_requests", "db_queries"],
    )
    return system_metrics, app_metrics


def _setup_logs(service_name: str, service_version: str) -> Optional[Any]:
    """Configure OTLP log export and return the logging handler (None on error)."""
    try:
        logging_handler = setup_otel_logging(
            service_name=service_name,
            service_version=service_version,
        )
    except Exception as e:
        logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))
        return None

    if logging_handler:
        logger.info("✓ Structured logs export configured", service=service_name)
    else:
        logger.warning("✗ Logs export setup returned None", service=service_name)
    return logging_handler


def _log_setup_summary(
    providers: TelemetryProviders,
    service_name: str,
    service_version: str,
    metrics_protocol: Optional[str],
) -> None:
    """Log one summary event describing which signals ended up enabled and where they export to."""
    try:
        endpoints = OTelConfig.get_endpoints()

        # NOTE(review): anything other than the exact string "http" (including
        # None) is reported as the gRPC endpoint. If setup_otel_metrics resolves
        # a None protocol through OTelConfig.get_protocol("metrics"), this
        # summary can disagree with the exporter actually in use - confirm.
        if not providers.meter_provider:
            metrics_endpoint = "disabled"
        elif metrics_protocol != "http":
            metrics_endpoint = endpoints.metrics_grpc
        else:
            metrics_endpoint = endpoints.metrics_http

        summary = {
            "service": service_name,
            "version": service_version,
            "traces": {
                "enabled": bool(providers.tracer_provider),
                "endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled",
            },
            "metrics": {
                "enabled": bool(providers.meter_provider),
                "endpoint": metrics_endpoint,
                "system_metrics": bool(providers.system_metrics),
                "app_metrics": bool(providers.app_metrics),
            },
            "logs": {
                "enabled": bool(providers.logging_handler),
                "endpoint": endpoints.logs_http if providers.logging_handler else "disabled",
            },
        }
        logger.info("🎉 Telemetry setup complete", **summary)
    except Exception as e:
        # The summary is purely informational; never let it break start-up.
        logger.warning("Could not log endpoint summary", error=str(e))


def setup_telemetry(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    enable_traces: bool = True,
    enable_metrics: bool = True,
    enable_logs: bool = True,
    enable_system_metrics: bool = True,
    metrics_protocol: Optional[str] = None,  # "grpc" or "http", defaults to grpc
    export_interval_millis: int = 60000
) -> TelemetryProviders:
    """
    Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.

    This is the UNIFIED setup function that configures everything:
    - Distributed tracing (gRPC, port 4317)
    - Metrics export (gRPC by default, port 4317)
    - System metrics collection (CPU, memory, disk, network)
    - Application metrics (HTTP requests, DB queries)
    - Structured logs export (HTTP, port 4318)

    All signals use the centralized OTelConfig for endpoint management.
    A signal is only configured when both its ``enable_*`` argument and
    ``OTelConfig.is_enabled(<signal>)`` are true. Each signal is set up on a
    best-effort basis: a failure is logged and leaves the corresponding field
    of the result as ``None`` instead of raising.

    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        enable_traces: Enable distributed tracing (default: True)
        enable_metrics: Enable metrics export to OTLP (default: True)
        enable_logs: Enable logs export to OTLP (default: True)
        enable_system_metrics: Enable system/application metrics collectors
            (default: True). Collectors are only created when metrics export
            produced a meter provider. They can also be switched off with the
            ENABLE_SYSTEM_METRICS environment variable: unset means enabled;
            "1", "true", "yes" and "on" (case-insensitive, surrounding
            whitespace ignored) mean enabled; any other value disables them.
        metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
        export_interval_millis: How often to export metrics in milliseconds

    Returns:
        TelemetryProviders containing all initialized providers and collectors

    Example:
        from shared.monitoring.telemetry import setup_telemetry

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry(
            app,
            service_name="auth-service",
            service_version="1.0.0"
        )
        # All telemetry is now configured:
        # - Traces automatically captured for HTTP requests
        # - System metrics automatically collected
        # - Application metrics via providers.app_metrics
        # - Logs automatically correlated with traces
    """
    logger.info(
        "Setting up unified OpenTelemetry telemetry",
        service=service_name,
        version=service_version,
        traces=enable_traces,
        metrics=enable_metrics,
        logs=enable_logs,
        system_metrics=enable_system_metrics
    )

    providers = TelemetryProviders()

    # Distributed tracing
    if enable_traces and OTelConfig.is_enabled("traces"):
        providers.tracer_provider = _setup_traces(app, service_name, service_version)

    # OTLP metrics export, plus the collectors that depend on its meter provider
    if enable_metrics and OTelConfig.is_enabled("metrics"):
        providers.meter_provider = _setup_metrics(
            service_name,
            service_version,
            metrics_protocol,
            export_interval_millis,
        )
        if (
            providers.meter_provider
            and enable_system_metrics
            and _env_flag("ENABLE_SYSTEM_METRICS")
        ):
            providers.system_metrics, providers.app_metrics = _setup_metric_collectors(
                service_name,
                service_version,
                providers.meter_provider,
            )

    # Logs export
    if enable_logs and OTelConfig.is_enabled("logs"):
        providers.logging_handler = _setup_logs(service_name, service_version)

    _log_setup_summary(providers, service_name, service_version, metrics_protocol)
    return providers
def setup_telemetry_simple(
    app,
    service_name: str,
    service_version: str = "1.0.0"
) -> TelemetryProviders:
    """
    Convenience wrapper: run ``setup_telemetry`` with every optional setting
    left at its default.

    With the defaults this means:
    - gRPC for traces (port 4317)
    - gRPC for metrics (port 4317)
    - HTTP for logs (port 4318)

    All remaining settings are read from environment variables and OTelConfig.

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        service_version: Version of the service

    Returns:
        TelemetryProviders containing all initialized providers

    Example:
        from shared.monitoring.telemetry import setup_telemetry_simple

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry_simple(app, "auth-service")
    """
    # The first three positional parameters of setup_telemetry are exactly
    # (app, service_name, service_version); everything else keeps its default.
    providers = setup_telemetry(app, service_name, service_version)
    return providers
def get_telemetry_status() -> Dict[str, Any]:
    """
    Report the telemetry configuration as currently seen by OTelConfig.

    Nothing is set up or modified here; the function only reads
    ``OTelConfig.get_endpoints()``, ``OTelConfig.is_enabled(...)`` and
    ``OTelConfig.get_protocol("metrics")``.

    Returns:
        Dictionary with one entry per signal ("traces", "metrics", "logs").
        Each entry carries an "enabled" flag and a "protocol"; traces and
        logs expose a single "endpoint", while metrics expose both
        "grpc_endpoint" and "http_endpoint".

    Example:
        from shared.monitoring.telemetry import get_telemetry_status

        status = get_telemetry_status()
        print(f"Tracing enabled: {status['traces']['enabled']}")
    """
    otel_endpoints = OTelConfig.get_endpoints()

    # Traces are always reported as gRPC
    traces_status = {
        "enabled": OTelConfig.is_enabled("traces"),
        "protocol": "grpc",
        "endpoint": otel_endpoints.traces_grpc,
    }

    # Metrics protocol is configurable, so both candidate endpoints are reported
    metrics_status = {
        "enabled": OTelConfig.is_enabled("metrics"),
        "protocol": OTelConfig.get_protocol("metrics"),
        "grpc_endpoint": otel_endpoints.metrics_grpc,
        "http_endpoint": otel_endpoints.metrics_http,
    }

    # Logs are always reported as HTTP
    logs_status = {
        "enabled": OTelConfig.is_enabled("logs"),
        "protocol": "http",
        "endpoint": otel_endpoints.logs_http,
    }

    return {
        "traces": traces_status,
        "metrics": metrics_status,
        "logs": logs_status,
    }