Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
from shared.monitoring.metrics import setup_metrics_early
from shared.monitoring import (
setup_logging,
setup_telemetry
)
from shared.monitoring.health_checks import setup_fastapi_health_checks
from shared.monitoring.tracing import setup_tracing
from shared.database.base import DatabaseManager
if TYPE_CHECKING:
@@ -77,24 +78,13 @@ class BaseFastAPIService:
# Initialize logging
setup_logging(service_name, log_level)
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
setup_otel_logging(service_name, version)
self.logger = structlog.get_logger()
self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
except Exception as e:
self.logger = structlog.get_logger()
self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
else:
self.logger = structlog.get_logger()
self.logger = structlog.get_logger()
# Will be set during app creation
self.app: Optional[FastAPI] = None
self.metrics_collector = None
self.health_manager = None
self.alert_service = None
self.telemetry_providers = None # Contains all OTEL providers and metrics collectors
def create_app(self, **fastapi_kwargs) -> FastAPI:
"""
@@ -116,49 +106,25 @@ class BaseFastAPIService:
# Create FastAPI app
self.app = FastAPI(**config)
# Setup metrics BEFORE middleware and lifespan
if self.enable_metrics:
self.metrics_collector = setup_metrics_early(self.app, self.service_name)
# Setup OpenTelemetry metrics export if enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if enable_otel_metrics:
try:
self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
if self.otel_meter_provider:
self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
# Setup system metrics collection (CPU, memory, disk, network)
enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_metrics:
try:
self.system_metrics, self.app_metrics = setup_all_metrics(
self.service_name,
self.version,
self.otel_meter_provider
)
self.logger.info(f"System metrics collection enabled for {self.service_name}")
except Exception as e:
self.logger.warning(f"Failed to setup system metrics: {e}")
except Exception as e:
self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
# Setup distributed tracing
# Check both constructor flag and environment variable
tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
if tracing_enabled:
try:
otel_endpoint = os.getenv(
"OTEL_COLLECTOR_ENDPOINT",
"http://signoz-otel-collector.bakery-ia:4318"
)
setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
self.logger.info(f"Distributed tracing enabled for {self.service_name}")
except Exception as e:
self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
else:
self.logger.info(f"Distributed tracing disabled for {self.service_name}")
# Setup unified OpenTelemetry telemetry
# This single call configures:
# - Distributed tracing (gRPC, port 4317)
# - OTLP metrics export (gRPC, port 4317)
# - System metrics collection (CPU, memory, disk, network)
# - Application metrics (HTTP requests, DB queries)
# - Structured logs export (HTTP, port 4318)
try:
self.telemetry_providers = setup_telemetry(
app=self.app,
service_name=self.service_name,
service_version=self.version,
enable_traces=self.enable_tracing,
enable_metrics=self.enable_metrics,
enable_logs=True, # Controlled by OTEL_LOGS_EXPORTER env var
enable_system_metrics=True # Controlled by ENABLE_SYSTEM_METRICS env var
)
except Exception as e:
self.logger.warning("Failed to setup telemetry", error=str(e))
# Setup lifespan
self.app.router.lifespan_context = self._create_lifespan()
@@ -361,10 +327,6 @@ class BaseFastAPIService:
method=request.method
)
# Record error metric if available
if self.metrics_collector:
self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
return JSONResponse(
status_code=500,
content={
@@ -409,7 +371,10 @@ class BaseFastAPIService:
def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
"""
Register custom metrics for the service
Register custom OTEL metrics for the service.
Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
are automatically created by setup_telemetry(). Use this for additional custom metrics.
Args:
metrics_config: Dict with metric name as key and config as value
@@ -417,25 +382,36 @@ class BaseFastAPIService:
"user_registrations": {
"type": "counter",
"description": "Total user registrations",
"labels": ["status"]
"unit": "registrations"
}
}
"""
if not self.metrics_collector:
self.logger.warning("Metrics collector not available")
if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
self.logger.warning("OTEL meter provider not available - metrics not registered")
return
from opentelemetry.metrics import get_meter
meter = get_meter(self.service_name)
for metric_name, config in metrics_config.items():
metric_type = config.get("type", "counter")
description = config.get("description", f"{metric_name} metric")
labels = config.get("labels", [])
unit = config.get("unit", "1")
if metric_type == "counter":
self.metrics_collector.register_counter(metric_name, description, labels=labels)
elif metric_type == "histogram":
self.metrics_collector.register_histogram(metric_name, description, labels=labels)
else:
self.logger.warning(f"Unsupported metric type: {metric_type}")
try:
if metric_type == "counter":
meter.create_counter(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom counter: {metric_name}")
elif metric_type == "histogram":
meter.create_histogram(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom histogram: {metric_name}")
elif metric_type == "gauge":
meter.create_up_down_counter(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom gauge: {metric_name}")
else:
self.logger.warning(f"Unsupported metric type: {metric_type}")
except Exception as e:
self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
"""