Improve monitoring 5
This commit is contained in:
@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.routing import APIRouter
|
||||
|
||||
from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
|
||||
from shared.monitoring.metrics import setup_metrics_early
|
||||
from shared.monitoring import (
|
||||
setup_logging,
|
||||
setup_telemetry
|
||||
)
|
||||
from shared.monitoring.health_checks import setup_fastapi_health_checks
|
||||
from shared.monitoring.tracing import setup_tracing
|
||||
from shared.database.base import DatabaseManager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -77,24 +78,13 @@ class BaseFastAPIService:
|
||||
|
||||
# Initialize logging
|
||||
setup_logging(service_name, log_level)
|
||||
|
||||
# Setup OpenTelemetry logging export if enabled
|
||||
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
|
||||
try:
|
||||
setup_otel_logging(service_name, version)
|
||||
self.logger = structlog.get_logger()
|
||||
self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
|
||||
except Exception as e:
|
||||
self.logger = structlog.get_logger()
|
||||
self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
|
||||
else:
|
||||
self.logger = structlog.get_logger()
|
||||
self.logger = structlog.get_logger()
|
||||
|
||||
# Will be set during app creation
|
||||
self.app: Optional[FastAPI] = None
|
||||
self.metrics_collector = None
|
||||
self.health_manager = None
|
||||
self.alert_service = None
|
||||
self.telemetry_providers = None # Contains all OTEL providers and metrics collectors
|
||||
|
||||
def create_app(self, **fastapi_kwargs) -> FastAPI:
|
||||
"""
|
||||
@@ -116,49 +106,25 @@ class BaseFastAPIService:
|
||||
# Create FastAPI app
|
||||
self.app = FastAPI(**config)
|
||||
|
||||
# Setup metrics BEFORE middleware and lifespan
|
||||
if self.enable_metrics:
|
||||
self.metrics_collector = setup_metrics_early(self.app, self.service_name)
|
||||
|
||||
# Setup OpenTelemetry metrics export if enabled
|
||||
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
|
||||
if enable_otel_metrics:
|
||||
try:
|
||||
self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
|
||||
if self.otel_meter_provider:
|
||||
self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
|
||||
|
||||
# Setup system metrics collection (CPU, memory, disk, network)
|
||||
enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
|
||||
if enable_system_metrics:
|
||||
try:
|
||||
self.system_metrics, self.app_metrics = setup_all_metrics(
|
||||
self.service_name,
|
||||
self.version,
|
||||
self.otel_meter_provider
|
||||
)
|
||||
self.logger.info(f"System metrics collection enabled for {self.service_name}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to setup system metrics: {e}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
|
||||
|
||||
# Setup distributed tracing
|
||||
# Check both constructor flag and environment variable
|
||||
tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
|
||||
|
||||
if tracing_enabled:
|
||||
try:
|
||||
otel_endpoint = os.getenv(
|
||||
"OTEL_COLLECTOR_ENDPOINT",
|
||||
"http://signoz-otel-collector.bakery-ia:4318"
|
||||
)
|
||||
setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
|
||||
self.logger.info(f"Distributed tracing enabled for {self.service_name}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
|
||||
else:
|
||||
self.logger.info(f"Distributed tracing disabled for {self.service_name}")
|
||||
# Setup unified OpenTelemetry telemetry
|
||||
# This single call configures:
|
||||
# - Distributed tracing (gRPC, port 4317)
|
||||
# - OTLP metrics export (gRPC, port 4317)
|
||||
# - System metrics collection (CPU, memory, disk, network)
|
||||
# - Application metrics (HTTP requests, DB queries)
|
||||
# - Structured logs export (HTTP, port 4318)
|
||||
try:
|
||||
self.telemetry_providers = setup_telemetry(
|
||||
app=self.app,
|
||||
service_name=self.service_name,
|
||||
service_version=self.version,
|
||||
enable_traces=self.enable_tracing,
|
||||
enable_metrics=self.enable_metrics,
|
||||
enable_logs=True, # Controlled by OTEL_LOGS_EXPORTER env var
|
||||
enable_system_metrics=True # Controlled by ENABLE_SYSTEM_METRICS env var
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.warning("Failed to setup telemetry", error=str(e))
|
||||
|
||||
# Setup lifespan
|
||||
self.app.router.lifespan_context = self._create_lifespan()
|
||||
@@ -361,10 +327,6 @@ class BaseFastAPIService:
|
||||
method=request.method
|
||||
)
|
||||
|
||||
# Record error metric if available
|
||||
if self.metrics_collector:
|
||||
self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
|
||||
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content={
|
||||
@@ -409,7 +371,10 @@ class BaseFastAPIService:
|
||||
|
||||
def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
|
||||
"""
|
||||
Register custom metrics for the service
|
||||
Register custom OTEL metrics for the service.
|
||||
|
||||
Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
|
||||
are automatically created by setup_telemetry(). Use this for additional custom metrics.
|
||||
|
||||
Args:
|
||||
metrics_config: Dict with metric name as key and config as value
|
||||
@@ -417,25 +382,36 @@ class BaseFastAPIService:
|
||||
"user_registrations": {
|
||||
"type": "counter",
|
||||
"description": "Total user registrations",
|
||||
"labels": ["status"]
|
||||
"unit": "registrations"
|
||||
}
|
||||
}
|
||||
"""
|
||||
if not self.metrics_collector:
|
||||
self.logger.warning("Metrics collector not available")
|
||||
if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
|
||||
self.logger.warning("OTEL meter provider not available - metrics not registered")
|
||||
return
|
||||
|
||||
from opentelemetry.metrics import get_meter
|
||||
meter = get_meter(self.service_name)
|
||||
|
||||
for metric_name, config in metrics_config.items():
|
||||
metric_type = config.get("type", "counter")
|
||||
description = config.get("description", f"{metric_name} metric")
|
||||
labels = config.get("labels", [])
|
||||
unit = config.get("unit", "1")
|
||||
|
||||
if metric_type == "counter":
|
||||
self.metrics_collector.register_counter(metric_name, description, labels=labels)
|
||||
elif metric_type == "histogram":
|
||||
self.metrics_collector.register_histogram(metric_name, description, labels=labels)
|
||||
else:
|
||||
self.logger.warning(f"Unsupported metric type: {metric_type}")
|
||||
try:
|
||||
if metric_type == "counter":
|
||||
meter.create_counter(metric_name, description=description, unit=unit)
|
||||
self.logger.info(f"Registered custom counter: {metric_name}")
|
||||
elif metric_type == "histogram":
|
||||
meter.create_histogram(metric_name, description=description, unit=unit)
|
||||
self.logger.info(f"Registered custom histogram: {metric_name}")
|
||||
elif metric_type == "gauge":
|
||||
meter.create_up_down_counter(metric_name, description=description, unit=unit)
|
||||
self.logger.info(f"Registered custom gauge: {metric_name}")
|
||||
else:
|
||||
self.logger.warning(f"Unsupported metric type: {metric_type}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
|
||||
|
||||
def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user