Update monitoring packages to latest versions

- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1

- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
This commit is contained in:
Urtzi Alfaro
2026-01-08 19:25:52 +01:00
parent dfb7e4b237
commit 29d19087f1
129 changed files with 5718 additions and 1821 deletions

View File

@@ -1,79 +1,101 @@
# ================================================================
# shared/monitoring/metrics.py - FIXED VERSION
# ================================================================
"""
Centralized metrics collection for microservices - Fixed middleware issue
OpenTelemetry Metrics Collection for Microservices
Replaces Prometheus with native OpenTelemetry metrics export to SigNoz
"""
import time
import logging
from typing import Dict, Any, List, Optional
from prometheus_client import Counter, Histogram, Gauge, start_http_server, generate_latest
import structlog
from typing import Dict, Any, Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from fastapi import Request, Response
from threading import Lock
import os
logger = logging.getLogger(__name__)
logger = structlog.get_logger()
# Global registry for metrics collectors
_metrics_registry: Dict[str, 'MetricsCollector'] = {}
_registry_lock = Lock()
# Default Prometheus metrics
DEFAULT_REQUEST_COUNT = Counter(
'http_requests_total',
'Total HTTP requests',
['method', 'endpoint', 'status_code', 'service']
)
DEFAULT_REQUEST_DURATION = Histogram(
'http_request_duration_seconds',
'HTTP request duration in seconds',
['method', 'endpoint', 'service']
)
DEFAULT_ACTIVE_CONNECTIONS = Gauge(
'active_connections',
'Active database connections',
['service']
)
class MetricsCollector:
    """
    OpenTelemetry-based metrics collector for microservices.
    Exports metrics directly to SigNoz via OTLP (no Prometheus).
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        """Create a collector and register it in the global registry.

        Args:
            service_name: Logical service name; also used as a metric-name prefix.
            service_version: Reported service version (informational).
            meter_provider: Explicit MeterProvider; falls back to the global one.
        """
        self.service_name = service_name
        self.service_version = service_version
        self.start_time = time.time()

        # Use the provided meter provider, or fall back to the global provider.
        if meter_provider:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # Instrument caches keyed by short (unprefixed) metric name.
        self._counters: Dict[str, Any] = {}
        self._histograms: Dict[str, Any] = {}
        self._up_down_counters: Dict[str, Any] = {}
        # Last absolute value per gauge+labels, backing set_gauge()'s delta logic.
        self._gauge_values: Dict[str, float] = {}
        self._lock = Lock()

        # Register in the process-wide registry so other modules can look us up.
        with _registry_lock:
            _metrics_registry[service_name] = self

        # Create default HTTP metrics (request count, latency, in-flight).
        self._setup_default_metrics()

        logger.info(
            "OpenTelemetry metrics collector initialized",
            service=service_name
        )
def _setup_default_metrics(self):
"""Setup default HTTP metrics"""
self._counters["http_requests_total"] = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_http_requests_total",
description="Total HTTP requests",
unit="requests"
)
self._histograms["http_request_duration"] = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_http_request_duration_seconds",
description="HTTP request duration in seconds",
unit="s"
)
self._up_down_counters["active_requests"] = self.meter.create_up_down_counter(
name=f"{self.service_name.replace('-', '_')}_active_requests",
description="Number of active HTTP requests",
unit="requests"
)
def register_counter(self, name: str, documentation: str, labels: list = None) -> Any:
"""Register a custom Counter metric"""
with self._lock:
if name in self._counters:
logger.warning(f"Counter '{name}' already registered for {self.service_name}")
return self._counters[name]
if labels is None:
labels = ['service']
elif 'service' not in labels:
labels.append('service')
try:
counter = Counter(f"{self.service_name.replace('-', '_')}_{name}", documentation, labelnames=labels)
counter = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._counters[name] = counter
logger.info(f"Registered counter: {name} for {self.service_name}")
return counter
@@ -81,65 +103,46 @@ class MetricsCollector:
logger.error(f"Failed to register counter {name} for {self.service_name}: {e}")
raise
def register_histogram(self, name: str, documentation: str, labels: List[str] = None,
buckets: tuple = Histogram.DEFAULT_BUCKETS) -> Histogram:
"""Register a custom Histogram metric."""
def register_histogram(
self,
name: str,
documentation: str,
labels: list = None,
buckets: tuple = None
) -> Any:
"""Register a custom Histogram metric"""
with self._lock:
if name in self._histograms:
logger.warning(f"Histogram '{name}' already registered for {self.service_name}")
return self._histograms[name]
if labels is None:
labels = ['service']
elif 'service' not in labels:
labels.append('service')
try:
histogram = Histogram(f"{self.service_name.replace('-', '_')}_{name}", documentation,
labelnames=labels, buckets=buckets)
histogram = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._histograms[name] = histogram
logger.info(f"Registered histogram: {name} for {self.service_name}")
return histogram
except ValueError as e:
if "Duplicated timeseries" in str(e):
# Metric already exists in global registry, try to find it
from prometheus_client import REGISTRY
metric_name = f"{self.service_name.replace('-', '_')}_{name}"
for collector in REGISTRY._collector_to_names.keys():
if hasattr(collector, '_name') and collector._name == metric_name:
self._histograms[name] = collector
logger.warning(f"Reusing existing histogram: {name} for {self.service_name}")
return collector
# If we can't find it, create a new name with suffix
import time
suffix = str(int(time.time() * 1000))[-6:] # Last 6 digits of timestamp
histogram = Histogram(f"{self.service_name.replace('-', '_')}_{name}_{suffix}",
documentation, labelnames=labels, buckets=buckets)
self._histograms[name] = histogram
logger.warning(f"Created histogram with suffix: {name}_{suffix} for {self.service_name}")
return histogram
else:
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
raise
except Exception as e:
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
raise
def register_gauge(self, name: str, documentation: str, labels: List[str] = None) -> Gauge:
"""Register a custom Gauge metric."""
def register_gauge(self, name: str, documentation: str, labels: list = None) -> Any:
"""Register a custom Gauge metric (using UpDownCounter)"""
with self._lock:
if name in self._gauges:
if name in self._up_down_counters:
logger.warning(f"Gauge '{name}' already registered for {self.service_name}")
return self._gauges[name]
if labels is None:
labels = ['service']
elif 'service' not in labels:
labels.append('service')
return self._up_down_counters[name]
try:
gauge = Gauge(f"{self.service_name.replace('-', '_')}_{name}", documentation, labelnames=labels)
self._gauges[name] = gauge
gauge = self.meter.create_up_down_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._up_down_counters[name] = gauge
logger.info(f"Registered gauge: {name} for {self.service_name}")
return gauge
except Exception as e:
@@ -147,104 +150,118 @@ class MetricsCollector:
raise
def increment_counter(self, name: str, value: int = 1, labels: Dict[str, str] = None):
"""Increment a counter metric."""
"""Increment a counter metric"""
if name not in self._counters:
logger.error(f"Counter '{name}' not registered for {self.service_name}. Cannot increment.")
logger.error(f"Counter '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {'service': self.service_name}
elif 'service' not in labels:
labels['service'] = self.service_name
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._counters[name].labels(**labels).inc(value)
self._counters[name].add(value, labels)
except Exception as e:
logger.error(f"Failed to increment counter {name} for {self.service_name}: {e}")
def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
"""Observe a histogram metric."""
"""Observe a histogram metric"""
if name not in self._histograms:
logger.error(f"Histogram '{name}' not registered for {self.service_name}. Cannot observe.")
logger.error(f"Histogram '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {'service': self.service_name}
elif 'service' not in labels:
labels['service'] = self.service_name
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._histograms[name].labels(**labels).observe(value)
self._histograms[name].record(value, labels)
except Exception as e:
logger.error(f"Failed to observe histogram {name} for {self.service_name}: {e}")
def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
"""Set a gauge metric."""
if name not in self._gauges:
logger.error(f"Gauge '{name}' not registered for {self.service_name}. Cannot set.")
"""Set a gauge metric (using add for UpDownCounter)"""
if name not in self._up_down_counters:
logger.error(f"Gauge '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {'service': self.service_name}
elif 'service' not in labels:
labels['service'] = self.service_name
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._gauges[name].labels(**labels).set(value)
# For UpDownCounter, we need to track the delta
# Store current value and calculate delta
key = f"{name}_{str(sorted(labels.items()))}"
if not hasattr(self, '_gauge_values'):
self._gauge_values = {}
old_value = self._gauge_values.get(key, 0)
delta = value - old_value
self._gauge_values[key] = value
self._up_down_counters[name].add(delta, labels)
except Exception as e:
logger.error(f"Failed to set gauge {name} for {self.service_name}: {e}")
def record_request(self, method: str, endpoint: str, status_code: int, duration: float):
"""Record HTTP request metrics using default metrics."""
"""Record HTTP request metrics"""
try:
DEFAULT_REQUEST_COUNT.labels(
method=method,
endpoint=endpoint,
status_code=status_code,
service=self.service_name
).inc()
attributes = {
"service": self.service_name,
"http.method": method,
"http.route": endpoint,
"http.status_code": str(status_code)
}
DEFAULT_REQUEST_DURATION.labels(
method=method,
endpoint=endpoint,
service=self.service_name
).observe(duration)
self._counters["http_requests_total"].add(1, attributes)
self._histograms["http_request_duration"].record(duration, attributes)
except Exception as e:
logger.error(f"Failed to record request metrics for {self.service_name}: {e}")
def set_active_connections(self, count: int):
"""Set active database connections using default gauge."""
def increment_active_requests(self):
"""Increment active request counter"""
try:
DEFAULT_ACTIVE_CONNECTIONS.labels(service=self.service_name).set(count)
self._up_down_counters["active_requests"].add(1, {"service": self.service_name})
except Exception as e:
logger.error(f"Failed to set active connections for {self.service_name}: {e}")
logger.error(f"Failed to increment active requests: {e}")
def get_metrics(self) -> str:
"""Return Prometheus metrics in exposition format."""
def decrement_active_requests(self):
"""Decrement active request counter"""
try:
return generate_latest().decode('utf-8')
self._up_down_counters["active_requests"].add(-1, {"service": self.service_name})
except Exception as e:
logger.error(f"Failed to generate metrics for {self.service_name}: {e}")
return ""
logger.error(f"Failed to decrement active requests: {e}")
def set_active_connections(self, count: int):
"""Set active database connections"""
self.set_gauge("active_connections", count)
def get_metrics_collector(service_name: str) -> Optional[MetricsCollector]:
    """Look up a previously created collector by service name (None if absent)."""
    with _registry_lock:
        return _metrics_registry.get(service_name)
def create_metrics_collector(
    service_name: str,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
    """
    Create a metrics collector, or return the existing one for this service.
    This should be called BEFORE app startup, not during lifespan.

    Note: when an existing collector is returned, the service_version and
    meter_provider arguments are ignored.
    """
    existing = get_metrics_collector(service_name)
    if existing:
        return existing
    return MetricsCollector(service_name, service_version, meter_provider)
def add_metrics_middleware(app, metrics_collector: MetricsCollector):
    """
    Attach an HTTP middleware that records request count/duration and tracks
    the number of in-flight requests.

    Must be called before the app starts. Returns the collector for chaining.
    """
    @app.middleware("http")
    async def metrics_middleware(request: Request, call_next):
        # Track in-flight requests for the active_requests up-down counter.
        metrics_collector.increment_active_requests()
        start_time = time.time()
        try:
            response = await call_next(request)
            metrics_collector.record_request(
                method=request.method,
                # NOTE(review): endpoint line was truncated in the diff;
                # request.url.path is the conventional choice — confirm.
                endpoint=request.url.path,
                status_code=response.status_code,
                duration=time.time() - start_time,
            )
            return response
        except Exception:
            # Record failed requests as 500s, then re-raise for the app's handlers.
            metrics_collector.record_request(
                method=request.method,
                endpoint=request.url.path,
                status_code=500,
                duration=time.time() - start_time,
            )
            raise
        finally:
            # Always decrement, even if record_request itself fails.
            metrics_collector.decrement_active_requests()
    return metrics_collector
def add_metrics_endpoint(app, metrics_collector: MetricsCollector):
    """Add metrics endpoint to app

    NOTE(review): legacy Prometheus exposition endpoint. The commit message
    states metrics now flow to SigNoz via OTLP and the Prometheus client was
    removed, and MetricsCollector.get_metrics appears to be gone in the new
    version — confirm whether this helper should be deleted.
    """
    @app.get("/metrics")
    async def prometheus_metrics():
        """Prometheus metrics endpoint"""
        # Serves the classic text exposition format expected by Prometheus scrapers.
        return Response(
            content=metrics_collector.get_metrics(),
            media_type="text/plain; version=0.0.4; charset=utf-8"
        )
def setup_metrics_early(
    app,
    service_name: str = None,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
    """
    Setup metrics collection BEFORE app startup.
    This must be called before adding any middleware or starting the app.
    Note: No Prometheus endpoint is created - all metrics go to SigNoz via OTLP
    """
    if service_name is None:
        # Derive a slug-like service name from the FastAPI app title.
        service_name = getattr(app, 'title', 'unknown-service').lower().replace(' ', '-').replace('.', '_')

    # Create (or reuse) the collector for this service.
    metrics_collector = create_metrics_collector(service_name, service_version, meter_provider)

    # Add middleware (must be before app starts).
    add_metrics_middleware(app, metrics_collector)

    # Store in app state for access from routes.
    app.state.metrics_collector = metrics_collector

    logger.info(f"OpenTelemetry metrics setup completed for service: {service_name}")
    return metrics_collector
# Helper decorator kept for backward compatibility; real metrics are
# recorded by the HTTP middleware, so both wrappers are pass-throughs.
def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
    """Decorator for tracking endpoint metrics - metrics handled by middleware.

    Args:
        endpoint_name: Unused; accepted for backward compatibility.
        service_name: Unused; accepted for backward compatibility.

    Returns:
        A decorator that wraps sync and async callables transparently.
    """
    def decorator(func):
        import asyncio
        from functools import wraps

        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            # Pass through - metrics are handled by middleware.
            return await func(*args, **kwargs)

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            # Pass through - metrics are handled by middleware.
            return func(*args, **kwargs)

        # Return the wrapper matching the decorated function's type; this
        # dispatch line was truncated in the diff (@@ at the tail of the file).
        if asyncio.iscoroutinefunction(func):
            return async_wrapper
        else:
            return sync_wrapper
    return decorator