Update monitoring packages to latest versions
- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1
- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
This commit is contained in:
@@ -1,79 +1,101 @@
|
||||
# ================================================================
|
||||
# shared/monitoring/metrics.py - FIXED VERSION
|
||||
# ================================================================
|
||||
"""
|
||||
Centralized metrics collection for microservices - Fixed middleware issue
|
||||
OpenTelemetry Metrics Collection for Microservices
|
||||
Replaces Prometheus with native OpenTelemetry metrics export to SigNoz
|
||||
"""
|
||||
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from prometheus_client import Counter, Histogram, Gauge, start_http_server, generate_latest
|
||||
import structlog
|
||||
from typing import Dict, Any, Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
from fastapi import Request, Response
|
||||
from threading import Lock
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = structlog.get_logger()
|
||||
|
||||
# Global registry for metrics collectors
|
||||
_metrics_registry: Dict[str, 'MetricsCollector'] = {}
|
||||
_registry_lock = Lock()
|
||||
|
||||
# Default Prometheus metrics
|
||||
DEFAULT_REQUEST_COUNT = Counter(
|
||||
'http_requests_total',
|
||||
'Total HTTP requests',
|
||||
['method', 'endpoint', 'status_code', 'service']
|
||||
)
|
||||
|
||||
DEFAULT_REQUEST_DURATION = Histogram(
|
||||
'http_request_duration_seconds',
|
||||
'HTTP request duration in seconds',
|
||||
['method', 'endpoint', 'service']
|
||||
)
|
||||
|
||||
DEFAULT_ACTIVE_CONNECTIONS = Gauge(
|
||||
'active_connections',
|
||||
'Active database connections',
|
||||
['service']
|
||||
)
|
||||
|
||||
class MetricsCollector:
|
||||
"""Thread-safe metrics collector for microservices"""
|
||||
"""
|
||||
OpenTelemetry-based metrics collector for microservices.
|
||||
Exports metrics directly to SigNoz via OTLP (no Prometheus).
|
||||
"""
|
||||
|
||||
def __init__(self, service_name: str):
|
||||
def __init__(
|
||||
self,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
):
|
||||
self.service_name = service_name
|
||||
self.service_version = service_version
|
||||
self.start_time = time.time()
|
||||
self._counters: Dict[str, Counter] = {}
|
||||
self._histograms: Dict[str, Histogram] = {}
|
||||
self._gauges: Dict[str, Gauge] = {}
|
||||
|
||||
# Use provided meter provider or get global
|
||||
if meter_provider:
|
||||
self.meter = meter_provider.get_meter(__name__)
|
||||
else:
|
||||
self.meter = metrics.get_meter(__name__)
|
||||
|
||||
# Store created instruments
|
||||
self._counters: Dict[str, Any] = {}
|
||||
self._histograms: Dict[str, Any] = {}
|
||||
self._up_down_counters: Dict[str, Any] = {}
|
||||
self._lock = Lock()
|
||||
|
||||
|
||||
# Register in global registry
|
||||
with _registry_lock:
|
||||
_metrics_registry[service_name] = self
|
||||
|
||||
def start_metrics_server(self, port: int = 8080):
|
||||
"""Start Prometheus metrics server"""
|
||||
try:
|
||||
start_http_server(port)
|
||||
logger.info(f"Metrics server started on port {port} for {self.service_name}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start metrics server for {self.service_name}: {e}")
|
||||
# Create default HTTP metrics
|
||||
self._setup_default_metrics()
|
||||
|
||||
def register_counter(self, name: str, documentation: str, labels: List[str] = None) -> Counter:
|
||||
"""Register a custom Counter metric."""
|
||||
logger.info(
|
||||
"OpenTelemetry metrics collector initialized",
|
||||
service=service_name
|
||||
)
|
||||
|
||||
def _setup_default_metrics(self):
|
||||
"""Setup default HTTP metrics"""
|
||||
self._counters["http_requests_total"] = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_http_requests_total",
|
||||
description="Total HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
self._histograms["http_request_duration"] = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_http_request_duration_seconds",
|
||||
description="HTTP request duration in seconds",
|
||||
unit="s"
|
||||
)
|
||||
|
||||
self._up_down_counters["active_requests"] = self.meter.create_up_down_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_active_requests",
|
||||
description="Number of active HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
def register_counter(self, name: str, documentation: str, labels: list = None) -> Any:
|
||||
"""Register a custom Counter metric"""
|
||||
with self._lock:
|
||||
if name in self._counters:
|
||||
logger.warning(f"Counter '{name}' already registered for {self.service_name}")
|
||||
return self._counters[name]
|
||||
|
||||
if labels is None:
|
||||
labels = ['service']
|
||||
elif 'service' not in labels:
|
||||
labels.append('service')
|
||||
|
||||
|
||||
try:
|
||||
counter = Counter(f"{self.service_name.replace('-', '_')}_{name}", documentation, labelnames=labels)
|
||||
counter = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._counters[name] = counter
|
||||
logger.info(f"Registered counter: {name} for {self.service_name}")
|
||||
return counter
|
||||
@@ -81,65 +103,46 @@ class MetricsCollector:
|
||||
logger.error(f"Failed to register counter {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def register_histogram(self, name: str, documentation: str, labels: List[str] = None,
|
||||
buckets: tuple = Histogram.DEFAULT_BUCKETS) -> Histogram:
|
||||
"""Register a custom Histogram metric."""
|
||||
def register_histogram(
|
||||
self,
|
||||
name: str,
|
||||
documentation: str,
|
||||
labels: list = None,
|
||||
buckets: tuple = None
|
||||
) -> Any:
|
||||
"""Register a custom Histogram metric"""
|
||||
with self._lock:
|
||||
if name in self._histograms:
|
||||
logger.warning(f"Histogram '{name}' already registered for {self.service_name}")
|
||||
return self._histograms[name]
|
||||
|
||||
if labels is None:
|
||||
labels = ['service']
|
||||
elif 'service' not in labels:
|
||||
labels.append('service')
|
||||
|
||||
|
||||
try:
|
||||
histogram = Histogram(f"{self.service_name.replace('-', '_')}_{name}", documentation,
|
||||
labelnames=labels, buckets=buckets)
|
||||
histogram = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._histograms[name] = histogram
|
||||
logger.info(f"Registered histogram: {name} for {self.service_name}")
|
||||
return histogram
|
||||
except ValueError as e:
|
||||
if "Duplicated timeseries" in str(e):
|
||||
# Metric already exists in global registry, try to find it
|
||||
from prometheus_client import REGISTRY
|
||||
metric_name = f"{self.service_name.replace('-', '_')}_{name}"
|
||||
for collector in REGISTRY._collector_to_names.keys():
|
||||
if hasattr(collector, '_name') and collector._name == metric_name:
|
||||
self._histograms[name] = collector
|
||||
logger.warning(f"Reusing existing histogram: {name} for {self.service_name}")
|
||||
return collector
|
||||
# If we can't find it, create a new name with suffix
|
||||
import time
|
||||
suffix = str(int(time.time() * 1000))[-6:] # Last 6 digits of timestamp
|
||||
histogram = Histogram(f"{self.service_name.replace('-', '_')}_{name}_{suffix}",
|
||||
documentation, labelnames=labels, buckets=buckets)
|
||||
self._histograms[name] = histogram
|
||||
logger.warning(f"Created histogram with suffix: {name}_{suffix} for {self.service_name}")
|
||||
return histogram
|
||||
else:
|
||||
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def register_gauge(self, name: str, documentation: str, labels: List[str] = None) -> Gauge:
|
||||
"""Register a custom Gauge metric."""
|
||||
def register_gauge(self, name: str, documentation: str, labels: list = None) -> Any:
|
||||
"""Register a custom Gauge metric (using UpDownCounter)"""
|
||||
with self._lock:
|
||||
if name in self._gauges:
|
||||
if name in self._up_down_counters:
|
||||
logger.warning(f"Gauge '{name}' already registered for {self.service_name}")
|
||||
return self._gauges[name]
|
||||
|
||||
if labels is None:
|
||||
labels = ['service']
|
||||
elif 'service' not in labels:
|
||||
labels.append('service')
|
||||
|
||||
return self._up_down_counters[name]
|
||||
|
||||
try:
|
||||
gauge = Gauge(f"{self.service_name.replace('-', '_')}_{name}", documentation, labelnames=labels)
|
||||
self._gauges[name] = gauge
|
||||
gauge = self.meter.create_up_down_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._up_down_counters[name] = gauge
|
||||
logger.info(f"Registered gauge: {name} for {self.service_name}")
|
||||
return gauge
|
||||
except Exception as e:
|
||||
@@ -147,104 +150,118 @@ class MetricsCollector:
|
||||
raise
|
||||
|
||||
def increment_counter(self, name: str, value: int = 1, labels: Dict[str, str] = None):
|
||||
"""Increment a counter metric."""
|
||||
"""Increment a counter metric"""
|
||||
if name not in self._counters:
|
||||
logger.error(f"Counter '{name}' not registered for {self.service_name}. Cannot increment.")
|
||||
logger.error(f"Counter '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {'service': self.service_name}
|
||||
elif 'service' not in labels:
|
||||
labels['service'] = self.service_name
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._counters[name].labels(**labels).inc(value)
|
||||
self._counters[name].add(value, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to increment counter {name} for {self.service_name}: {e}")
|
||||
|
||||
def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
|
||||
"""Observe a histogram metric."""
|
||||
"""Observe a histogram metric"""
|
||||
if name not in self._histograms:
|
||||
logger.error(f"Histogram '{name}' not registered for {self.service_name}. Cannot observe.")
|
||||
logger.error(f"Histogram '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {'service': self.service_name}
|
||||
elif 'service' not in labels:
|
||||
labels['service'] = self.service_name
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._histograms[name].labels(**labels).observe(value)
|
||||
self._histograms[name].record(value, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to observe histogram {name} for {self.service_name}: {e}")
|
||||
|
||||
def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
|
||||
"""Set a gauge metric."""
|
||||
if name not in self._gauges:
|
||||
logger.error(f"Gauge '{name}' not registered for {self.service_name}. Cannot set.")
|
||||
"""Set a gauge metric (using add for UpDownCounter)"""
|
||||
if name not in self._up_down_counters:
|
||||
logger.error(f"Gauge '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {'service': self.service_name}
|
||||
elif 'service' not in labels:
|
||||
labels['service'] = self.service_name
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._gauges[name].labels(**labels).set(value)
|
||||
# For UpDownCounter, we need to track the delta
|
||||
# Store current value and calculate delta
|
||||
key = f"{name}_{str(sorted(labels.items()))}"
|
||||
if not hasattr(self, '_gauge_values'):
|
||||
self._gauge_values = {}
|
||||
|
||||
old_value = self._gauge_values.get(key, 0)
|
||||
delta = value - old_value
|
||||
self._gauge_values[key] = value
|
||||
|
||||
self._up_down_counters[name].add(delta, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set gauge {name} for {self.service_name}: {e}")
|
||||
|
||||
def record_request(self, method: str, endpoint: str, status_code: int, duration: float):
|
||||
"""Record HTTP request metrics using default metrics."""
|
||||
"""Record HTTP request metrics"""
|
||||
try:
|
||||
DEFAULT_REQUEST_COUNT.labels(
|
||||
method=method,
|
||||
endpoint=endpoint,
|
||||
status_code=status_code,
|
||||
service=self.service_name
|
||||
).inc()
|
||||
attributes = {
|
||||
"service": self.service_name,
|
||||
"http.method": method,
|
||||
"http.route": endpoint,
|
||||
"http.status_code": str(status_code)
|
||||
}
|
||||
|
||||
DEFAULT_REQUEST_DURATION.labels(
|
||||
method=method,
|
||||
endpoint=endpoint,
|
||||
service=self.service_name
|
||||
).observe(duration)
|
||||
self._counters["http_requests_total"].add(1, attributes)
|
||||
self._histograms["http_request_duration"].record(duration, attributes)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to record request metrics for {self.service_name}: {e}")
|
||||
|
||||
def set_active_connections(self, count: int):
|
||||
"""Set active database connections using default gauge."""
|
||||
def increment_active_requests(self):
|
||||
"""Increment active request counter"""
|
||||
try:
|
||||
DEFAULT_ACTIVE_CONNECTIONS.labels(service=self.service_name).set(count)
|
||||
self._up_down_counters["active_requests"].add(1, {"service": self.service_name})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set active connections for {self.service_name}: {e}")
|
||||
logger.error(f"Failed to increment active requests: {e}")
|
||||
|
||||
def get_metrics(self) -> str:
|
||||
"""Return Prometheus metrics in exposition format."""
|
||||
def decrement_active_requests(self):
|
||||
"""Decrement active request counter"""
|
||||
try:
|
||||
return generate_latest().decode('utf-8')
|
||||
self._up_down_counters["active_requests"].add(-1, {"service": self.service_name})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate metrics for {self.service_name}: {e}")
|
||||
return ""
|
||||
logger.error(f"Failed to decrement active requests: {e}")
|
||||
|
||||
def set_active_connections(self, count: int):
    """Set the number of active database connections.

    The value is reported through the "active_connections" gauge. That
    gauge is registered here on first use: ``set_gauge`` only logs an
    error and returns when the name is missing from
    ``_up_down_counters``, so without this step the value would be
    dropped unless a caller had registered the gauge beforehand.

    Args:
        count: Current number of active database connections.
    """
    # Check membership first so repeat calls do not trigger the
    # "already registered" warning in register_gauge.
    if "active_connections" not in self._up_down_counters:
        self.register_gauge("active_connections", "Active database connections")
    self.set_gauge("active_connections", count)
|
||||
|
||||
|
||||
def get_metrics_collector(service_name: str) -> Optional[MetricsCollector]:
|
||||
"""Get metrics collector by service name from global registry."""
|
||||
"""Get metrics collector by service name from global registry"""
|
||||
with _registry_lock:
|
||||
return _metrics_registry.get(service_name)
|
||||
|
||||
|
||||
def create_metrics_collector(service_name: str) -> MetricsCollector:
|
||||
def create_metrics_collector(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
) -> MetricsCollector:
|
||||
"""
|
||||
Create metrics collector without adding middleware.
|
||||
Create metrics collector.
|
||||
This should be called BEFORE app startup, not during lifespan.
|
||||
"""
|
||||
# Get existing or create new
|
||||
existing = get_metrics_collector(service_name)
|
||||
if existing:
|
||||
return existing
|
||||
|
||||
return MetricsCollector(service_name)
|
||||
|
||||
return MetricsCollector(service_name, service_version, meter_provider)
|
||||
|
||||
|
||||
def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
@@ -253,12 +270,14 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
"""
|
||||
@app.middleware("http")
|
||||
async def metrics_middleware(request: Request, call_next):
|
||||
# Increment active requests
|
||||
metrics_collector.increment_active_requests()
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
try:
|
||||
response = await call_next(request)
|
||||
duration = time.time() - start_time
|
||||
|
||||
|
||||
# Record request metrics
|
||||
metrics_collector.record_request(
|
||||
method=request.method,
|
||||
@@ -266,10 +285,14 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
status_code=response.status_code,
|
||||
duration=duration
|
||||
)
|
||||
|
||||
|
||||
# Decrement active requests
|
||||
metrics_collector.decrement_active_requests()
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
duration = time.time() - start_time
|
||||
|
||||
# Record failed request
|
||||
metrics_collector.record_request(
|
||||
method=request.method,
|
||||
@@ -277,61 +300,55 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
status_code=500,
|
||||
duration=duration
|
||||
)
|
||||
|
||||
# Decrement active requests
|
||||
metrics_collector.decrement_active_requests()
|
||||
raise
|
||||
|
||||
|
||||
return metrics_collector
|
||||
|
||||
|
||||
def add_metrics_endpoint(app, metrics_collector: MetricsCollector):
|
||||
"""Add metrics endpoint to app"""
|
||||
@app.get("/metrics")
|
||||
async def prometheus_metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
|
||||
|
||||
def setup_metrics_early(app, service_name: str = None) -> MetricsCollector:
|
||||
def setup_metrics_early(
|
||||
app,
|
||||
service_name: str = None,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
) -> MetricsCollector:
|
||||
"""
|
||||
Setup metrics collection BEFORE app startup.
|
||||
This must be called before adding any middleware or starting the app.
|
||||
|
||||
Note: No Prometheus endpoint is created - all metrics go to SigNoz via OTLP
|
||||
"""
|
||||
if service_name is None:
|
||||
service_name = getattr(app, 'title', 'unknown-service').lower().replace(' ', '-').replace('.', '_')
|
||||
|
||||
|
||||
# Create metrics collector
|
||||
metrics_collector = create_metrics_collector(service_name)
|
||||
|
||||
metrics_collector = create_metrics_collector(service_name, service_version, meter_provider)
|
||||
|
||||
# Add middleware (must be before app starts)
|
||||
add_metrics_middleware(app, metrics_collector)
|
||||
|
||||
# Add metrics endpoint
|
||||
add_metrics_endpoint(app, metrics_collector)
|
||||
|
||||
|
||||
# Store in app state for access from routes
|
||||
app.state.metrics_collector = metrics_collector
|
||||
|
||||
logger.info(f"Metrics setup completed for service: {service_name}")
|
||||
|
||||
logger.info(f"OpenTelemetry metrics setup completed for service: {service_name}")
|
||||
return metrics_collector
|
||||
|
||||
|
||||
# Additional helper function for endpoint tracking
|
||||
# Helper function for endpoint tracking (kept for backward compatibility)
|
||||
def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
|
||||
"""Decorator for tracking endpoint metrics - Fixed for async functions"""
|
||||
"""Decorator for tracking endpoint metrics - metrics handled by middleware"""
|
||||
def decorator(func):
|
||||
import asyncio
|
||||
from functools import wraps
|
||||
|
||||
@wraps(func)
|
||||
async def async_wrapper(*args, **kwargs):
|
||||
# For now, just pass through - metrics are handled by middleware
|
||||
return await func(*args, **kwargs)
|
||||
|
||||
@wraps(func)
|
||||
def sync_wrapper(*args, **kwargs):
|
||||
# For now, just pass through - metrics are handled by middleware
|
||||
return func(*args, **kwargs)
|
||||
|
||||
# Return appropriate wrapper based on function type
|
||||
@@ -340,4 +357,3 @@ def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
|
||||
else:
|
||||
return sync_wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
Reference in New Issue
Block a user