Update monitoring packages to latest versions

- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1

- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions
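
For reference, the unified pins as a requirements-style fragment (illustrative; per-service file layout may differ):

opentelemetry-api==1.39.1
opentelemetry-sdk==1.39.1
opentelemetry-exporter-otlp-proto-grpc==1.39.1
opentelemetry-exporter-otlp-proto-http==1.39.1
opentelemetry-instrumentation-fastapi==0.60b1
opentelemetry-instrumentation-httpx==0.60b1
opentelemetry-instrumentation-redis==0.60b1
opentelemetry-instrumentation-sqlalchemy==0.60b1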

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
Urtzi Alfaro
2026-01-08 19:25:52 +01:00
parent dfb7e4b237
commit 29d19087f1
129 changed files with 5718 additions and 1821 deletions


@@ -10,6 +10,22 @@ from .health_checks import (
create_health_manager,
setup_fastapi_health_checks
)
from .logs_exporter import (
setup_otel_logging,
add_log_context,
get_current_trace_context,
StructlogOTELProcessor
)
from .metrics_exporter import (
setup_otel_metrics,
OTelMetricsCollector,
create_dual_metrics_collector
)
from .system_metrics import (
SystemMetricsCollector,
ApplicationMetricsCollector,
setup_all_metrics
)
__all__ = [
'setup_logging',
@@ -19,5 +35,15 @@ __all__ = [
'HealthCheckManager',
'FastAPIHealthChecker',
'create_health_manager',
'setup_fastapi_health_checks',
'setup_otel_logging',
'add_log_context',
'get_current_trace_context',
'StructlogOTELProcessor',
'setup_otel_metrics',
'OTelMetricsCollector',
'create_dual_metrics_collector',
'SystemMetricsCollector',
'ApplicationMetricsCollector',
'setup_all_metrics'
]
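
With these re-exports in place, services can import the new helpers from the package root (a sketch; assumes the package is importable as shared.monitoring, matching the module docstrings below):

from shared.monitoring import (
    setup_otel_logging,
    setup_otel_metrics,
    setup_all_metrics,
)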


@@ -0,0 +1,220 @@
"""
OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via OpenTelemetry Collector
"""
import os
import logging
import structlog
from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
try:
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
except ImportError:
try:
from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
except ImportError:
OTLPLogExporter = None
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
logger = structlog.get_logger()
def setup_otel_logging(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
enable_console: bool = True
) -> Optional[LoggingHandler]:
"""
Setup OpenTelemetry logging to export logs to SigNoz.
This integrates with Python's standard logging to automatically
export all log records to SigNoz via the OTLP protocol.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
enable_console: Whether to also log to console (default: True)
Returns:
LoggingHandler instance if successful, None otherwise
Example:
from shared.monitoring.logs_exporter import setup_otel_logging
# Setup during service initialization
setup_otel_logging("auth-service", "1.0.0")
# Now all standard logging calls will be exported to SigNoz
import logging
logger = logging.getLogger(__name__)
logger.info("This will appear in SigNoz!")
"""
# Check if logging export is enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
logger.info(
"OpenTelemetry logs export disabled",
service=service_name,
reason="OTEL_LOGS_EXPORTER not set to 'otlp'"
)
return None
# Get OTLP endpoint from environment or parameter
if otel_endpoint is None:
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318")
)
# Ensure endpoint has /v1/logs path for HTTP
if not otel_endpoint.endswith("/v1/logs"):
otel_endpoint = f"{otel_endpoint}/v1/logs"
try:
# Check if OTLPLogExporter is available
if OTLPLogExporter is None:
logger.warning(
"OpenTelemetry HTTP OTLP exporter not available",
service=service_name,
reason="opentelemetry-exporter-otlp-proto-http package not installed"
)
return None
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Configure logger provider
logger_provider = LoggerProvider(resource=resource)
set_logger_provider(logger_provider)
# Configure OTLP exporter for logs
otlp_exporter = OTLPLogExporter(
endpoint=otel_endpoint,
timeout=10
)
# Add log record processor with batching
log_processor = BatchLogRecordProcessor(otlp_exporter)
logger_provider.add_log_record_processor(log_processor)
# Create logging handler that bridges standard logging to OpenTelemetry
otel_handler = LoggingHandler(
level=logging.NOTSET, # Capture all levels
logger_provider=logger_provider
)
# Add handler to root logger
root_logger = logging.getLogger()
root_logger.addHandler(otel_handler)
logger.info(
"OpenTelemetry logs export configured",
service=service_name,
otel_endpoint=otel_endpoint,
console_logging=enable_console
)
return otel_handler
except Exception as e:
logger.error(
"Failed to setup OpenTelemetry logs export",
service=service_name,
error=str(e),
reason="Will continue with standard logging only"
)
return None
def add_log_context(**context):
"""
Add contextual information to logs that will be sent to SigNoz.
This is useful for adding request IDs, user IDs, tenant IDs, etc.
that help with filtering and correlation in SigNoz.
Args:
**context: Key-value pairs to add to log context
Example:
from shared.monitoring.logs_exporter import add_log_context
# Add context for current request
add_log_context(
request_id="req_123",
user_id="user_456",
tenant_id="tenant_789"
)
# Now all logs will include this context
logger.info("Processing order") # Will include request_id, user_id, tenant_id
"""
# Bind into structlog's context-local storage so that subsequent log calls
# pick up these key-value pairs. Requires structlog.contextvars.merge_contextvars
# in the configured processor chain.
structlog.contextvars.bind_contextvars(**context)
def get_current_trace_context() -> dict:
"""
Get current trace context for log correlation.
Returns a dict with trace_id and span_id if available,
which can be added to log records for correlation with traces.
Returns:
Dict with trace_id and span_id, or empty dict if no active trace
Example:
from shared.monitoring.logs_exporter import get_current_trace_context
# Get trace context and add to logs
trace_ctx = get_current_trace_context()
logger.info("Processing request", **trace_ctx)
"""
from opentelemetry import trace
span = trace.get_current_span()
if span and span.get_span_context().is_valid:
return {
"trace_id": format(span.get_span_context().trace_id, '032x'),
"span_id": format(span.get_span_context().span_id, '016x'),
}
return {}
class StructlogOTELProcessor:
"""
Structlog processor that adds OpenTelemetry trace context to logs.
This automatically adds trace_id and span_id to all log records,
enabling correlation between logs and traces in SigNoz.
Usage:
import structlog
from shared.monitoring.logs_exporter import StructlogOTELProcessor
structlog.configure(
processors=[
StructlogOTELProcessor(),
# ... other processors
]
)
"""
def __call__(self, logger, method_name, event_dict):
"""Add trace context to log event"""
trace_ctx = get_current_trace_context()
if trace_ctx:
event_dict.update(trace_ctx)
return event_dict
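
A minimal structlog configuration that wires these pieces together (a sketch; the processor order and JSON renderer are assumptions, not part of this commit):

import structlog
from shared.monitoring.logs_exporter import StructlogOTELProcessor

structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,  # picks up add_log_context() bindings
        StructlogOTELProcessor(),                 # adds trace_id / span_id to each event
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)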


@@ -1,79 +1,101 @@
# ================================================================
# shared/monitoring/metrics.py - FIXED VERSION
# ================================================================
"""
OpenTelemetry Metrics Collection for Microservices
Replaces Prometheus with native OpenTelemetry metrics export to SigNoz
"""
import time
import structlog
from typing import Dict, Any, Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from fastapi import Request
from threading import Lock
import os
logger = structlog.get_logger()
# Global registry for metrics collectors
_metrics_registry: Dict[str, 'MetricsCollector'] = {}
_registry_lock = Lock()
class MetricsCollector:
"""Thread-safe metrics collector for microservices"""
"""
OpenTelemetry-based metrics collector for microservices.
Exports metrics directly to SigNoz via OTLP (no Prometheus).
"""
def __init__(
self,
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
):
self.service_name = service_name
self.service_version = service_version
self.start_time = time.time()
# Use provided meter provider or get global
if meter_provider:
self.meter = meter_provider.get_meter(__name__)
else:
self.meter = metrics.get_meter(__name__)
# Store created instruments
self._counters: Dict[str, Any] = {}
self._histograms: Dict[str, Any] = {}
self._up_down_counters: Dict[str, Any] = {}
self._lock = Lock()
# Register in global registry
with _registry_lock:
_metrics_registry[service_name] = self
# Create default HTTP metrics
self._setup_default_metrics()
logger.info(
"OpenTelemetry metrics collector initialized",
service=service_name
)
def _setup_default_metrics(self):
"""Setup default HTTP metrics"""
self._counters["http_requests_total"] = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_http_requests_total",
description="Total HTTP requests",
unit="requests"
)
self._histograms["http_request_duration"] = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_http_request_duration_seconds",
description="HTTP request duration in seconds",
unit="s"
)
self._up_down_counters["active_requests"] = self.meter.create_up_down_counter(
name=f"{self.service_name.replace('-', '_')}_active_requests",
description="Number of active HTTP requests",
unit="requests"
)
def register_counter(self, name: str, documentation: str, labels: list = None) -> Any:
"""Register a custom Counter metric"""
with self._lock:
if name in self._counters:
logger.warning(f"Counter '{name}' already registered for {self.service_name}")
return self._counters[name]
try:
counter = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._counters[name] = counter
logger.info(f"Registered counter: {name} for {self.service_name}")
return counter
except Exception as e:
logger.error(f"Failed to register counter {name} for {self.service_name}: {e}")
raise
def register_histogram(
self,
name: str,
documentation: str,
labels: list = None,
buckets: tuple = None
) -> Any:
"""Register a custom Histogram metric"""
with self._lock:
if name in self._histograms:
logger.warning(f"Histogram '{name}' already registered for {self.service_name}")
return self._histograms[name]
try:
histogram = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._histograms[name] = histogram
logger.info(f"Registered histogram: {name} for {self.service_name}")
return histogram
except Exception as e:
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
raise
def register_gauge(self, name: str, documentation: str, labels: list = None) -> Any:
"""Register a custom Gauge metric (using UpDownCounter)"""
with self._lock:
if name in self._up_down_counters:
logger.warning(f"Gauge '{name}' already registered for {self.service_name}")
return self._up_down_counters[name]
try:
gauge = self.meter.create_up_down_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=documentation,
unit="1"
)
self._up_down_counters[name] = gauge
logger.info(f"Registered gauge: {name} for {self.service_name}")
return gauge
except Exception as e:
logger.error(f"Failed to register gauge {name} for {self.service_name}: {e}")
raise
def increment_counter(self, name: str, value: int = 1, labels: Dict[str, str] = None):
"""Increment a counter metric."""
"""Increment a counter metric"""
if name not in self._counters:
logger.error(f"Counter '{name}' not registered for {self.service_name}. Cannot increment.")
logger.error(f"Counter '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._counters[name].add(value, labels)
except Exception as e:
logger.error(f"Failed to increment counter {name} for {self.service_name}: {e}")
def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
"""Observe a histogram metric."""
"""Observe a histogram metric"""
if name not in self._histograms:
logger.error(f"Histogram '{name}' not registered for {self.service_name}. Cannot observe.")
logger.error(f"Histogram '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
self._histograms[name].record(value, labels)
except Exception as e:
logger.error(f"Failed to observe histogram {name} for {self.service_name}: {e}")
def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
"""Set a gauge metric."""
if name not in self._gauges:
logger.error(f"Gauge '{name}' not registered for {self.service_name}. Cannot set.")
"""Set a gauge metric (using add for UpDownCounter)"""
if name not in self._up_down_counters:
logger.error(f"Gauge '{name}' not registered for {self.service_name}")
return
if labels is None:
labels = {"service": self.service_name}
elif "service" not in labels:
labels["service"] = self.service_name
try:
# For UpDownCounter, we need to track the delta
# Store current value and calculate delta
key = f"{name}_{str(sorted(labels.items()))}"
if not hasattr(self, '_gauge_values'):
self._gauge_values = {}
old_value = self._gauge_values.get(key, 0)
delta = value - old_value
self._gauge_values[key] = value
self._up_down_counters[name].add(delta, labels)
except Exception as e:
logger.error(f"Failed to set gauge {name} for {self.service_name}: {e}")
def record_request(self, method: str, endpoint: str, status_code: int, duration: float):
"""Record HTTP request metrics using default metrics."""
"""Record HTTP request metrics"""
try:
attributes = {
"service": self.service_name,
"http.method": method,
"http.route": endpoint,
"http.status_code": str(status_code)
}
self._counters["http_requests_total"].add(1, attributes)
self._histograms["http_request_duration"].record(duration, attributes)
except Exception as e:
logger.error(f"Failed to record request metrics for {self.service_name}: {e}")
def increment_active_requests(self):
"""Increment active request counter"""
try:
self._up_down_counters["active_requests"].add(1, {"service": self.service_name})
except Exception as e:
logger.error(f"Failed to set active connections for {self.service_name}: {e}")
logger.error(f"Failed to increment active requests: {e}")
def decrement_active_requests(self):
"""Decrement active request counter"""
try:
self._up_down_counters["active_requests"].add(-1, {"service": self.service_name})
except Exception as e:
logger.error(f"Failed to generate metrics for {self.service_name}: {e}")
return ""
logger.error(f"Failed to decrement active requests: {e}")
def set_active_connections(self, count: int):
"""Set active database connections"""
self.set_gauge("active_connections", count)
def get_metrics_collector(service_name: str) -> Optional[MetricsCollector]:
"""Get metrics collector by service name from global registry."""
"""Get metrics collector by service name from global registry"""
with _registry_lock:
return _metrics_registry.get(service_name)
def create_metrics_collector(
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
"""
Create metrics collector.
This should be called BEFORE app startup, not during lifespan.
"""
# Get existing or create new
existing = get_metrics_collector(service_name)
if existing:
return existing
return MetricsCollector(service_name, service_version, meter_provider)
def add_metrics_middleware(app, metrics_collector: MetricsCollector):
"""
Add metrics collection middleware to the app. Must be registered before app startup.
"""
@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
# Increment active requests
metrics_collector.increment_active_requests()
start_time = time.time()
try:
response = await call_next(request)
duration = time.time() - start_time
# Record request metrics
metrics_collector.record_request(
method=request.method,
endpoint=request.url.path,
status_code=response.status_code,
duration=duration
)
# Decrement active requests
metrics_collector.decrement_active_requests()
return response
except Exception as e:
duration = time.time() - start_time
# Record failed request
metrics_collector.record_request(
method=request.method,
endpoint=request.url.path,
status_code=500,
duration=duration
)
# Decrement active requests
metrics_collector.decrement_active_requests()
raise
return metrics_collector
def setup_metrics_early(
app,
service_name: str = None,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
) -> MetricsCollector:
"""
Setup metrics collection BEFORE app startup.
This must be called before adding any middleware or starting the app.
Note: No Prometheus endpoint is created - all metrics go to SigNoz via OTLP
"""
if service_name is None:
service_name = getattr(app, 'title', 'unknown-service').lower().replace(' ', '-').replace('.', '_')
# Create metrics collector
metrics_collector = create_metrics_collector(service_name, service_version, meter_provider)
# Add middleware (must be before app starts)
add_metrics_middleware(app, metrics_collector)
# Store in app state for access from routes
app.state.metrics_collector = metrics_collector
logger.info(f"Metrics setup completed for service: {service_name}")
logger.info(f"OpenTelemetry metrics setup completed for service: {service_name}")
return metrics_collector
# Helper function for endpoint tracking (kept for backward compatibility)
def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
"""Decorator for tracking endpoint metrics - Fixed for async functions"""
"""Decorator for tracking endpoint metrics - metrics handled by middleware"""
def decorator(func):
import asyncio
from functools import wraps
@wraps(func)
async def async_wrapper(*args, **kwargs):
# For now, just pass through - metrics are handled by middleware
return await func(*args, **kwargs)
@wraps(func)
def sync_wrapper(*args, **kwargs):
# For now, just pass through - metrics are handled by middleware
return func(*args, **kwargs)
# Return appropriate wrapper based on function type
if asyncio.iscoroutinefunction(func):
return async_wrapper
else:
return sync_wrapper
return decorator
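
Typical wiring during service startup (a sketch; the service name is illustrative, and setup_otel_metrics comes from metrics_exporter below and may return None when export is disabled):

from fastapi import FastAPI
from shared.monitoring.metrics_exporter import setup_otel_metrics
from shared.monitoring.metrics import setup_metrics_early

app = FastAPI(title="Auth Service")

# Push OTLP metrics to SigNoz and reuse the provider for the collector
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
collector = setup_metrics_early(app, "auth-service", "1.0.0", meter_provider)

# Custom metrics
collector.register_counter("orders_created_total", "Orders created")
collector.increment_counter("orders_created_total")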


@@ -0,0 +1,250 @@
"""
OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
"""
import os
import structlog
from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
logger = structlog.get_logger()
def setup_otel_metrics(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
export_interval_millis: int = 60000 # Export every 60 seconds
) -> Optional[MeterProvider]:
"""
Setup OpenTelemetry metrics to export to SigNoz.
This provides the OTLP push leg of the dual-export strategy:
- OTLP push to the SigNoz collector (this function)
- Prometheus exposition at /metrics (via the existing Prometheus collector; see create_dual_metrics_collector)
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
export_interval_millis: How often to push metrics (default 60s)
Returns:
MeterProvider instance if successful, None otherwise
Example:
from shared.monitoring.metrics_exporter import setup_otel_metrics
# Setup during service initialization
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
# Create meters for your metrics
meter = meter_provider.get_meter(__name__)
request_counter = meter.create_counter(
"http.server.requests",
description="Total HTTP requests",
unit="1"
)
# Record metrics
request_counter.add(1, {"method": "GET", "status": "200"})
"""
# Check if metrics export is enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if not enable_otel_metrics:
logger.info(
"OpenTelemetry metrics export disabled",
service=service_name,
reason="ENABLE_OTEL_METRICS not set to 'true'"
)
return None
# Get OTLP endpoint from environment or parameter
if otel_endpoint is None:
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318")
)
# Ensure endpoint has /v1/metrics path for HTTP
if not otel_endpoint.endswith("/v1/metrics"):
otel_endpoint = f"{otel_endpoint}/v1/metrics"
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Configure OTLP exporter for metrics
otlp_exporter = OTLPMetricExporter(
endpoint=otel_endpoint,
timeout=10
)
# Create periodic metric reader
metric_reader = PeriodicExportingMetricReader(
exporter=otlp_exporter,
export_interval_millis=export_interval_millis
)
# Configure meter provider
meter_provider = MeterProvider(
resource=resource,
metric_readers=[metric_reader]
)
# Set global meter provider
metrics.set_meter_provider(meter_provider)
logger.info(
"OpenTelemetry metrics export configured",
service=service_name,
otel_endpoint=otel_endpoint,
export_interval_seconds=export_interval_millis / 1000
)
return meter_provider
except Exception as e:
logger.error(
"Failed to setup OpenTelemetry metrics export",
service=service_name,
error=str(e),
reason="Will continue with Prometheus-only metrics"
)
return None
class OTelMetricsCollector:
"""
Wrapper for OpenTelemetry metrics that provides a similar interface
to the Prometheus MetricsCollector.
This allows services to emit metrics that go to both Prometheus and SigNoz.
"""
def __init__(self, service_name: str, meter_provider: MeterProvider):
self.service_name = service_name
self.meter_provider = meter_provider
self.meter = meter_provider.get_meter(__name__)
# Store created instruments
self._counters = {}
self._histograms = {}
self._gauges = {}
def create_counter(self, name: str, description: str = "", unit: str = "1"):
"""Create or get an OpenTelemetry Counter"""
if name not in self._counters:
self._counters[name] = self.meter.create_counter(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=description,
unit=unit
)
return self._counters[name]
def create_histogram(self, name: str, description: str = "", unit: str = "1"):
"""Create or get an OpenTelemetry Histogram"""
if name not in self._histograms:
self._histograms[name] = self.meter.create_histogram(
name=f"{self.service_name.replace('-', '_')}_{name}",
description=description,
unit=unit
)
return self._histograms[name]
def create_gauge(self, name: str, description: str = "", unit: str = "1"):
"""
Create or get an OpenTelemetry observable gauge.
Note: Gauges in OTEL require a callback function.
"""
if name not in self._gauges:
# Store gauge reference for callback registration
self._gauges[name] = {
"name": f"{self.service_name.replace('-', '_')}_{name}",
"description": description,
"unit": unit,
"value": 0,
"attributes": {}
}
return self._gauges[name]
def increment_counter(self, name: str, value: int = 1, attributes: dict = None):
"""Increment a counter with optional attributes"""
if name in self._counters:
if attributes is None:
attributes = {"service": self.service_name}
elif "service" not in attributes:
attributes["service"] = self.service_name
self._counters[name].add(value, attributes)
def observe_histogram(self, name: str, value: float, attributes: dict = None):
"""Record a histogram observation with optional attributes"""
if name in self._histograms:
if attributes is None:
attributes = {"service": self.service_name}
elif "service" not in attributes:
attributes["service"] = self.service_name
self._histograms[name].record(value, attributes)
def set_gauge(self, name: str, value: float, attributes: dict = None):
"""Set a gauge value (stores for next callback)"""
if name in self._gauges:
if attributes is None:
attributes = {"service": self.service_name}
elif "service" not in attributes:
attributes["service"] = self.service_name
self._gauges[name]["value"] = value
self._gauges[name]["attributes"] = attributes
def create_dual_metrics_collector(service_name: str, service_version: str = "1.0.0"):
"""
Create a metrics collector that exports to both Prometheus and SigNoz.
This function sets up both collection strategies:
1. Prometheus client library (for /metrics endpoint scraping)
2. OpenTelemetry metrics (for OTLP push to SigNoz)
Returns a tuple: (prometheus_collector, otel_collector)
Both collectors can be used independently or together.
Example:
from shared.monitoring.metrics_exporter import create_dual_metrics_collector
prom_collector, otel_collector = create_dual_metrics_collector("auth-service")
# Prometheus counter
prom_collector.register_counter("requests_total", "Total requests")
prom_collector.increment_counter("requests_total", labels={"status": "200"})
# OpenTelemetry counter (pushed to SigNoz)
counter = otel_collector.create_counter("requests_total", "Total requests")
counter.add(1, {"status": "200"})
"""
from shared.monitoring.metrics import MetricsCollector
# Create Prometheus collector
prom_collector = MetricsCollector(service_name)
# Create OpenTelemetry collector
meter_provider = setup_otel_metrics(service_name, service_version)
otel_collector = None
if meter_provider:
otel_collector = OTelMetricsCollector(service_name, meter_provider)
return prom_collector, otel_collector
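
The gauge helpers above only stage values; nothing in this commit registers an observable gauge to report them. One way to close the loop (a sketch, not part of this commit) is to register a callback that reads the staged value at each collection:

from opentelemetry.metrics import Observation

def register_gauge_callback(otel_collector, name: str):
    """Expose a staged gauge value via an observable gauge callback."""
    staged = otel_collector.create_gauge(name)  # creates the entry if absent

    def _observe(options):
        # Called by the SDK on every export cycle
        yield Observation(staged["value"], staged["attributes"])

    otel_collector.meter.create_observable_gauge(
        name=staged["name"],
        description=staged["description"],
        unit=staged["unit"],
        callbacks=[_observe],
    )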


@@ -0,0 +1,433 @@
"""
System Metrics Collection for SigNoz
Collects CPU, memory, disk, and process metrics via OpenTelemetry
"""
import os
import psutil
import structlog
from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
logger = structlog.get_logger()
class SystemMetricsCollector:
"""
Collects system-level metrics (CPU, memory, disk, network, process info)
and exports them to SigNoz via OpenTelemetry.
These metrics help monitor service health and resource utilization.
"""
def __init__(
self,
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
):
self.service_name = service_name
self.service_version = service_version
self.process = psutil.Process()
# Use provided meter provider or get global
if meter_provider:
self.meter = meter_provider.get_meter(__name__)
else:
self.meter = metrics.get_meter(__name__)
# Initialize metric instruments
self._setup_metrics()
logger.info(
"System metrics collector initialized",
service=service_name,
pid=os.getpid()
)
def _setup_metrics(self):
"""Setup all system metric instruments"""
# Process CPU metrics
self.process_cpu_percent = self.meter.create_observable_gauge(
name="process.cpu.utilization",
description="Process CPU utilization percentage",
unit="percent",
callbacks=[self._observe_process_cpu]
)
# Process memory metrics
self.process_memory_usage = self.meter.create_observable_gauge(
name="process.memory.usage",
description="Process memory usage in bytes",
unit="bytes",
callbacks=[self._observe_process_memory]
)
self.process_memory_percent = self.meter.create_observable_gauge(
name="process.memory.utilization",
description="Process memory utilization percentage",
unit="percent",
callbacks=[self._observe_process_memory_percent]
)
# Process thread count
self.process_threads = self.meter.create_observable_gauge(
name="process.threads.count",
description="Number of threads in the process",
unit="threads",
callbacks=[self._observe_process_threads]
)
# Process file descriptors (Unix only)
if hasattr(self.process, 'num_fds'):
self.process_fds = self.meter.create_observable_gauge(
name="process.open_file_descriptors",
description="Number of open file descriptors",
unit="fds",
callbacks=[self._observe_process_fds]
)
# System-wide CPU metrics
self.system_cpu_percent = self.meter.create_observable_gauge(
name="system.cpu.utilization",
description="System-wide CPU utilization percentage",
unit="percent",
callbacks=[self._observe_system_cpu]
)
# System-wide memory metrics
self.system_memory_usage = self.meter.create_observable_gauge(
name="system.memory.usage",
description="System memory usage in bytes",
unit="bytes",
callbacks=[self._observe_system_memory]
)
self.system_memory_percent = self.meter.create_observable_gauge(
name="system.memory.utilization",
description="System memory utilization percentage",
unit="percent",
callbacks=[self._observe_system_memory_percent]
)
# Disk I/O metrics
self.disk_io_read = self.meter.create_observable_counter(
name="system.disk.io.read",
description="Disk bytes read",
unit="bytes",
callbacks=[self._observe_disk_io_read]
)
self.disk_io_write = self.meter.create_observable_counter(
name="system.disk.io.write",
description="Disk bytes written",
unit="bytes",
callbacks=[self._observe_disk_io_write]
)
# Network I/O metrics
self.network_io_sent = self.meter.create_observable_counter(
name="system.network.io.sent",
description="Network bytes sent",
unit="bytes",
callbacks=[self._observe_network_io_sent]
)
self.network_io_recv = self.meter.create_observable_counter(
name="system.network.io.received",
description="Network bytes received",
unit="bytes",
callbacks=[self._observe_network_io_recv]
)
# Callback methods for observable instruments
def _observe_process_cpu(self, options):
"""Observe process CPU usage"""
try:
cpu_percent = self.process.cpu_percent(interval=None)
yield metrics.Observation(
cpu_percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process CPU metrics: {e}")
def _observe_process_memory(self, options):
"""Observe process memory usage"""
try:
mem_info = self.process.memory_info()
yield metrics.Observation(
mem_info.rss, # Resident Set Size
{"service": self.service_name, "type": "rss"}
)
yield metrics.Observation(
mem_info.vms, # Virtual Memory Size
{"service": self.service_name, "type": "vms"}
)
except Exception as e:
logger.warning(f"Failed to collect process memory metrics: {e}")
def _observe_process_memory_percent(self, options):
"""Observe process memory percentage"""
try:
mem_percent = self.process.memory_percent()
yield metrics.Observation(
mem_percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process memory percent: {e}")
def _observe_process_threads(self, options):
"""Observe process thread count"""
try:
num_threads = self.process.num_threads()
yield metrics.Observation(
num_threads,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process thread count: {e}")
def _observe_process_fds(self, options):
"""Observe process file descriptors (Unix only)"""
try:
num_fds = self.process.num_fds()
yield metrics.Observation(
num_fds,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect process FDs: {e}")
def _observe_system_cpu(self, options):
"""Observe system-wide CPU usage"""
try:
cpu_percent = psutil.cpu_percent(interval=None)
yield metrics.Observation(
cpu_percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect system CPU metrics: {e}")
def _observe_system_memory(self, options):
"""Observe system memory usage"""
try:
mem = psutil.virtual_memory()
yield metrics.Observation(
mem.used,
{"service": self.service_name, "type": "used"}
)
yield metrics.Observation(
mem.available,
{"service": self.service_name, "type": "available"}
)
yield metrics.Observation(
mem.total,
{"service": self.service_name, "type": "total"}
)
except Exception as e:
logger.warning(f"Failed to collect system memory metrics: {e}")
def _observe_system_memory_percent(self, options):
"""Observe system memory percentage"""
try:
mem = psutil.virtual_memory()
yield metrics.Observation(
mem.percent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect system memory percent: {e}")
def _observe_disk_io_read(self, options):
"""Observe disk I/O read bytes"""
try:
disk_io = psutil.disk_io_counters()
if disk_io:
yield metrics.Observation(
disk_io.read_bytes,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect disk I/O read metrics: {e}")
def _observe_disk_io_write(self, options):
"""Observe disk I/O write bytes"""
try:
disk_io = psutil.disk_io_counters()
if disk_io:
yield metrics.Observation(
disk_io.write_bytes,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect disk I/O write metrics: {e}")
def _observe_network_io_sent(self, options):
"""Observe network bytes sent"""
try:
net_io = psutil.net_io_counters()
yield metrics.Observation(
net_io.bytes_sent,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect network sent metrics: {e}")
def _observe_network_io_recv(self, options):
"""Observe network bytes received"""
try:
net_io = psutil.net_io_counters()
yield metrics.Observation(
net_io.bytes_recv,
{"service": self.service_name}
)
except Exception as e:
logger.warning(f"Failed to collect network recv metrics: {e}")
class ApplicationMetricsCollector:
"""
Collects application-level metrics (HTTP requests, database connections, etc.)
using OpenTelemetry metrics API only (no Prometheus).
"""
def __init__(
self,
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
):
self.service_name = service_name
self.service_version = service_version
# Use provided meter provider or get global
if meter_provider:
self.meter = meter_provider.get_meter(__name__)
else:
self.meter = metrics.get_meter(__name__)
# HTTP metrics
self.http_requests = self.meter.create_counter(
name="http.server.requests",
description="Total HTTP requests",
unit="requests"
)
self.http_request_duration = self.meter.create_histogram(
name="http.server.request.duration",
description="HTTP request duration",
unit="ms"
)
self.http_active_requests = self.meter.create_up_down_counter(
name="http.server.active_requests",
description="Active HTTP requests",
unit="requests"
)
# Database metrics
self.db_connections = self.meter.create_up_down_counter(
name="db.client.connections.usage",
description="Database connections in use",
unit="connections"
)
self.db_query_duration = self.meter.create_histogram(
name="db.client.operation.duration",
description="Database query duration",
unit="ms"
)
logger.info(
"Application metrics collector initialized",
service=service_name
)
def record_http_request(
self,
method: str,
endpoint: str,
status_code: int,
duration_ms: float
):
"""Record an HTTP request"""
attributes = {
"service": self.service_name,
"http.method": method,
"http.route": endpoint,
"http.status_code": status_code
}
self.http_requests.add(1, attributes)
self.http_request_duration.record(duration_ms, attributes)
def increment_active_requests(self):
"""Increment active request count"""
self.http_active_requests.add(1, {"service": self.service_name})
def decrement_active_requests(self):
"""Decrement active request count"""
self.http_active_requests.add(-1, {"service": self.service_name})
def set_db_connections(self, count: int, state: str = "used"):
"""Set database connection count"""
self.db_connections.add(
count,
{"service": self.service_name, "state": state}
)
def record_db_query(self, operation: str, duration_ms: float, table: str = ""):
"""Record a database query"""
attributes = {
"service": self.service_name,
"db.operation": operation
}
if table:
attributes["db.table"] = table
self.db_query_duration.record(duration_ms, attributes)
def setup_all_metrics(
service_name: str,
service_version: str = "1.0.0",
meter_provider: Optional[MeterProvider] = None
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
"""
Setup both system and application metrics collection.
Args:
service_name: Name of the service
service_version: Version of the service
meter_provider: Optional meter provider (will use global if not provided)
Returns:
Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)
Example:
from shared.monitoring.system_metrics import setup_all_metrics
system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")
# Metrics are automatically collected
# Use app_metrics to record custom application events:
app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
"""
system_metrics = SystemMetricsCollector(service_name, service_version, meter_provider)
app_metrics = ApplicationMetricsCollector(service_name, service_version, meter_provider)
logger.info(
"All metrics collectors initialized",
service=service_name,
collectors=["system", "application"]
)
return system_metrics, app_metrics
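
End-to-end initialization might look like this (a sketch; the service name is illustrative, and setup_otel_metrics may return None when export is disabled, in which case the global meter provider is used):

from shared.monitoring.metrics_exporter import setup_otel_metrics
from shared.monitoring.system_metrics import setup_all_metrics

meter_provider = setup_otel_metrics("auth-service", "1.0.0")
system_metrics, app_metrics = setup_all_metrics(
    "auth-service", "1.0.0", meter_provider
)

# System metrics are observed automatically via callbacks;
# record application events explicitly:
app_metrics.record_http_request("GET", "/api/users", 200, 45.2)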


@@ -22,7 +22,7 @@ def setup_tracing(
app,
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: str = "http://signoz-otel-collector.signoz:4318"
):
"""
Setup OpenTelemetry distributed tracing for a FastAPI service.
@@ -37,7 +37,7 @@ def setup_tracing(
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
Example:
from shared.monitoring.tracing import setup_tracing
@@ -58,9 +58,9 @@ def setup_tracing(
tracer_provider = TracerProvider(resource=resource)
trace.set_tracer_provider(tracer_provider)
# Configure OTLP exporter to send to SigNoz
otlp_exporter = OTLPSpanExporter(
endpoint=otel_endpoint,
insecure=True # Use TLS in production
)
@@ -100,7 +100,7 @@ def setup_tracing(
logger.info(
"Distributed tracing configured",
service=service_name,
otel_endpoint=otel_endpoint
)
except Exception as e: