Update monitoring packages to latest versions

- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1
- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
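For context, a minimal sketch of how a service might wire the unified OTel-only stack introduced by this change; the service name, version, and environment flag below are illustrative placeholders, and the imports assume the shared.monitoring layout added in this commit.

# Sketch (not part of the diff): bootstrap logs, metrics, and system metrics via OTLP only.
import os
from fastapi import FastAPI

from shared.monitoring.logs_exporter import setup_otel_logging
from shared.monitoring.metrics_exporter import setup_otel_metrics
from shared.monitoring.system_metrics import setup_all_metrics

app = FastAPI(title="auth-service")

os.environ.setdefault("OTEL_LOGS_EXPORTER", "otlp")           # log export is opt-in
setup_otel_logging("auth-service", "1.0.0")                   # logs -> SigNoz via OTLP
meter_provider = setup_otel_metrics("auth-service", "1.0.0")  # metrics -> SigNoz via OTLP
if meter_provider is not None:
    system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0", meter_provider)
    # prometheus-client is no longer needed; nothing is scraped from /metrics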
@@ -10,6 +10,22 @@ from .health_checks import (
    create_health_manager,
    setup_fastapi_health_checks
)
from .logs_exporter import (
    setup_otel_logging,
    add_log_context,
    get_current_trace_context,
    StructlogOTELProcessor
)
from .metrics_exporter import (
    setup_otel_metrics,
    OTelMetricsCollector,
    create_dual_metrics_collector
)
from .system_metrics import (
    SystemMetricsCollector,
    ApplicationMetricsCollector,
    setup_all_metrics
)

__all__ = [
    'setup_logging',
@@ -19,5 +35,15 @@ __all__ = [
    'HealthCheckManager',
    'FastAPIHealthChecker',
    'create_health_manager',
    'setup_fastapi_health_checks'
    'setup_fastapi_health_checks',
    'setup_otel_logging',
    'add_log_context',
    'get_current_trace_context',
    'StructlogOTELProcessor',
    'setup_otel_metrics',
    'OTelMetricsCollector',
    'create_dual_metrics_collector',
    'SystemMetricsCollector',
    'ApplicationMetricsCollector',
    'setup_all_metrics'
]
shared/monitoring/logs_exporter.py (new file, 220 lines)
@@ -0,0 +1,220 @@
|
||||
"""
|
||||
OpenTelemetry Logs Integration for SigNoz
|
||||
Exports structured logs to SigNoz via OpenTelemetry Collector
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import structlog
|
||||
from typing import Optional
|
||||
from opentelemetry._logs import set_logger_provider
|
||||
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
|
||||
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
|
||||
except ImportError:
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
|
||||
except ImportError:
|
||||
OTLPLogExporter = None
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def setup_otel_logging(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
otel_endpoint: Optional[str] = None,
|
||||
enable_console: bool = True
|
||||
) -> Optional[LoggingHandler]:
|
||||
"""
|
||||
Setup OpenTelemetry logging to export logs to SigNoz.
|
||||
|
||||
This integrates with Python's standard logging to automatically
|
||||
export all log records to SigNoz via the OTLP protocol.
|
||||
|
||||
Args:
|
||||
service_name: Name of the service (e.g., "auth-service")
|
||||
service_version: Version of the service
|
||||
otel_endpoint: OpenTelemetry collector endpoint (default from env)
|
||||
enable_console: Whether to also log to console (default: True)
|
||||
|
||||
Returns:
|
||||
LoggingHandler instance if successful, None otherwise
|
||||
|
||||
Example:
|
||||
from shared.monitoring.logs_exporter import setup_otel_logging
|
||||
|
||||
# Setup during service initialization
|
||||
setup_otel_logging("auth-service", "1.0.0")
|
||||
|
||||
# Now all standard logging calls will be exported to SigNoz
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info("This will appear in SigNoz!")
|
||||
"""
|
||||
|
||||
# Check if logging export is enabled
|
||||
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
|
||||
logger.info(
|
||||
"OpenTelemetry logs export disabled",
|
||||
service=service_name,
|
||||
reason="OTEL_LOGS_EXPORTER not set to 'otlp'"
|
||||
)
|
||||
return None
|
||||
|
||||
# Get OTLP endpoint from environment or parameter
|
||||
if otel_endpoint is None:
|
||||
otel_endpoint = os.getenv(
|
||||
"OTEL_EXPORTER_OTLP_ENDPOINT",
|
||||
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318")
|
||||
)
|
||||
|
||||
# Ensure endpoint has /v1/logs path for HTTP
|
||||
if not otel_endpoint.endswith("/v1/logs"):
|
||||
otel_endpoint = f"{otel_endpoint}/v1/logs"
|
||||
|
||||
try:
|
||||
# Check if OTLPLogExporter is available
|
||||
if OTLPLogExporter is None:
|
||||
logger.warning(
|
||||
"OpenTelemetry HTTP OTLP exporter not available",
|
||||
service=service_name,
|
||||
reason="opentelemetry-exporter-otlp-proto-http package not installed"
|
||||
)
|
||||
return None
|
||||
|
||||
# Create resource with service information
|
||||
resource = Resource(attributes={
|
||||
SERVICE_NAME: service_name,
|
||||
SERVICE_VERSION: service_version,
|
||||
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
|
||||
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
|
||||
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
|
||||
})
|
||||
|
||||
# Configure logger provider
|
||||
logger_provider = LoggerProvider(resource=resource)
|
||||
set_logger_provider(logger_provider)
|
||||
|
||||
# Configure OTLP exporter for logs
|
||||
otlp_exporter = OTLPLogExporter(
|
||||
endpoint=otel_endpoint,
|
||||
timeout=10
|
||||
)
|
||||
|
||||
# Add log record processor with batching
|
||||
log_processor = BatchLogRecordProcessor(otlp_exporter)
|
||||
logger_provider.add_log_record_processor(log_processor)
|
||||
|
||||
# Create logging handler that bridges standard logging to OpenTelemetry
|
||||
otel_handler = LoggingHandler(
|
||||
level=logging.NOTSET, # Capture all levels
|
||||
logger_provider=logger_provider
|
||||
)
|
||||
|
||||
# Add handler to root logger
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.addHandler(otel_handler)
|
||||
|
||||
logger.info(
|
||||
"OpenTelemetry logs export configured",
|
||||
service=service_name,
|
||||
otel_endpoint=otel_endpoint,
|
||||
console_logging=enable_console
|
||||
)
|
||||
|
||||
return otel_handler
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to setup OpenTelemetry logs export",
|
||||
service=service_name,
|
||||
error=str(e),
|
||||
reason="Will continue with standard logging only"
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def add_log_context(**context):
|
||||
"""
|
||||
Add contextual information to logs that will be sent to SigNoz.
|
||||
|
||||
This is useful for adding request IDs, user IDs, tenant IDs, etc.
|
||||
that help with filtering and correlation in SigNoz.
|
||||
|
||||
Args:
|
||||
**context: Key-value pairs to add to log context
|
||||
|
||||
Example:
|
||||
from shared.monitoring.logs_exporter import add_log_context
|
||||
|
||||
# Add context for current request
|
||||
add_log_context(
|
||||
request_id="req_123",
|
||||
user_id="user_456",
|
||||
tenant_id="tenant_789"
|
||||
)
|
||||
|
||||
# Now all logs will include this context
|
||||
logger.info("Processing order") # Will include request_id, user_id, tenant_id
|
||||
"""
|
||||
# This works with structlog's context binding
|
||||
bound_logger = structlog.get_logger()
|
||||
return bound_logger.bind(**context)
|
||||
|
||||
|
||||
def get_current_trace_context() -> dict:
|
||||
"""
|
||||
Get current trace context for log correlation.
|
||||
|
||||
Returns a dict with trace_id and span_id if available,
|
||||
which can be added to log records for correlation with traces.
|
||||
|
||||
Returns:
|
||||
Dict with trace_id and span_id, or empty dict if no active trace
|
||||
|
||||
Example:
|
||||
from shared.monitoring.logs_exporter import get_current_trace_context
|
||||
|
||||
# Get trace context and add to logs
|
||||
trace_ctx = get_current_trace_context()
|
||||
logger.info("Processing request", **trace_ctx)
|
||||
"""
|
||||
from opentelemetry import trace
|
||||
|
||||
span = trace.get_current_span()
|
||||
if span and span.get_span_context().is_valid:
|
||||
return {
|
||||
"trace_id": format(span.get_span_context().trace_id, '032x'),
|
||||
"span_id": format(span.get_span_context().span_id, '016x'),
|
||||
}
|
||||
return {}
|
||||
|
||||
|
||||
class StructlogOTELProcessor:
|
||||
"""
|
||||
Structlog processor that adds OpenTelemetry trace context to logs.
|
||||
|
||||
This automatically adds trace_id and span_id to all log records,
|
||||
enabling correlation between logs and traces in SigNoz.
|
||||
|
||||
Usage:
|
||||
import structlog
|
||||
from shared.monitoring.logs_exporter import StructlogOTELProcessor
|
||||
|
||||
structlog.configure(
|
||||
processors=[
|
||||
StructlogOTELProcessor(),
|
||||
# ... other processors
|
||||
]
|
||||
)
|
||||
"""
|
||||
|
||||
def __call__(self, logger, method_name, event_dict):
|
||||
"""Add trace context to log event"""
|
||||
trace_ctx = get_current_trace_context()
|
||||
if trace_ctx:
|
||||
event_dict.update(trace_ctx)
|
||||
return event_dict
|
||||
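A possible wiring of the new log exporter together with structlog, shown as a sketch; the processor chain and environment variable value are assumptions, not part of this commit.

# Sketch: enable OTLP log export and trace correlation in one place.
import os
import structlog

from shared.monitoring.logs_exporter import setup_otel_logging, StructlogOTELProcessor

os.environ.setdefault("OTEL_LOGS_EXPORTER", "otlp")    # setup_otel_logging() checks this flag

structlog.configure(
    processors=[
        StructlogOTELProcessor(),                      # adds trace_id / span_id to every event
        structlog.processors.add_log_level,
        structlog.processors.JSONRenderer(),
    ]
)

setup_otel_logging("auth-service", "1.0.0")
structlog.get_logger().info("service started")         # correlated with any active trace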
@@ -1,79 +1,101 @@
|
||||
# ================================================================
|
||||
# shared/monitoring/metrics.py - FIXED VERSION
|
||||
# ================================================================
|
||||
"""
|
||||
Centralized metrics collection for microservices - Fixed middleware issue
|
||||
OpenTelemetry Metrics Collection for Microservices
|
||||
Replaces Prometheus with native OpenTelemetry metrics export to SigNoz
|
||||
"""
|
||||
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from prometheus_client import Counter, Histogram, Gauge, start_http_server, generate_latest
|
||||
import structlog
|
||||
from typing import Dict, Any, Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
from fastapi import Request, Response
|
||||
from threading import Lock
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = structlog.get_logger()
|
||||
|
||||
# Global registry for metrics collectors
|
||||
_metrics_registry: Dict[str, 'MetricsCollector'] = {}
|
||||
_registry_lock = Lock()
|
||||
|
||||
# Default Prometheus metrics
|
||||
DEFAULT_REQUEST_COUNT = Counter(
|
||||
'http_requests_total',
|
||||
'Total HTTP requests',
|
||||
['method', 'endpoint', 'status_code', 'service']
|
||||
)
|
||||
|
||||
DEFAULT_REQUEST_DURATION = Histogram(
|
||||
'http_request_duration_seconds',
|
||||
'HTTP request duration in seconds',
|
||||
['method', 'endpoint', 'service']
|
||||
)
|
||||
|
||||
DEFAULT_ACTIVE_CONNECTIONS = Gauge(
|
||||
'active_connections',
|
||||
'Active database connections',
|
||||
['service']
|
||||
)
|
||||
|
||||
class MetricsCollector:
|
||||
"""Thread-safe metrics collector for microservices"""
|
||||
"""
|
||||
OpenTelemetry-based metrics collector for microservices.
|
||||
Exports metrics directly to SigNoz via OTLP (no Prometheus).
|
||||
"""
|
||||
|
||||
def __init__(self, service_name: str):
|
||||
def __init__(
|
||||
self,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
):
|
||||
self.service_name = service_name
|
||||
self.service_version = service_version
|
||||
self.start_time = time.time()
|
||||
self._counters: Dict[str, Counter] = {}
|
||||
self._histograms: Dict[str, Histogram] = {}
|
||||
self._gauges: Dict[str, Gauge] = {}
|
||||
|
||||
# Use provided meter provider or get global
|
||||
if meter_provider:
|
||||
self.meter = meter_provider.get_meter(__name__)
|
||||
else:
|
||||
self.meter = metrics.get_meter(__name__)
|
||||
|
||||
# Store created instruments
|
||||
self._counters: Dict[str, Any] = {}
|
||||
self._histograms: Dict[str, Any] = {}
|
||||
self._up_down_counters: Dict[str, Any] = {}
|
||||
self._lock = Lock()
|
||||
|
||||
|
||||
# Register in global registry
|
||||
with _registry_lock:
|
||||
_metrics_registry[service_name] = self
|
||||
|
||||
def start_metrics_server(self, port: int = 8080):
|
||||
"""Start Prometheus metrics server"""
|
||||
try:
|
||||
start_http_server(port)
|
||||
logger.info(f"Metrics server started on port {port} for {self.service_name}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start metrics server for {self.service_name}: {e}")
|
||||
# Create default HTTP metrics
|
||||
self._setup_default_metrics()
|
||||
|
||||
def register_counter(self, name: str, documentation: str, labels: List[str] = None) -> Counter:
|
||||
"""Register a custom Counter metric."""
|
||||
logger.info(
|
||||
"OpenTelemetry metrics collector initialized",
|
||||
service=service_name
|
||||
)
|
||||
|
||||
def _setup_default_metrics(self):
|
||||
"""Setup default HTTP metrics"""
|
||||
self._counters["http_requests_total"] = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_http_requests_total",
|
||||
description="Total HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
self._histograms["http_request_duration"] = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_http_request_duration_seconds",
|
||||
description="HTTP request duration in seconds",
|
||||
unit="s"
|
||||
)
|
||||
|
||||
self._up_down_counters["active_requests"] = self.meter.create_up_down_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_active_requests",
|
||||
description="Number of active HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
def register_counter(self, name: str, documentation: str, labels: list = None) -> Any:
|
||||
"""Register a custom Counter metric"""
|
||||
with self._lock:
|
||||
if name in self._counters:
|
||||
logger.warning(f"Counter '{name}' already registered for {self.service_name}")
|
||||
return self._counters[name]
|
||||
|
||||
if labels is None:
|
||||
labels = ['service']
|
||||
elif 'service' not in labels:
|
||||
labels.append('service')
|
||||
|
||||
|
||||
try:
|
||||
counter = Counter(f"{self.service_name.replace('-', '_')}_{name}", documentation, labelnames=labels)
|
||||
counter = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._counters[name] = counter
|
||||
logger.info(f"Registered counter: {name} for {self.service_name}")
|
||||
return counter
|
||||
@@ -81,65 +103,46 @@ class MetricsCollector:
|
||||
logger.error(f"Failed to register counter {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def register_histogram(self, name: str, documentation: str, labels: List[str] = None,
|
||||
buckets: tuple = Histogram.DEFAULT_BUCKETS) -> Histogram:
|
||||
"""Register a custom Histogram metric."""
|
||||
def register_histogram(
|
||||
self,
|
||||
name: str,
|
||||
documentation: str,
|
||||
labels: list = None,
|
||||
buckets: tuple = None
|
||||
) -> Any:
|
||||
"""Register a custom Histogram metric"""
|
||||
with self._lock:
|
||||
if name in self._histograms:
|
||||
logger.warning(f"Histogram '{name}' already registered for {self.service_name}")
|
||||
return self._histograms[name]
|
||||
|
||||
if labels is None:
|
||||
labels = ['service']
|
||||
elif 'service' not in labels:
|
||||
labels.append('service')
|
||||
|
||||
|
||||
try:
|
||||
histogram = Histogram(f"{self.service_name.replace('-', '_')}_{name}", documentation,
|
||||
labelnames=labels, buckets=buckets)
|
||||
histogram = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._histograms[name] = histogram
|
||||
logger.info(f"Registered histogram: {name} for {self.service_name}")
|
||||
return histogram
|
||||
except ValueError as e:
|
||||
if "Duplicated timeseries" in str(e):
|
||||
# Metric already exists in global registry, try to find it
|
||||
from prometheus_client import REGISTRY
|
||||
metric_name = f"{self.service_name.replace('-', '_')}_{name}"
|
||||
for collector in REGISTRY._collector_to_names.keys():
|
||||
if hasattr(collector, '_name') and collector._name == metric_name:
|
||||
self._histograms[name] = collector
|
||||
logger.warning(f"Reusing existing histogram: {name} for {self.service_name}")
|
||||
return collector
|
||||
# If we can't find it, create a new name with suffix
|
||||
import time
|
||||
suffix = str(int(time.time() * 1000))[-6:] # Last 6 digits of timestamp
|
||||
histogram = Histogram(f"{self.service_name.replace('-', '_')}_{name}_{suffix}",
|
||||
documentation, labelnames=labels, buckets=buckets)
|
||||
self._histograms[name] = histogram
|
||||
logger.warning(f"Created histogram with suffix: {name}_{suffix} for {self.service_name}")
|
||||
return histogram
|
||||
else:
|
||||
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register histogram {name} for {self.service_name}: {e}")
|
||||
raise
|
||||
|
||||
def register_gauge(self, name: str, documentation: str, labels: List[str] = None) -> Gauge:
|
||||
"""Register a custom Gauge metric."""
|
||||
def register_gauge(self, name: str, documentation: str, labels: list = None) -> Any:
|
||||
"""Register a custom Gauge metric (using UpDownCounter)"""
|
||||
with self._lock:
|
||||
if name in self._gauges:
|
||||
if name in self._up_down_counters:
|
||||
logger.warning(f"Gauge '{name}' already registered for {self.service_name}")
|
||||
return self._gauges[name]
|
||||
|
||||
if labels is None:
|
||||
labels = ['service']
|
||||
elif 'service' not in labels:
|
||||
labels.append('service')
|
||||
|
||||
return self._up_down_counters[name]
|
||||
|
||||
try:
|
||||
gauge = Gauge(f"{self.service_name.replace('-', '_')}_{name}", documentation, labelnames=labels)
|
||||
self._gauges[name] = gauge
|
||||
gauge = self.meter.create_up_down_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=documentation,
|
||||
unit="1"
|
||||
)
|
||||
self._up_down_counters[name] = gauge
|
||||
logger.info(f"Registered gauge: {name} for {self.service_name}")
|
||||
return gauge
|
||||
except Exception as e:
|
||||
@@ -147,104 +150,118 @@ class MetricsCollector:
|
||||
raise
|
||||
|
||||
def increment_counter(self, name: str, value: int = 1, labels: Dict[str, str] = None):
|
||||
"""Increment a counter metric."""
|
||||
"""Increment a counter metric"""
|
||||
if name not in self._counters:
|
||||
logger.error(f"Counter '{name}' not registered for {self.service_name}. Cannot increment.")
|
||||
logger.error(f"Counter '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {'service': self.service_name}
|
||||
elif 'service' not in labels:
|
||||
labels['service'] = self.service_name
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._counters[name].labels(**labels).inc(value)
|
||||
self._counters[name].add(value, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to increment counter {name} for {self.service_name}: {e}")
|
||||
|
||||
def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None):
|
||||
"""Observe a histogram metric."""
|
||||
"""Observe a histogram metric"""
|
||||
if name not in self._histograms:
|
||||
logger.error(f"Histogram '{name}' not registered for {self.service_name}. Cannot observe.")
|
||||
logger.error(f"Histogram '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {'service': self.service_name}
|
||||
elif 'service' not in labels:
|
||||
labels['service'] = self.service_name
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._histograms[name].labels(**labels).observe(value)
|
||||
self._histograms[name].record(value, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to observe histogram {name} for {self.service_name}: {e}")
|
||||
|
||||
def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None):
|
||||
"""Set a gauge metric."""
|
||||
if name not in self._gauges:
|
||||
logger.error(f"Gauge '{name}' not registered for {self.service_name}. Cannot set.")
|
||||
"""Set a gauge metric (using add for UpDownCounter)"""
|
||||
if name not in self._up_down_counters:
|
||||
logger.error(f"Gauge '{name}' not registered for {self.service_name}")
|
||||
return
|
||||
|
||||
if labels is None:
|
||||
labels = {'service': self.service_name}
|
||||
elif 'service' not in labels:
|
||||
labels['service'] = self.service_name
|
||||
labels = {"service": self.service_name}
|
||||
elif "service" not in labels:
|
||||
labels["service"] = self.service_name
|
||||
|
||||
try:
|
||||
self._gauges[name].labels(**labels).set(value)
|
||||
# For UpDownCounter, we need to track the delta
|
||||
# Store current value and calculate delta
|
||||
key = f"{name}_{str(sorted(labels.items()))}"
|
||||
if not hasattr(self, '_gauge_values'):
|
||||
self._gauge_values = {}
|
||||
|
||||
old_value = self._gauge_values.get(key, 0)
|
||||
delta = value - old_value
|
||||
self._gauge_values[key] = value
|
||||
|
||||
self._up_down_counters[name].add(delta, labels)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set gauge {name} for {self.service_name}: {e}")
|
||||
|
||||
def record_request(self, method: str, endpoint: str, status_code: int, duration: float):
|
||||
"""Record HTTP request metrics using default metrics."""
|
||||
"""Record HTTP request metrics"""
|
||||
try:
|
||||
DEFAULT_REQUEST_COUNT.labels(
|
||||
method=method,
|
||||
endpoint=endpoint,
|
||||
status_code=status_code,
|
||||
service=self.service_name
|
||||
).inc()
|
||||
attributes = {
|
||||
"service": self.service_name,
|
||||
"http.method": method,
|
||||
"http.route": endpoint,
|
||||
"http.status_code": str(status_code)
|
||||
}
|
||||
|
||||
DEFAULT_REQUEST_DURATION.labels(
|
||||
method=method,
|
||||
endpoint=endpoint,
|
||||
service=self.service_name
|
||||
).observe(duration)
|
||||
self._counters["http_requests_total"].add(1, attributes)
|
||||
self._histograms["http_request_duration"].record(duration, attributes)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to record request metrics for {self.service_name}: {e}")
|
||||
|
||||
def set_active_connections(self, count: int):
|
||||
"""Set active database connections using default gauge."""
|
||||
def increment_active_requests(self):
|
||||
"""Increment active request counter"""
|
||||
try:
|
||||
DEFAULT_ACTIVE_CONNECTIONS.labels(service=self.service_name).set(count)
|
||||
self._up_down_counters["active_requests"].add(1, {"service": self.service_name})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to set active connections for {self.service_name}: {e}")
|
||||
logger.error(f"Failed to increment active requests: {e}")
|
||||
|
||||
def get_metrics(self) -> str:
|
||||
"""Return Prometheus metrics in exposition format."""
|
||||
def decrement_active_requests(self):
|
||||
"""Decrement active request counter"""
|
||||
try:
|
||||
return generate_latest().decode('utf-8')
|
||||
self._up_down_counters["active_requests"].add(-1, {"service": self.service_name})
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate metrics for {self.service_name}: {e}")
|
||||
return ""
|
||||
logger.error(f"Failed to decrement active requests: {e}")
|
||||
|
||||
def set_active_connections(self, count: int):
|
||||
"""Set active database connections"""
|
||||
self.set_gauge("active_connections", count)
|
||||
|
||||
|
||||
def get_metrics_collector(service_name: str) -> Optional[MetricsCollector]:
|
||||
"""Get metrics collector by service name from global registry."""
|
||||
"""Get metrics collector by service name from global registry"""
|
||||
with _registry_lock:
|
||||
return _metrics_registry.get(service_name)
|
||||
|
||||
|
||||
def create_metrics_collector(service_name: str) -> MetricsCollector:
|
||||
def create_metrics_collector(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
) -> MetricsCollector:
|
||||
"""
|
||||
Create metrics collector without adding middleware.
|
||||
Create metrics collector.
|
||||
This should be called BEFORE app startup, not during lifespan.
|
||||
"""
|
||||
# Get existing or create new
|
||||
existing = get_metrics_collector(service_name)
|
||||
if existing:
|
||||
return existing
|
||||
|
||||
return MetricsCollector(service_name)
|
||||
|
||||
return MetricsCollector(service_name, service_version, meter_provider)
|
||||
|
||||
|
||||
def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
@@ -253,12 +270,14 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
"""
|
||||
@app.middleware("http")
|
||||
async def metrics_middleware(request: Request, call_next):
|
||||
# Increment active requests
|
||||
metrics_collector.increment_active_requests()
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
try:
|
||||
response = await call_next(request)
|
||||
duration = time.time() - start_time
|
||||
|
||||
|
||||
# Record request metrics
|
||||
metrics_collector.record_request(
|
||||
method=request.method,
|
||||
@@ -266,10 +285,14 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
status_code=response.status_code,
|
||||
duration=duration
|
||||
)
|
||||
|
||||
|
||||
# Decrement active requests
|
||||
metrics_collector.decrement_active_requests()
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
duration = time.time() - start_time
|
||||
|
||||
# Record failed request
|
||||
metrics_collector.record_request(
|
||||
method=request.method,
|
||||
@@ -277,61 +300,55 @@ def add_metrics_middleware(app, metrics_collector: MetricsCollector):
|
||||
status_code=500,
|
||||
duration=duration
|
||||
)
|
||||
|
||||
# Decrement active requests
|
||||
metrics_collector.decrement_active_requests()
|
||||
raise
|
||||
|
||||
|
||||
return metrics_collector
|
||||
|
||||
|
||||
def add_metrics_endpoint(app, metrics_collector: MetricsCollector):
|
||||
"""Add metrics endpoint to app"""
|
||||
@app.get("/metrics")
|
||||
async def prometheus_metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
|
||||
|
||||
def setup_metrics_early(app, service_name: str = None) -> MetricsCollector:
|
||||
def setup_metrics_early(
|
||||
app,
|
||||
service_name: str = None,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
) -> MetricsCollector:
|
||||
"""
|
||||
Setup metrics collection BEFORE app startup.
|
||||
This must be called before adding any middleware or starting the app.
|
||||
|
||||
Note: No Prometheus endpoint is created - all metrics go to SigNoz via OTLP
|
||||
"""
|
||||
if service_name is None:
|
||||
service_name = getattr(app, 'title', 'unknown-service').lower().replace(' ', '-').replace('.', '_')
|
||||
|
||||
|
||||
# Create metrics collector
|
||||
metrics_collector = create_metrics_collector(service_name)
|
||||
|
||||
metrics_collector = create_metrics_collector(service_name, service_version, meter_provider)
|
||||
|
||||
# Add middleware (must be before app starts)
|
||||
add_metrics_middleware(app, metrics_collector)
|
||||
|
||||
# Add metrics endpoint
|
||||
add_metrics_endpoint(app, metrics_collector)
|
||||
|
||||
|
||||
# Store in app state for access from routes
|
||||
app.state.metrics_collector = metrics_collector
|
||||
|
||||
logger.info(f"Metrics setup completed for service: {service_name}")
|
||||
|
||||
logger.info(f"OpenTelemetry metrics setup completed for service: {service_name}")
|
||||
return metrics_collector
|
||||
|
||||
|
||||
# Additional helper function for endpoint tracking
|
||||
# Helper function for endpoint tracking (kept for backward compatibility)
|
||||
def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
|
||||
"""Decorator for tracking endpoint metrics - Fixed for async functions"""
|
||||
"""Decorator for tracking endpoint metrics - metrics handled by middleware"""
|
||||
def decorator(func):
|
||||
import asyncio
|
||||
from functools import wraps
|
||||
|
||||
@wraps(func)
|
||||
async def async_wrapper(*args, **kwargs):
|
||||
# For now, just pass through - metrics are handled by middleware
|
||||
return await func(*args, **kwargs)
|
||||
|
||||
@wraps(func)
|
||||
def sync_wrapper(*args, **kwargs):
|
||||
# For now, just pass through - metrics are handled by middleware
|
||||
return func(*args, **kwargs)
|
||||
|
||||
# Return appropriate wrapper based on function type
|
||||
@@ -340,4 +357,3 @@ def track_endpoint_metrics(endpoint_name: str = None, service_name: str = None):
|
||||
else:
|
||||
return sync_wrapper
|
||||
return decorator
|
||||
|
||||
|
||||
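A short sketch of how the reworked collector could be bootstrapped in a service; it assumes setup_otel_metrics() from metrics_exporter.py is called first so its MeterProvider can be passed through, and the service name and custom metric are illustrative.

# Sketch: OTLP-only HTTP and custom metrics for a FastAPI app (no /metrics endpoint).
from fastapi import FastAPI

from shared.monitoring.metrics import setup_metrics_early
from shared.monitoring.metrics_exporter import setup_otel_metrics

app = FastAPI(title="Inventory Service")

meter_provider = setup_otel_metrics("inventory-service", "2.3.0")
collector = setup_metrics_early(
    app,
    service_name="inventory-service",
    service_version="2.3.0",
    meter_provider=meter_provider,
)

# Custom business metric, pushed to SigNoz on the periodic reader interval.
collector.register_counter("orders_created_total", "Orders created")
collector.increment_counter("orders_created_total", labels={"channel": "web"})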
shared/monitoring/metrics_exporter.py (new file, 250 lines)
@@ -0,0 +1,250 @@
|
||||
"""
|
||||
OpenTelemetry Metrics Integration for SigNoz
|
||||
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
def setup_otel_metrics(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
otel_endpoint: Optional[str] = None,
|
||||
export_interval_millis: int = 60000 # Export every 60 seconds
|
||||
) -> Optional[MeterProvider]:
|
||||
"""
|
||||
Setup OpenTelemetry metrics to export to SigNoz.
|
||||
|
||||
This creates a dual-export strategy:
|
||||
- Prometheus exposition format at /metrics (for Prometheus scraping)
|
||||
- OTLP push to SigNoz collector (for direct ingestion)
|
||||
|
||||
Args:
|
||||
service_name: Name of the service (e.g., "auth-service")
|
||||
service_version: Version of the service
|
||||
otel_endpoint: OpenTelemetry collector endpoint (default from env)
|
||||
export_interval_millis: How often to push metrics (default 60s)
|
||||
|
||||
Returns:
|
||||
MeterProvider instance if successful, None otherwise
|
||||
|
||||
Example:
|
||||
from shared.monitoring.metrics_exporter import setup_otel_metrics
|
||||
|
||||
# Setup during service initialization
|
||||
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
|
||||
|
||||
# Create meters for your metrics
|
||||
meter = meter_provider.get_meter(__name__)
|
||||
request_counter = meter.create_counter(
|
||||
"http.server.requests",
|
||||
description="Total HTTP requests",
|
||||
unit="1"
|
||||
)
|
||||
|
||||
# Record metrics
|
||||
request_counter.add(1, {"method": "GET", "status": "200"})
|
||||
"""
|
||||
|
||||
# Check if metrics export is enabled
|
||||
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
|
||||
if not enable_otel_metrics:
|
||||
logger.info(
|
||||
"OpenTelemetry metrics export disabled",
|
||||
service=service_name,
|
||||
reason="ENABLE_OTEL_METRICS not set to 'true'"
|
||||
)
|
||||
return None
|
||||
|
||||
# Get OTLP endpoint from environment or parameter
|
||||
if otel_endpoint is None:
|
||||
otel_endpoint = os.getenv(
|
||||
"OTEL_EXPORTER_OTLP_ENDPOINT",
|
||||
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.signoz:4318")
|
||||
)
|
||||
|
||||
# Ensure endpoint has /v1/metrics path for HTTP
|
||||
if not otel_endpoint.endswith("/v1/metrics"):
|
||||
otel_endpoint = f"{otel_endpoint}/v1/metrics"
|
||||
|
||||
try:
|
||||
# Create resource with service information
|
||||
resource = Resource(attributes={
|
||||
SERVICE_NAME: service_name,
|
||||
SERVICE_VERSION: service_version,
|
||||
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
|
||||
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
|
||||
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
|
||||
})
|
||||
|
||||
# Configure OTLP exporter for metrics
|
||||
otlp_exporter = OTLPMetricExporter(
|
||||
endpoint=otel_endpoint,
|
||||
timeout=10
|
||||
)
|
||||
|
||||
# Create periodic metric reader
|
||||
metric_reader = PeriodicExportingMetricReader(
|
||||
exporter=otlp_exporter,
|
||||
export_interval_millis=export_interval_millis
|
||||
)
|
||||
|
||||
# Configure meter provider
|
||||
meter_provider = MeterProvider(
|
||||
resource=resource,
|
||||
metric_readers=[metric_reader]
|
||||
)
|
||||
|
||||
# Set global meter provider
|
||||
metrics.set_meter_provider(meter_provider)
|
||||
|
||||
logger.info(
|
||||
"OpenTelemetry metrics export configured",
|
||||
service=service_name,
|
||||
otel_endpoint=otel_endpoint,
|
||||
export_interval_seconds=export_interval_millis / 1000
|
||||
)
|
||||
|
||||
return meter_provider
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to setup OpenTelemetry metrics export",
|
||||
service=service_name,
|
||||
error=str(e),
|
||||
reason="Will continue with Prometheus-only metrics"
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
class OTelMetricsCollector:
|
||||
"""
|
||||
Wrapper for OpenTelemetry metrics that provides a similar interface
|
||||
to the Prometheus MetricsCollector.
|
||||
|
||||
This allows services to emit metrics that go to both Prometheus and SigNoz.
|
||||
"""
|
||||
|
||||
def __init__(self, service_name: str, meter_provider: MeterProvider):
|
||||
self.service_name = service_name
|
||||
self.meter_provider = meter_provider
|
||||
self.meter = meter_provider.get_meter(__name__)
|
||||
|
||||
# Store created instruments
|
||||
self._counters = {}
|
||||
self._histograms = {}
|
||||
self._gauges = {}
|
||||
|
||||
def create_counter(self, name: str, description: str = "", unit: str = "1"):
|
||||
"""Create or get an OpenTelemetry Counter"""
|
||||
if name not in self._counters:
|
||||
self._counters[name] = self.meter.create_counter(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=description,
|
||||
unit=unit
|
||||
)
|
||||
return self._counters[name]
|
||||
|
||||
def create_histogram(self, name: str, description: str = "", unit: str = "1"):
|
||||
"""Create or get an OpenTelemetry Histogram"""
|
||||
if name not in self._histograms:
|
||||
self._histograms[name] = self.meter.create_histogram(
|
||||
name=f"{self.service_name.replace('-', '_')}_{name}",
|
||||
description=description,
|
||||
unit=unit
|
||||
)
|
||||
return self._histograms[name]
|
||||
|
||||
def create_gauge(self, name: str, description: str = "", unit: str = "1"):
|
||||
"""
|
||||
Create or get an OpenTelemetry observable gauge.
|
||||
Note: Gauges in OTEL require a callback function.
|
||||
"""
|
||||
if name not in self._gauges:
|
||||
# Store gauge reference for callback registration
|
||||
self._gauges[name] = {
|
||||
"name": f"{self.service_name.replace('-', '_')}_{name}",
|
||||
"description": description,
|
||||
"unit": unit,
|
||||
"value": 0,
|
||||
"attributes": {}
|
||||
}
|
||||
return self._gauges[name]
|
||||
|
||||
def increment_counter(self, name: str, value: int = 1, attributes: dict = None):
|
||||
"""Increment a counter with optional attributes"""
|
||||
if name in self._counters:
|
||||
if attributes is None:
|
||||
attributes = {"service": self.service_name}
|
||||
elif "service" not in attributes:
|
||||
attributes["service"] = self.service_name
|
||||
|
||||
self._counters[name].add(value, attributes)
|
||||
|
||||
def observe_histogram(self, name: str, value: float, attributes: dict = None):
|
||||
"""Record a histogram observation with optional attributes"""
|
||||
if name in self._histograms:
|
||||
if attributes is None:
|
||||
attributes = {"service": self.service_name}
|
||||
elif "service" not in attributes:
|
||||
attributes["service"] = self.service_name
|
||||
|
||||
self._histograms[name].record(value, attributes)
|
||||
|
||||
def set_gauge(self, name: str, value: float, attributes: dict = None):
|
||||
"""Set a gauge value (stores for next callback)"""
|
||||
if name in self._gauges:
|
||||
if attributes is None:
|
||||
attributes = {"service": self.service_name}
|
||||
elif "service" not in attributes:
|
||||
attributes["service"] = self.service_name
|
||||
|
||||
self._gauges[name]["value"] = value
|
||||
self._gauges[name]["attributes"] = attributes
|
||||
|
||||
|
||||
def create_dual_metrics_collector(service_name: str, service_version: str = "1.0.0"):
|
||||
"""
|
||||
Create a metrics collector that exports to both Prometheus and SigNoz.
|
||||
|
||||
This function sets up both collection strategies:
|
||||
1. Prometheus client library (for /metrics endpoint scraping)
|
||||
2. OpenTelemetry metrics (for OTLP push to SigNoz)
|
||||
|
||||
Returns a tuple: (prometheus_collector, otel_collector)
|
||||
Both collectors can be used independently or together.
|
||||
|
||||
Example:
|
||||
from shared.monitoring.metrics_exporter import create_dual_metrics_collector
|
||||
|
||||
prom_collector, otel_collector = create_dual_metrics_collector("auth-service")
|
||||
|
||||
# Prometheus counter
|
||||
prom_collector.register_counter("requests_total", "Total requests")
|
||||
prom_collector.increment_counter("requests_total", labels={"status": "200"})
|
||||
|
||||
# OpenTelemetry counter (pushed to SigNoz)
|
||||
counter = otel_collector.create_counter("requests_total", "Total requests")
|
||||
counter.add(1, {"status": "200"})
|
||||
"""
|
||||
from shared.monitoring.metrics import MetricsCollector
|
||||
|
||||
# Create Prometheus collector
|
||||
prom_collector = MetricsCollector(service_name)
|
||||
|
||||
# Create OpenTelemetry collector
|
||||
meter_provider = setup_otel_metrics(service_name, service_version)
|
||||
otel_collector = None
|
||||
if meter_provider:
|
||||
otel_collector = OTelMetricsCollector(service_name, meter_provider)
|
||||
|
||||
return prom_collector, otel_collector
|
||||
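The create_gauge()/set_gauge() pair above only stores the most recent value; as the docstring notes, an OpenTelemetry gauge is exported only through an observable instrument with a callback. A sketch of one way to bridge the two, assuming otel_collector came from create_dual_metrics_collector() and accepting that it reads the collector's internal _gauges dict:

# Sketch: export values recorded with set_gauge() via an observable gauge callback.
from opentelemetry.metrics import Observation

def make_gauge_callback(collector, name):
    """Yield the last value stored by collector.set_gauge(name, ...)."""
    def callback(options):
        entry = collector._gauges.get(name)
        if entry:
            yield Observation(entry["value"], entry["attributes"])
    return callback

otel_collector.create_gauge("queue_depth", "Jobs waiting in the queue")
otel_collector.meter.create_observable_gauge(
    name=f"{otel_collector.service_name.replace('-', '_')}_queue_depth",
    description="Jobs waiting in the queue",
    callbacks=[make_gauge_callback(otel_collector, "queue_depth")],
)
otel_collector.set_gauge("queue_depth", 12)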
shared/monitoring/system_metrics.py (new file, 433 lines)
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
System Metrics Collection for SigNoz
|
||||
Collects CPU, memory, disk, and process metrics via OpenTelemetry
|
||||
"""
|
||||
|
||||
import os
|
||||
import psutil
|
||||
import structlog
|
||||
from typing import Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class SystemMetricsCollector:
|
||||
"""
|
||||
Collects system-level metrics (CPU, memory, disk, network, process info)
|
||||
and exports them to SigNoz via OpenTelemetry.
|
||||
|
||||
These metrics help monitor service health and resource utilization.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
):
|
||||
self.service_name = service_name
|
||||
self.service_version = service_version
|
||||
self.process = psutil.Process()
|
||||
|
||||
# Use provided meter provider or get global
|
||||
if meter_provider:
|
||||
self.meter = meter_provider.get_meter(__name__)
|
||||
else:
|
||||
self.meter = metrics.get_meter(__name__)
|
||||
|
||||
# Initialize metric instruments
|
||||
self._setup_metrics()
|
||||
|
||||
logger.info(
|
||||
"System metrics collector initialized",
|
||||
service=service_name,
|
||||
pid=os.getpid()
|
||||
)
|
||||
|
||||
def _setup_metrics(self):
|
||||
"""Setup all system metric instruments"""
|
||||
|
||||
# Process CPU metrics
|
||||
self.process_cpu_percent = self.meter.create_observable_gauge(
|
||||
name="process.cpu.utilization",
|
||||
description="Process CPU utilization percentage",
|
||||
unit="percent",
|
||||
callbacks=[self._observe_process_cpu]
|
||||
)
|
||||
|
||||
# Process memory metrics
|
||||
self.process_memory_usage = self.meter.create_observable_gauge(
|
||||
name="process.memory.usage",
|
||||
description="Process memory usage in bytes",
|
||||
unit="bytes",
|
||||
callbacks=[self._observe_process_memory]
|
||||
)
|
||||
|
||||
self.process_memory_percent = self.meter.create_observable_gauge(
|
||||
name="process.memory.utilization",
|
||||
description="Process memory utilization percentage",
|
||||
unit="percent",
|
||||
callbacks=[self._observe_process_memory_percent]
|
||||
)
|
||||
|
||||
# Process thread count
|
||||
self.process_threads = self.meter.create_observable_gauge(
|
||||
name="process.threads.count",
|
||||
description="Number of threads in the process",
|
||||
unit="threads",
|
||||
callbacks=[self._observe_process_threads]
|
||||
)
|
||||
|
||||
# Process file descriptors (Unix only)
|
||||
if hasattr(self.process, 'num_fds'):
|
||||
self.process_fds = self.meter.create_observable_gauge(
|
||||
name="process.open_file_descriptors",
|
||||
description="Number of open file descriptors",
|
||||
unit="fds",
|
||||
callbacks=[self._observe_process_fds]
|
||||
)
|
||||
|
||||
# System-wide CPU metrics
|
||||
self.system_cpu_percent = self.meter.create_observable_gauge(
|
||||
name="system.cpu.utilization",
|
||||
description="System-wide CPU utilization percentage",
|
||||
unit="percent",
|
||||
callbacks=[self._observe_system_cpu]
|
||||
)
|
||||
|
||||
# System-wide memory metrics
|
||||
self.system_memory_usage = self.meter.create_observable_gauge(
|
||||
name="system.memory.usage",
|
||||
description="System memory usage in bytes",
|
||||
unit="bytes",
|
||||
callbacks=[self._observe_system_memory]
|
||||
)
|
||||
|
||||
self.system_memory_percent = self.meter.create_observable_gauge(
|
||||
name="system.memory.utilization",
|
||||
description="System memory utilization percentage",
|
||||
unit="percent",
|
||||
callbacks=[self._observe_system_memory_percent]
|
||||
)
|
||||
|
||||
# Disk I/O metrics
|
||||
self.disk_io_read = self.meter.create_observable_counter(
|
||||
name="system.disk.io.read",
|
||||
description="Disk bytes read",
|
||||
unit="bytes",
|
||||
callbacks=[self._observe_disk_io_read]
|
||||
)
|
||||
|
||||
self.disk_io_write = self.meter.create_observable_counter(
|
||||
name="system.disk.io.write",
|
||||
description="Disk bytes written",
|
||||
unit="bytes",
|
||||
callbacks=[self._observe_disk_io_write]
|
||||
)
|
||||
|
||||
# Network I/O metrics
|
||||
self.network_io_sent = self.meter.create_observable_counter(
|
||||
name="system.network.io.sent",
|
||||
description="Network bytes sent",
|
||||
unit="bytes",
|
||||
callbacks=[self._observe_network_io_sent]
|
||||
)
|
||||
|
||||
self.network_io_recv = self.meter.create_observable_counter(
|
||||
name="system.network.io.received",
|
||||
description="Network bytes received",
|
||||
unit="bytes",
|
||||
callbacks=[self._observe_network_io_recv]
|
||||
)
|
||||
|
||||
# Callback methods for observable instruments
|
||||
|
||||
def _observe_process_cpu(self, options):
|
||||
"""Observe process CPU usage"""
|
||||
try:
|
||||
cpu_percent = self.process.cpu_percent(interval=None)
|
||||
yield metrics.Observation(
|
||||
cpu_percent,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect process CPU metrics: {e}")
|
||||
|
||||
def _observe_process_memory(self, options):
|
||||
"""Observe process memory usage"""
|
||||
try:
|
||||
mem_info = self.process.memory_info()
|
||||
yield metrics.Observation(
|
||||
mem_info.rss, # Resident Set Size
|
||||
{"service": self.service_name, "type": "rss"}
|
||||
)
|
||||
yield metrics.Observation(
|
||||
mem_info.vms, # Virtual Memory Size
|
||||
{"service": self.service_name, "type": "vms"}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect process memory metrics: {e}")
|
||||
|
||||
def _observe_process_memory_percent(self, options):
|
||||
"""Observe process memory percentage"""
|
||||
try:
|
||||
mem_percent = self.process.memory_percent()
|
||||
yield metrics.Observation(
|
||||
mem_percent,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect process memory percent: {e}")
|
||||
|
||||
def _observe_process_threads(self, options):
|
||||
"""Observe process thread count"""
|
||||
try:
|
||||
num_threads = self.process.num_threads()
|
||||
yield metrics.Observation(
|
||||
num_threads,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect process thread count: {e}")
|
||||
|
||||
def _observe_process_fds(self, options):
|
||||
"""Observe process file descriptors (Unix only)"""
|
||||
try:
|
||||
num_fds = self.process.num_fds()
|
||||
yield metrics.Observation(
|
||||
num_fds,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect process FDs: {e}")
|
||||
|
||||
def _observe_system_cpu(self, options):
|
||||
"""Observe system-wide CPU usage"""
|
||||
try:
|
||||
cpu_percent = psutil.cpu_percent(interval=None)
|
||||
yield metrics.Observation(
|
||||
cpu_percent,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect system CPU metrics: {e}")
|
||||
|
||||
def _observe_system_memory(self, options):
|
||||
"""Observe system memory usage"""
|
||||
try:
|
||||
mem = psutil.virtual_memory()
|
||||
yield metrics.Observation(
|
||||
mem.used,
|
||||
{"service": self.service_name, "type": "used"}
|
||||
)
|
||||
yield metrics.Observation(
|
||||
mem.available,
|
||||
{"service": self.service_name, "type": "available"}
|
||||
)
|
||||
yield metrics.Observation(
|
||||
mem.total,
|
||||
{"service": self.service_name, "type": "total"}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect system memory metrics: {e}")
|
||||
|
||||
def _observe_system_memory_percent(self, options):
|
||||
"""Observe system memory percentage"""
|
||||
try:
|
||||
mem = psutil.virtual_memory()
|
||||
yield metrics.Observation(
|
||||
mem.percent,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect system memory percent: {e}")
|
||||
|
||||
def _observe_disk_io_read(self, options):
|
||||
"""Observe disk I/O read bytes"""
|
||||
try:
|
||||
disk_io = psutil.disk_io_counters()
|
||||
if disk_io:
|
||||
yield metrics.Observation(
|
||||
disk_io.read_bytes,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect disk I/O read metrics: {e}")
|
||||
|
||||
def _observe_disk_io_write(self, options):
|
||||
"""Observe disk I/O write bytes"""
|
||||
try:
|
||||
disk_io = psutil.disk_io_counters()
|
||||
if disk_io:
|
||||
yield metrics.Observation(
|
||||
disk_io.write_bytes,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect disk I/O write metrics: {e}")
|
||||
|
||||
def _observe_network_io_sent(self, options):
|
||||
"""Observe network bytes sent"""
|
||||
try:
|
||||
net_io = psutil.net_io_counters()
|
||||
yield metrics.Observation(
|
||||
net_io.bytes_sent,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect network sent metrics: {e}")
|
||||
|
||||
def _observe_network_io_recv(self, options):
|
||||
"""Observe network bytes received"""
|
||||
try:
|
||||
net_io = psutil.net_io_counters()
|
||||
yield metrics.Observation(
|
||||
net_io.bytes_recv,
|
||||
{"service": self.service_name}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to collect network recv metrics: {e}")
|
||||
|
||||
|
||||
class ApplicationMetricsCollector:
|
||||
"""
|
||||
Collects application-level metrics (HTTP requests, database connections, etc.)
|
||||
using OpenTelemetry metrics API only (no Prometheus).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
):
|
||||
self.service_name = service_name
|
||||
|
||||
# Use provided meter provider or get global
|
||||
if meter_provider:
|
||||
self.meter = meter_provider.get_meter(__name__)
|
||||
else:
|
||||
self.meter = metrics.get_meter(__name__)
|
||||
|
||||
# HTTP metrics
|
||||
self.http_requests = self.meter.create_counter(
|
||||
name="http.server.requests",
|
||||
description="Total HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
self.http_request_duration = self.meter.create_histogram(
|
||||
name="http.server.request.duration",
|
||||
description="HTTP request duration",
|
||||
unit="ms"
|
||||
)
|
||||
|
||||
self.http_active_requests = self.meter.create_up_down_counter(
|
||||
name="http.server.active_requests",
|
||||
description="Active HTTP requests",
|
||||
unit="requests"
|
||||
)
|
||||
|
||||
# Database metrics
|
||||
self.db_connections = self.meter.create_up_down_counter(
|
||||
name="db.client.connections.usage",
|
||||
description="Database connections in use",
|
||||
unit="connections"
|
||||
)
|
||||
|
||||
self.db_query_duration = self.meter.create_histogram(
|
||||
name="db.client.operation.duration",
|
||||
description="Database query duration",
|
||||
unit="ms"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Application metrics collector initialized",
|
||||
service=service_name
|
||||
)
|
||||
|
||||
def record_http_request(
|
||||
self,
|
||||
method: str,
|
||||
endpoint: str,
|
||||
status_code: int,
|
||||
duration_ms: float
|
||||
):
|
||||
"""Record an HTTP request"""
|
||||
attributes = {
|
||||
"service": self.service_name,
|
||||
"http.method": method,
|
||||
"http.route": endpoint,
|
||||
"http.status_code": status_code
|
||||
}
|
||||
|
||||
self.http_requests.add(1, attributes)
|
||||
self.http_request_duration.record(duration_ms, attributes)
|
||||
|
||||
def increment_active_requests(self):
|
||||
"""Increment active request count"""
|
||||
self.http_active_requests.add(1, {"service": self.service_name})
|
||||
|
||||
def decrement_active_requests(self):
|
||||
"""Decrement active request count"""
|
||||
self.http_active_requests.add(-1, {"service": self.service_name})
|
||||
|
||||
def set_db_connections(self, count: int, state: str = "used"):
|
||||
"""Set database connection count"""
|
||||
self.db_connections.add(
|
||||
count,
|
||||
{"service": self.service_name, "state": state}
|
||||
)
|
||||
|
||||
def record_db_query(self, operation: str, duration_ms: float, table: str = ""):
|
||||
"""Record a database query"""
|
||||
attributes = {
|
||||
"service": self.service_name,
|
||||
"db.operation": operation
|
||||
}
|
||||
if table:
|
||||
attributes["db.table"] = table
|
||||
|
||||
self.db_query_duration.record(duration_ms, attributes)
|
||||
|
||||
|
||||
def setup_all_metrics(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
meter_provider: Optional[MeterProvider] = None
|
||||
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
|
||||
"""
|
||||
Setup both system and application metrics collection.
|
||||
|
||||
Args:
|
||||
service_name: Name of the service
|
||||
service_version: Version of the service
|
||||
meter_provider: Optional meter provider (will use global if not provided)
|
||||
|
||||
Returns:
|
||||
Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)
|
||||
|
||||
Example:
|
||||
from shared.monitoring.system_metrics import setup_all_metrics
|
||||
|
||||
system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")
|
||||
|
||||
# Metrics are automatically collected
|
||||
# Use app_metrics to record custom application events:
|
||||
app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
|
||||
"""
|
||||
system_metrics = SystemMetricsCollector(service_name, service_version, meter_provider)
|
||||
app_metrics = ApplicationMetricsCollector(service_name, service_version, meter_provider)
|
||||
|
||||
logger.info(
|
||||
"All metrics collectors initialized",
|
||||
service=service_name,
|
||||
collectors=["system", "application"]
|
||||
)
|
||||
|
||||
return system_metrics, app_metrics
|
||||
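One way a service could feed ApplicationMetricsCollector from request handling, sketched below; the middleware and service name are assumptions rather than part of this commit, and durations are recorded in milliseconds to match the instrument's unit.

# Sketch: record HTTP metrics from a FastAPI middleware (durations in ms).
import time
from fastapi import FastAPI, Request

from shared.monitoring.system_metrics import setup_all_metrics

app = FastAPI(title="Orders Service")
system_metrics, app_metrics = setup_all_metrics("orders-service", "1.0.0")

@app.middleware("http")
async def record_metrics(request: Request, call_next):
    app_metrics.increment_active_requests()
    started = time.perf_counter()
    status_code = 500                                   # default if the handler raises
    try:
        response = await call_next(request)
        status_code = response.status_code
        return response
    finally:
        duration_ms = (time.perf_counter() - started) * 1000
        app_metrics.record_http_request(request.method, request.url.path, status_code, duration_ms)
        app_metrics.decrement_active_requests()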
@@ -22,7 +22,7 @@ def setup_tracing(
|
||||
app,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
jaeger_endpoint: str = "http://jaeger-collector.monitoring:4317"
|
||||
otel_endpoint: str = "http://signoz-otel-collector.signoz:4318"
|
||||
):
|
||||
"""
|
||||
Setup OpenTelemetry distributed tracing for a FastAPI service.
|
||||
@@ -37,7 +37,7 @@ def setup_tracing(
|
||||
app: FastAPI application instance
|
||||
service_name: Name of the service (e.g., "auth-service")
|
||||
service_version: Version of the service
|
||||
jaeger_endpoint: Jaeger collector gRPC endpoint
|
||||
otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
|
||||
|
||||
Example:
|
||||
from shared.monitoring.tracing import setup_tracing
|
||||
@@ -58,9 +58,9 @@ def setup_tracing(
|
||||
tracer_provider = TracerProvider(resource=resource)
|
||||
trace.set_tracer_provider(tracer_provider)
|
||||
|
||||
# Configure OTLP exporter to send to Jaeger
|
||||
# Configure OTLP exporter to send to SigNoz
|
||||
otlp_exporter = OTLPSpanExporter(
|
||||
endpoint=jaeger_endpoint,
|
||||
endpoint=otel_endpoint,
|
||||
insecure=True # Use TLS in production
|
||||
)
|
||||
|
||||
@@ -100,7 +100,7 @@ def setup_tracing(
|
||||
logger.info(
|
||||
"Distributed tracing configured",
|
||||
service=service_name,
|
||||
jaeger_endpoint=jaeger_endpoint
|
||||
otel_endpoint=otel_endpoint
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
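Callers of setup_tracing() must switch to the renamed keyword; a sketch of the updated call, with the endpoint taken from the new default in this diff:

# Sketch: tracing setup after the jaeger_endpoint -> otel_endpoint rename.
from fastapi import FastAPI
from shared.monitoring.tracing import setup_tracing

app = FastAPI(title="auth-service")
setup_tracing(
    app,
    service_name="auth-service",
    service_version="1.0.0",
    otel_endpoint="http://signoz-otel-collector.signoz:4318",  # SigNoz OTel collector
)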