Update monitoring packages to latest versions
- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1
- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
This commit is contained in:
433
shared/monitoring/system_metrics.py
Normal file
433
shared/monitoring/system_metrics.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
System Metrics Collection for SigNoz
|
||||
Collects CPU, memory, disk, and process metrics via OpenTelemetry
|
||||
"""
|
||||
|
||||
import os
|
||||
import psutil
|
||||
import structlog
|
||||
from typing import Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class SystemMetricsCollector:
    """
    Registers observable OpenTelemetry instruments for system-level metrics
    (process CPU/memory/threads/file descriptors, system CPU/memory, disk
    and network I/O).

    Values are read from psutil inside the instrument callbacks, so they are
    sampled whenever the meter's reader collects. Every observation carries
    a ``service`` attribute set to ``service_name``.

    Args:
        service_name: Value used for the ``service`` attribute.
        service_version: Stored on the instance; not attached to metrics here.
        meter_provider: Provider to obtain the meter from. When ``None`` the
            globally configured OpenTelemetry meter is used.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        self.service_name = service_name
        self.service_version = service_version
        # Handle for the current process; all process.* metrics read from it.
        self.process = psutil.Process()

        # Use provided meter provider or get global
        if meter_provider is not None:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # Take a CPU baseline before any callback can run.
        self._prime_process_cpu()

        # Initialize metric instruments
        self._setup_metrics()

        logger.info(
            "System metrics collector initialized",
            service=service_name,
            pid=os.getpid()
        )

    def _prime_process_cpu(self):
        """Take a baseline CPU sample so the first observation is meaningful.

        psutil documents that the first ``cpu_percent(interval=None)`` call on
        a Process returns a meaningless 0.0 because there is no earlier sample
        to compare against. Calling it once here (non-blocking) gives the
        first real observation a baseline.

        NOTE(review): ``psutil.cpu_percent(interval=None)`` (system-wide) has
        the same first-call caveat. It is not primed here because recent
        psutil releases appear to track that baseline per calling thread, so
        priming from this thread may not help the collecting thread -- confirm
        against the pinned psutil version.
        """
        try:
            self.process.cpu_percent(interval=None)
        except Exception as e:
            # Best-effort: a failed baseline must not prevent startup.
            logger.warning(f"Failed to prime process CPU baseline: {e}")

    def _setup_metrics(self):
        """Create all observable instruments and attach their callbacks."""

        # Process CPU metrics
        self.process_cpu_percent = self.meter.create_observable_gauge(
            name="process.cpu.utilization",
            description="Process CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_cpu]
        )

        # Process memory metrics
        self.process_memory_usage = self.meter.create_observable_gauge(
            name="process.memory.usage",
            description="Process memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_process_memory]
        )

        self.process_memory_percent = self.meter.create_observable_gauge(
            name="process.memory.utilization",
            description="Process memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_memory_percent]
        )

        # Process thread count
        self.process_threads = self.meter.create_observable_gauge(
            name="process.threads.count",
            description="Number of threads in the process",
            unit="threads",
            callbacks=[self._observe_process_threads]
        )

        # Process file descriptors (Unix only): the gauge is only registered
        # when the process handle exposes num_fds.
        if hasattr(self.process, 'num_fds'):
            self.process_fds = self.meter.create_observable_gauge(
                name="process.open_file_descriptors",
                description="Number of open file descriptors",
                unit="fds",
                callbacks=[self._observe_process_fds]
            )

        # System-wide CPU metrics
        self.system_cpu_percent = self.meter.create_observable_gauge(
            name="system.cpu.utilization",
            description="System-wide CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_cpu]
        )

        # System-wide memory metrics
        self.system_memory_usage = self.meter.create_observable_gauge(
            name="system.memory.usage",
            description="System memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_system_memory]
        )

        self.system_memory_percent = self.meter.create_observable_gauge(
            name="system.memory.utilization",
            description="System memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_memory_percent]
        )

        # Disk I/O metrics
        self.disk_io_read = self.meter.create_observable_counter(
            name="system.disk.io.read",
            description="Disk bytes read",
            unit="bytes",
            callbacks=[self._observe_disk_io_read]
        )

        self.disk_io_write = self.meter.create_observable_counter(
            name="system.disk.io.write",
            description="Disk bytes written",
            unit="bytes",
            callbacks=[self._observe_disk_io_write]
        )

        # Network I/O metrics
        self.network_io_sent = self.meter.create_observable_counter(
            name="system.network.io.sent",
            description="Network bytes sent",
            unit="bytes",
            callbacks=[self._observe_network_io_sent]
        )

        self.network_io_recv = self.meter.create_observable_counter(
            name="system.network.io.received",
            description="Network bytes received",
            unit="bytes",
            callbacks=[self._observe_network_io_recv]
        )

    # Callback methods for observable instruments.
    #
    # Each callback is a generator taking the callback options argument
    # (unused here) and yielding zero or more Observations. Failures are
    # logged and swallowed so one broken probe yields no data point instead
    # of raising out of the callback.

    def _observe_process_cpu(self, options):
        """Yield the process CPU percentage since the previous sample."""
        try:
            cpu_percent = self.process.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process CPU metrics: {e}")

    def _observe_process_memory(self, options):
        """Yield process memory in bytes, one observation per ``type``."""
        try:
            mem_info = self.process.memory_info()
            yield metrics.Observation(
                mem_info.rss,  # Resident Set Size
                {"service": self.service_name, "type": "rss"}
            )
            yield metrics.Observation(
                mem_info.vms,  # Virtual Memory Size
                {"service": self.service_name, "type": "vms"}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory metrics: {e}")

    def _observe_process_memory_percent(self, options):
        """Yield the process memory percentage reported by psutil."""
        try:
            mem_percent = self.process.memory_percent()
            yield metrics.Observation(
                mem_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory percent: {e}")

    def _observe_process_threads(self, options):
        """Yield the number of threads in the process."""
        try:
            num_threads = self.process.num_threads()
            yield metrics.Observation(
                num_threads,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process thread count: {e}")

    def _observe_process_fds(self, options):
        """Yield the number of open file descriptors (Unix only)."""
        try:
            num_fds = self.process.num_fds()
            yield metrics.Observation(
                num_fds,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process FDs: {e}")

    def _observe_system_cpu(self, options):
        """Yield the system-wide CPU percentage (non-blocking sample)."""
        try:
            cpu_percent = psutil.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system CPU metrics: {e}")

    def _observe_system_memory(self, options):
        """Yield system memory in bytes for types used, available and total."""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.used,
                {"service": self.service_name, "type": "used"}
            )
            yield metrics.Observation(
                mem.available,
                {"service": self.service_name, "type": "available"}
            )
            yield metrics.Observation(
                mem.total,
                {"service": self.service_name, "type": "total"}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory metrics: {e}")

    def _observe_system_memory_percent(self, options):
        """Yield the system memory percentage reported by psutil."""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory percent: {e}")

    def _observe_disk_io_read(self, options):
        """Yield cumulative disk bytes read; nothing if counters are missing."""
        try:
            disk_io = psutil.disk_io_counters()
            # No counters available (None) -> emit no data point.
            if disk_io is not None:
                yield metrics.Observation(
                    disk_io.read_bytes,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O read metrics: {e}")

    def _observe_disk_io_write(self, options):
        """Yield cumulative disk bytes written; nothing if counters are missing."""
        try:
            disk_io = psutil.disk_io_counters()
            # No counters available (None) -> emit no data point.
            if disk_io is not None:
                yield metrics.Observation(
                    disk_io.write_bytes,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O write metrics: {e}")

    def _observe_network_io_sent(self, options):
        """Yield cumulative network bytes sent; nothing if counters are missing."""
        try:
            net_io = psutil.net_io_counters()
            # Same guard as the disk callbacks: without it a None result
            # would raise AttributeError and log a warning on every cycle.
            if net_io is not None:
                yield metrics.Observation(
                    net_io.bytes_sent,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect network sent metrics: {e}")

    def _observe_network_io_recv(self, options):
        """Yield cumulative network bytes received; nothing if counters are missing."""
        try:
            net_io = psutil.net_io_counters()
            # Same guard as the disk callbacks: without it a None result
            # would raise AttributeError and log a warning on every cycle.
            if net_io is not None:
                yield metrics.Observation(
                    net_io.bytes_recv,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect network recv metrics: {e}")
|
||||
|
||||
|
||||
class ApplicationMetricsCollector:
    """
    Records application-level metrics (HTTP requests, database connections
    and query durations) using the OpenTelemetry metrics API only
    (no Prometheus).

    Every recorded measurement carries a ``service`` attribute set to
    ``service_name``.

    Args:
        service_name: Value used for the ``service`` attribute.
        service_version: Stored on the instance; not attached to metrics here.
        meter_provider: Provider to obtain the meter from. When ``None`` the
            globally configured OpenTelemetry meter is used.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        self.service_name = service_name
        # Kept for parity with SystemMetricsCollector, which stores it too.
        self.service_version = service_version

        # Use provided meter provider or get global
        if meter_provider is not None:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # HTTP metrics
        self.http_requests = self.meter.create_counter(
            name="http.server.requests",
            description="Total HTTP requests",
            unit="requests"
        )

        self.http_request_duration = self.meter.create_histogram(
            name="http.server.request.duration",
            description="HTTP request duration",
            unit="ms"
        )

        self.http_active_requests = self.meter.create_up_down_counter(
            name="http.server.active_requests",
            description="Active HTTP requests",
            unit="requests"
        )

        # Database metrics
        self.db_connections = self.meter.create_up_down_counter(
            name="db.client.connections.usage",
            description="Database connections in use",
            unit="connections"
        )

        self.db_query_duration = self.meter.create_histogram(
            name="db.client.operation.duration",
            description="Database query duration",
            unit="ms"
        )

        # Last absolute connection count reported per state. An up/down
        # counter only accepts deltas, so set_db_connections needs the
        # previous value to turn an absolute count into a delta.
        self._db_connection_counts = {}

        logger.info(
            "Application metrics collector initialized",
            service=service_name
        )

    def record_http_request(
        self,
        method: str,
        endpoint: str,
        status_code: int,
        duration_ms: float
    ):
        """Record one completed HTTP request.

        Increments the request counter by one and records ``duration_ms`` in
        the duration histogram, both with the same attribute set.

        Args:
            method: Reported as ``http.method``.
            endpoint: Reported as ``http.route``.
            status_code: Reported as ``http.status_code``.
            duration_ms: Request duration recorded in the histogram.
        """
        attributes = {
            "service": self.service_name,
            "http.method": method,
            "http.route": endpoint,
            "http.status_code": status_code
        }

        self.http_requests.add(1, attributes)
        self.http_request_duration.record(duration_ms, attributes)

    def increment_active_requests(self):
        """Increment active request count"""
        self.http_active_requests.add(1, {"service": self.service_name})

    def decrement_active_requests(self):
        """Decrement active request count"""
        self.http_active_requests.add(-1, {"service": self.service_name})

    def set_db_connections(self, count: int, state: str = "used"):
        """Set the database connection count for ``state`` to ``count``.

        ``count`` is an absolute value. Because the underlying instrument is
        an up/down counter, only the difference from the previously reported
        count for the same ``state`` is added; passing the same count twice
        leaves the reported total unchanged.

        Args:
            count: Current number of connections in ``state``.
            state: Reported as the ``state`` attribute.
        """
        # NOTE(review): this read-modify-write is not synchronized; confirm
        # callers report connection counts from a single thread.
        previous = self._db_connection_counts.get(state, 0)
        self._db_connection_counts[state] = count

        # A zero delta is still added, so add() is called on every report.
        self.db_connections.add(
            count - previous,
            {"service": self.service_name, "state": state}
        )

    def record_db_query(self, operation: str, duration_ms: float, table: str = ""):
        """Record the duration of one database query.

        Args:
            operation: Reported as ``db.operation``.
            duration_ms: Query duration recorded in the histogram.
            table: Reported as ``db.table``; omitted when empty.
        """
        attributes = {
            "service": self.service_name,
            "db.operation": operation
        }
        if table:
            attributes["db.table"] = table

        self.db_query_duration.record(duration_ms, attributes)
|
||||
|
||||
|
||||
def setup_all_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
    """
    Create the system and application metrics collectors for one service.

    Both collectors are constructed with the same three arguments, system
    collector first, and a single log line is emitted once both exist.

    Args:
        service_name: Name of the service
        service_version: Version of the service
        meter_provider: Optional meter provider, forwarded unchanged to
            both collectors

    Returns:
        Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)

    Example:
        from shared.monitoring.system_metrics import setup_all_metrics

        system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")

        # Use app_metrics to record custom application events:
        app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
    """
    # Same positional arguments go to both constructors.
    shared_args = (service_name, service_version, meter_provider)

    system_collector = SystemMetricsCollector(*shared_args)
    application_collector = ApplicationMetricsCollector(*shared_args)

    logger.info(
        "All metrics collectors initialized",
        service=service_name,
        collectors=["system", "application"]
    )

    return system_collector, application_collector
|
||||
Reference in New Issue
Block a user