- Updated all OpenTelemetry packages to latest versions: - opentelemetry-api: 1.27.0 → 1.39.1 - opentelemetry-sdk: 1.27.0 → 1.39.1 - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1 - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1 - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1 - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1 - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1 - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1 - Removed prometheus-client==0.23.1 from all services - Unified all services to use the same monitoring package versions Generated by Mistral Vibe. Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
434 lines
14 KiB
Python
434 lines
14 KiB
Python
"""
|
|
System Metrics Collection for SigNoz
|
|
Collects CPU, memory, disk, and process metrics via OpenTelemetry
|
|
"""
|
|
|
|
import os
|
|
import psutil
|
|
import structlog
|
|
from typing import Optional
|
|
from opentelemetry import metrics
|
|
from opentelemetry.sdk.metrics import MeterProvider
|
|
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
|
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
|
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
|
|
|
logger = structlog.get_logger()
|
|
|
|
|
|
class SystemMetricsCollector:
    """
    Collects system-level metrics (CPU, memory, disk, network, process info)
    and exports them to SigNoz via OpenTelemetry.

    All instruments are *observable* (pull-based): the OpenTelemetry SDK
    invokes the registered callbacks on every export cycle, so no background
    thread is needed here.

    These metrics help monitor service health and resource utilization.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        """
        Args:
            service_name: Logical service name, attached as the ``service``
                attribute on every datapoint.
            service_version: Service version string (stored for parity with
                the application collector).
            meter_provider: Explicit meter provider; when omitted, the
                globally configured provider is used.
        """
        self.service_name = service_name
        self.service_version = service_version
        self.process = psutil.Process()

        # Prime the CPU counters: psutil's cpu_percent(interval=None) returns
        # 0.0 on its very first call because it needs a previous sample to
        # diff against. Without priming, the first exported CPU datapoints
        # would always read 0.
        self.process.cpu_percent(interval=None)
        psutil.cpu_percent(interval=None)

        # Use provided meter provider or fall back to the global one
        if meter_provider:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # Register all observable instruments with their callbacks
        self._setup_metrics()

        logger.info(
            "System metrics collector initialized",
            service=service_name,
            pid=os.getpid()
        )

    def _setup_metrics(self):
        """Setup all system metric instruments."""

        # Process CPU metrics
        self.process_cpu_percent = self.meter.create_observable_gauge(
            name="process.cpu.utilization",
            description="Process CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_cpu]
        )

        # Process memory metrics (reports both RSS and VMS, see callback)
        self.process_memory_usage = self.meter.create_observable_gauge(
            name="process.memory.usage",
            description="Process memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_process_memory]
        )

        self.process_memory_percent = self.meter.create_observable_gauge(
            name="process.memory.utilization",
            description="Process memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_memory_percent]
        )

        # Process thread count
        self.process_threads = self.meter.create_observable_gauge(
            name="process.threads.count",
            description="Number of threads in the process",
            unit="threads",
            callbacks=[self._observe_process_threads]
        )

        # Process file descriptors (num_fds only exists on Unix platforms)
        if hasattr(self.process, 'num_fds'):
            self.process_fds = self.meter.create_observable_gauge(
                name="process.open_file_descriptors",
                description="Number of open file descriptors",
                unit="fds",
                callbacks=[self._observe_process_fds]
            )

        # System-wide CPU metrics
        self.system_cpu_percent = self.meter.create_observable_gauge(
            name="system.cpu.utilization",
            description="System-wide CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_cpu]
        )

        # System-wide memory metrics (used/available/total, see callback)
        self.system_memory_usage = self.meter.create_observable_gauge(
            name="system.memory.usage",
            description="System memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_system_memory]
        )

        self.system_memory_percent = self.meter.create_observable_gauge(
            name="system.memory.utilization",
            description="System memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_memory_percent]
        )

        # Disk I/O metrics — observable counters: the callbacks report the
        # cumulative byte totals from psutil, which is what a counter expects.
        self.disk_io_read = self.meter.create_observable_counter(
            name="system.disk.io.read",
            description="Disk bytes read",
            unit="bytes",
            callbacks=[self._observe_disk_io_read]
        )

        self.disk_io_write = self.meter.create_observable_counter(
            name="system.disk.io.write",
            description="Disk bytes written",
            unit="bytes",
            callbacks=[self._observe_disk_io_write]
        )

        # Network I/O metrics (also cumulative totals)
        self.network_io_sent = self.meter.create_observable_counter(
            name="system.network.io.sent",
            description="Network bytes sent",
            unit="bytes",
            callbacks=[self._observe_network_io_sent]
        )

        self.network_io_recv = self.meter.create_observable_counter(
            name="system.network.io.received",
            description="Network bytes received",
            unit="bytes",
            callbacks=[self._observe_network_io_recv]
        )

    # Callback methods for observable instruments.
    # Each callback is a generator yielding Observation objects; failures are
    # logged and swallowed so one broken probe never breaks the export cycle.

    def _observe_process_cpu(self, options):
        """Observe process CPU usage (percent since the previous sample)."""
        try:
            cpu_percent = self.process.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process CPU metrics: {e}")

    def _observe_process_memory(self, options):
        """Observe process memory usage, split by type (rss / vms)."""
        try:
            mem_info = self.process.memory_info()
            yield metrics.Observation(
                mem_info.rss,  # Resident Set Size
                {"service": self.service_name, "type": "rss"}
            )
            yield metrics.Observation(
                mem_info.vms,  # Virtual Memory Size
                {"service": self.service_name, "type": "vms"}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory metrics: {e}")

    def _observe_process_memory_percent(self, options):
        """Observe process memory usage as a percentage of system memory."""
        try:
            mem_percent = self.process.memory_percent()
            yield metrics.Observation(
                mem_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory percent: {e}")

    def _observe_process_threads(self, options):
        """Observe process thread count."""
        try:
            num_threads = self.process.num_threads()
            yield metrics.Observation(
                num_threads,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process thread count: {e}")

    def _observe_process_fds(self, options):
        """Observe open file descriptors (Unix only; guarded at setup)."""
        try:
            num_fds = self.process.num_fds()
            yield metrics.Observation(
                num_fds,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect process FDs: {e}")

    def _observe_system_cpu(self, options):
        """Observe system-wide CPU usage (percent since previous sample)."""
        try:
            cpu_percent = psutil.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system CPU metrics: {e}")

    def _observe_system_memory(self, options):
        """Observe system memory usage, split by type (used/available/total)."""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.used,
                {"service": self.service_name, "type": "used"}
            )
            yield metrics.Observation(
                mem.available,
                {"service": self.service_name, "type": "available"}
            )
            yield metrics.Observation(
                mem.total,
                {"service": self.service_name, "type": "total"}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory metrics: {e}")

    def _observe_system_memory_percent(self, options):
        """Observe system memory utilization percentage."""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.percent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory percent: {e}")

    def _observe_disk_io_read(self, options):
        """Observe cumulative disk bytes read."""
        try:
            # disk_io_counters() can return None (e.g. diskless/虚 systems),
            # so only yield when counters are actually available.
            disk_io = psutil.disk_io_counters()
            if disk_io:
                yield metrics.Observation(
                    disk_io.read_bytes,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O read metrics: {e}")

    def _observe_disk_io_write(self, options):
        """Observe cumulative disk bytes written."""
        try:
            disk_io = psutil.disk_io_counters()
            if disk_io:
                yield metrics.Observation(
                    disk_io.write_bytes,
                    {"service": self.service_name}
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O write metrics: {e}")

    def _observe_network_io_sent(self, options):
        """Observe cumulative network bytes sent."""
        try:
            net_io = psutil.net_io_counters()
            yield metrics.Observation(
                net_io.bytes_sent,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect network sent metrics: {e}")

    def _observe_network_io_recv(self, options):
        """Observe cumulative network bytes received."""
        try:
            net_io = psutil.net_io_counters()
            yield metrics.Observation(
                net_io.bytes_recv,
                {"service": self.service_name}
            )
        except Exception as e:
            logger.warning(f"Failed to collect network recv metrics: {e}")
|
class ApplicationMetricsCollector:
    """
    Collects application-level metrics (HTTP requests, database connections, etc.)
    using OpenTelemetry metrics API only (no Prometheus).

    Unlike SystemMetricsCollector, these instruments are push-style: callers
    record events explicitly via the ``record_*`` / ``increment_*`` methods.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None
    ):
        """
        Args:
            service_name: Logical service name, attached as the ``service``
                attribute on every datapoint.
            service_version: Service version string.
            meter_provider: Explicit meter provider; when omitted, the
                globally configured provider is used.
        """
        self.service_name = service_name
        # Store the version for consistency with SystemMetricsCollector
        # (previously this parameter was accepted but silently dropped).
        self.service_version = service_version

        # Use provided meter provider or fall back to the global one
        if meter_provider:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # HTTP metrics
        self.http_requests = self.meter.create_counter(
            name="http.server.requests",
            description="Total HTTP requests",
            unit="requests"
        )

        self.http_request_duration = self.meter.create_histogram(
            name="http.server.request.duration",
            description="HTTP request duration",
            unit="ms"
        )

        self.http_active_requests = self.meter.create_up_down_counter(
            name="http.server.active_requests",
            description="Active HTTP requests",
            unit="requests"
        )

        # Database metrics
        self.db_connections = self.meter.create_up_down_counter(
            name="db.client.connections.usage",
            description="Database connections in use",
            unit="connections"
        )

        self.db_query_duration = self.meter.create_histogram(
            name="db.client.operation.duration",
            description="Database query duration",
            unit="ms"
        )

        logger.info(
            "Application metrics collector initialized",
            service=service_name
        )

    def record_http_request(
        self,
        method: str,
        endpoint: str,
        status_code: int,
        duration_ms: float
    ):
        """Record one completed HTTP request (count + duration histogram).

        Args:
            method: HTTP method, e.g. ``"GET"``.
            endpoint: Route template, e.g. ``"/api/users"``.
            status_code: Response status code.
            duration_ms: Request duration in milliseconds.
        """
        attributes = {
            "service": self.service_name,
            "http.method": method,
            "http.route": endpoint,
            "http.status_code": status_code
        }

        self.http_requests.add(1, attributes)
        self.http_request_duration.record(duration_ms, attributes)

    def increment_active_requests(self):
        """Increment the in-flight HTTP request count."""
        self.http_active_requests.add(1, {"service": self.service_name})

    def decrement_active_requests(self):
        """Decrement the in-flight HTTP request count."""
        self.http_active_requests.add(-1, {"service": self.service_name})

    def set_db_connections(self, count: int, state: str = "used"):
        """Adjust the database connection gauge by ``count``.

        Note: this is an up-down counter, so ``count`` is a delta, not an
        absolute value.

        Args:
            count: Delta to apply to the connection count.
            state: Connection state label (e.g. ``"used"``, ``"idle"``).
        """
        self.db_connections.add(
            count,
            {"service": self.service_name, "state": state}
        )

    def record_db_query(self, operation: str, duration_ms: float, table: str = ""):
        """Record one database query's duration.

        Args:
            operation: Operation name, e.g. ``"SELECT"``.
            duration_ms: Query duration in milliseconds.
            table: Optional table name; omitted from attributes when empty.
        """
        attributes = {
            "service": self.service_name,
            "db.operation": operation
        }
        if table:
            attributes["db.table"] = table

        self.db_query_duration.record(duration_ms, attributes)
|
|
def setup_all_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
    """
    Create and wire up both the system and application metrics collectors.

    Args:
        service_name: Name of the service
        service_version: Version of the service
        meter_provider: Optional meter provider (will use global if not provided)

    Returns:
        Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)

    Example:
        from shared.monitoring.system_metrics import setup_all_metrics

        system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")

        # System metrics are collected automatically on each export cycle.
        # Record custom application events through app_metrics:
        app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
    """
    collectors = (
        SystemMetricsCollector(service_name, service_version, meter_provider),
        ApplicationMetricsCollector(service_name, service_version, meter_provider),
    )

    logger.info(
        "All metrics collectors initialized",
        service=service_name,
        collectors=["system", "application"]
    )

    return collectors