"""
System Metrics Collection for SigNoz

Collects CPU, memory, disk, network, and process metrics via OpenTelemetry.
"""

import os
import threading
from typing import Optional

import psutil
import structlog
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION

logger = structlog.get_logger()


class SystemMetricsCollector:
    """
    Collects system-level metrics (CPU, memory, disk, network, process info)
    and exports them through OpenTelemetry observable instruments.

    All instruments are registered on construction; their callbacks query
    psutil each time the meter collects. Every observation carries a
    ``service`` attribute set to ``service_name``.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None,
    ):
        """
        Create the collector and register all observable instruments.

        Args:
            service_name: Value attached to every observation as ``service``.
            service_version: Stored on the instance; not attached to metrics.
            meter_provider: Provider used to obtain the meter. When ``None``,
                the globally configured OpenTelemetry meter is used.
        """
        self.service_name = service_name
        self.service_version = service_version
        self.process = psutil.Process()

        # Use provided meter provider or get global
        if meter_provider is not None:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # cpu_percent(interval=None) measures against the previous call, so
        # take a baseline now instead of exporting the initial 0.0 reading.
        self._prime_cpu_counters()

        # Initialize metric instruments
        self._setup_metrics()

        logger.info(
            "System metrics collector initialized",
            service=service_name,
            pid=os.getpid(),
        )

    def _prime_cpu_counters(self) -> None:
        """Take a best-effort baseline reading for the CPU utilization gauges."""
        try:
            self.process.cpu_percent(interval=None)
            # NOTE(review): newer psutil versions may track the system-wide
            # baseline per calling thread; in that case this call is harmless
            # but only the per-process baseline above is effective.
            psutil.cpu_percent(interval=None)
        except Exception as e:
            # Priming is an optimisation only; never fail construction on it.
            logger.warning(f"Failed to prime CPU counters: {e}")

    def _setup_metrics(self):
        """Setup all system metric instruments"""

        # Process CPU metrics
        self.process_cpu_percent = self.meter.create_observable_gauge(
            name="process.cpu.utilization",
            description="Process CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_cpu],
        )

        # Process memory metrics
        self.process_memory_usage = self.meter.create_observable_gauge(
            name="process.memory.usage",
            description="Process memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_process_memory],
        )

        self.process_memory_percent = self.meter.create_observable_gauge(
            name="process.memory.utilization",
            description="Process memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_process_memory_percent],
        )

        # Process thread count
        self.process_threads = self.meter.create_observable_gauge(
            name="process.threads.count",
            description="Number of threads in the process",
            unit="threads",
            callbacks=[self._observe_process_threads],
        )

        # Process file descriptors (Unix only): psutil only exposes num_fds
        # on platforms that support it, so the gauge is registered conditionally.
        if hasattr(self.process, "num_fds"):
            self.process_fds = self.meter.create_observable_gauge(
                name="process.open_file_descriptors",
                description="Number of open file descriptors",
                unit="fds",
                callbacks=[self._observe_process_fds],
            )

        # System-wide CPU metrics
        self.system_cpu_percent = self.meter.create_observable_gauge(
            name="system.cpu.utilization",
            description="System-wide CPU utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_cpu],
        )

        # System-wide memory metrics
        self.system_memory_usage = self.meter.create_observable_gauge(
            name="system.memory.usage",
            description="System memory usage in bytes",
            unit="bytes",
            callbacks=[self._observe_system_memory],
        )

        self.system_memory_percent = self.meter.create_observable_gauge(
            name="system.memory.utilization",
            description="System memory utilization percentage",
            unit="percent",
            callbacks=[self._observe_system_memory_percent],
        )

        # Disk I/O metrics
        self.disk_io_read = self.meter.create_observable_counter(
            name="system.disk.io.read",
            description="Disk bytes read",
            unit="bytes",
            callbacks=[self._observe_disk_io_read],
        )

        self.disk_io_write = self.meter.create_observable_counter(
            name="system.disk.io.write",
            description="Disk bytes written",
            unit="bytes",
            callbacks=[self._observe_disk_io_write],
        )

        # Network I/O metrics
        self.network_io_sent = self.meter.create_observable_counter(
            name="system.network.io.sent",
            description="Network bytes sent",
            unit="bytes",
            callbacks=[self._observe_network_io_sent],
        )

        self.network_io_recv = self.meter.create_observable_counter(
            name="system.network.io.received",
            description="Network bytes received",
            unit="bytes",
            callbacks=[self._observe_network_io_recv],
        )

    # Callback methods for observable instruments.
    #
    # Each callback is a generator that yields zero or more Observations.
    # Failures are logged and swallowed so that one failing probe produces
    # no data point for that cycle instead of raising into the collector.

    def _observe_process_cpu(self, options):
        """Observe process CPU usage"""
        try:
            cpu_percent = self.process.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name},
            )
        except Exception as e:
            logger.warning(f"Failed to collect process CPU metrics: {e}")

    def _observe_process_memory(self, options):
        """Observe process memory usage (one observation each for rss and vms)"""
        try:
            mem_info = self.process.memory_info()
            yield metrics.Observation(
                mem_info.rss,  # Resident Set Size
                {"service": self.service_name, "type": "rss"},
            )
            yield metrics.Observation(
                mem_info.vms,  # Virtual Memory Size
                {"service": self.service_name, "type": "vms"},
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory metrics: {e}")

    def _observe_process_memory_percent(self, options):
        """Observe process memory percentage"""
        try:
            mem_percent = self.process.memory_percent()
            yield metrics.Observation(
                mem_percent,
                {"service": self.service_name},
            )
        except Exception as e:
            logger.warning(f"Failed to collect process memory percent: {e}")

    def _observe_process_threads(self, options):
        """Observe process thread count"""
        try:
            num_threads = self.process.num_threads()
            yield metrics.Observation(
                num_threads,
                {"service": self.service_name},
            )
        except Exception as e:
            logger.warning(f"Failed to collect process thread count: {e}")

    def _observe_process_fds(self, options):
        """Observe process file descriptors (Unix only)"""
        try:
            num_fds = self.process.num_fds()
            yield metrics.Observation(
                num_fds,
                {"service": self.service_name},
            )
        except Exception as e:
            logger.warning(f"Failed to collect process FDs: {e}")

    def _observe_system_cpu(self, options):
        """Observe system-wide CPU usage"""
        try:
            cpu_percent = psutil.cpu_percent(interval=None)
            yield metrics.Observation(
                cpu_percent,
                {"service": self.service_name},
            )
        except Exception as e:
            logger.warning(f"Failed to collect system CPU metrics: {e}")

    def _observe_system_memory(self, options):
        """Observe system memory usage (used, available and total)"""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.used,
                {"service": self.service_name, "type": "used"},
            )
            yield metrics.Observation(
                mem.available,
                {"service": self.service_name, "type": "available"},
            )
            yield metrics.Observation(
                mem.total,
                {"service": self.service_name, "type": "total"},
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory metrics: {e}")

    def _observe_system_memory_percent(self, options):
        """Observe system memory percentage"""
        try:
            mem = psutil.virtual_memory()
            yield metrics.Observation(
                mem.percent,
                {"service": self.service_name},
            )
        except Exception as e:
            logger.warning(f"Failed to collect system memory percent: {e}")

    def _observe_disk_io_read(self, options):
        """Observe disk I/O read bytes"""
        try:
            disk_io = psutil.disk_io_counters()
            # No counters available (e.g. diskless host): emit nothing.
            if disk_io:
                yield metrics.Observation(
                    disk_io.read_bytes,
                    {"service": self.service_name},
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O read metrics: {e}")

    def _observe_disk_io_write(self, options):
        """Observe disk I/O write bytes"""
        try:
            disk_io = psutil.disk_io_counters()
            # No counters available (e.g. diskless host): emit nothing.
            if disk_io:
                yield metrics.Observation(
                    disk_io.write_bytes,
                    {"service": self.service_name},
                )
        except Exception as e:
            logger.warning(f"Failed to collect disk I/O write metrics: {e}")

    def _observe_network_io_sent(self, options):
        """Observe network bytes sent"""
        try:
            net_io = psutil.net_io_counters()
            # Guarded like the disk callbacks: without interfaces psutil
            # returns no counters, which is not an error worth a warning
            # on every collection cycle.
            if net_io:
                yield metrics.Observation(
                    net_io.bytes_sent,
                    {"service": self.service_name},
                )
        except Exception as e:
            logger.warning(f"Failed to collect network sent metrics: {e}")

    def _observe_network_io_recv(self, options):
        """Observe network bytes received"""
        try:
            net_io = psutil.net_io_counters()
            # Guarded like the disk callbacks (see _observe_disk_io_read).
            if net_io:
                yield metrics.Observation(
                    net_io.bytes_recv,
                    {"service": self.service_name},
                )
        except Exception as e:
            logger.warning(f"Failed to collect network recv metrics: {e}")


class ApplicationMetricsCollector:
    """
    Collects application-level metrics (HTTP requests, database connections, etc.)
    using OpenTelemetry metrics API only (no Prometheus).

    Unlike the system collector, nothing is sampled automatically: callers
    invoke the ``record_*`` / ``increment_*`` / ``set_*`` methods explicitly.
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        meter_provider: Optional[MeterProvider] = None,
    ):
        """
        Create the collector and its synchronous instruments.

        Args:
            service_name: Value attached to every measurement as ``service``.
            service_version: Stored on the instance; not attached to metrics.
            meter_provider: Provider used to obtain the meter. When ``None``,
                the globally configured OpenTelemetry meter is used.
        """
        self.service_name = service_name
        self.service_version = service_version

        # Last count reported through set_db_connections, keyed by state.
        # Needed because an up/down counter only accepts deltas.
        self._db_connection_counts: dict[str, int] = {}
        self._db_connections_lock = threading.Lock()

        # Use provided meter provider or get global
        if meter_provider is not None:
            self.meter = meter_provider.get_meter(__name__)
        else:
            self.meter = metrics.get_meter(__name__)

        # HTTP metrics
        self.http_requests = self.meter.create_counter(
            name="http.server.requests",
            description="Total HTTP requests",
            unit="requests",
        )

        self.http_request_duration = self.meter.create_histogram(
            name="http.server.request.duration",
            description="HTTP request duration",
            unit="ms",
        )

        self.http_active_requests = self.meter.create_up_down_counter(
            name="http.server.active_requests",
            description="Active HTTP requests",
            unit="requests",
        )

        # Database metrics
        self.db_connections = self.meter.create_up_down_counter(
            name="db.client.connections.usage",
            description="Database connections in use",
            unit="connections",
        )

        self.db_query_duration = self.meter.create_histogram(
            name="db.client.operation.duration",
            description="Database query duration",
            unit="ms",
        )

        logger.info(
            "Application metrics collector initialized",
            service=service_name,
        )

    def record_http_request(
        self,
        method: str,
        endpoint: str,
        status_code: int,
        duration_ms: float,
    ) -> None:
        """
        Record an HTTP request.

        Increments the request counter by one and records ``duration_ms`` in
        the duration histogram, both with the same attribute set.

        Args:
            method: Reported as ``http.method``.
            endpoint: Reported as ``http.route``.
            status_code: Reported as ``http.status_code``.
            duration_ms: Request duration in milliseconds.
        """
        attributes = {
            "service": self.service_name,
            "http.method": method,
            "http.route": endpoint,
            "http.status_code": status_code,
        }

        self.http_requests.add(1, attributes)
        self.http_request_duration.record(duration_ms, attributes)

    def increment_active_requests(self) -> None:
        """Increment active request count"""
        self.http_active_requests.add(1, {"service": self.service_name})

    def decrement_active_requests(self) -> None:
        """Decrement active request count"""
        self.http_active_requests.add(-1, {"service": self.service_name})

    def set_db_connections(self, count: int, state: str = "used") -> None:
        """
        Set database connection count.

        The underlying instrument is an up/down counter, which accumulates
        the values passed to ``add``. To make this method behave as a setter,
        the counter is moved by the difference between ``count`` and the
        value previously set for the same ``state`` (0 on the first call).

        Args:
            count: Absolute number of connections currently in ``state``.
            state: Reported as the ``state`` attribute.
        """
        with self._db_connections_lock:
            previous = self._db_connection_counts.get(state, 0)
            self._db_connection_counts[state] = count

        # A zero delta is still reported so the series exists after the
        # first call even when the count is 0.
        self.db_connections.add(
            count - previous,
            {"service": self.service_name, "state": state},
        )

    def record_db_query(
        self, operation: str, duration_ms: float, table: str = ""
    ) -> None:
        """
        Record a database query.

        Args:
            operation: Reported as ``db.operation``.
            duration_ms: Query duration in milliseconds.
            table: Reported as ``db.table``; omitted when empty.
        """
        attributes = {
            "service": self.service_name,
            "db.operation": operation,
        }
        if table:
            attributes["db.table"] = table

        self.db_query_duration.record(duration_ms, attributes)


def setup_all_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    meter_provider: Optional[MeterProvider] = None,
) -> tuple[SystemMetricsCollector, ApplicationMetricsCollector]:
    """
    Setup both system and application metrics collection.

    Args:
        service_name: Name of the service
        service_version: Version of the service
        meter_provider: Optional meter provider (will use global if not provided)

    Returns:
        Tuple of (SystemMetricsCollector, ApplicationMetricsCollector)

    Example:
        from shared.monitoring.system_metrics import setup_all_metrics

        system_metrics, app_metrics = setup_all_metrics("auth-service", "1.0.0")

        # Metrics are automatically collected
        # Use app_metrics to record custom application events:
        app_metrics.record_http_request("GET", "/api/users", 200, 45.2)
    """
    system_metrics = SystemMetricsCollector(service_name, service_version, meter_provider)
    app_metrics = ApplicationMetricsCollector(service_name, service_version, meter_provider)

    logger.info(
        "All metrics collectors initialized",
        service=service_name,
        collectors=["system", "application"],
    )

    return system_metrics, app_metrics