Improve monitoring 5
This commit is contained in:
271
shared/monitoring/telemetry.py
Normal file
271
shared/monitoring/telemetry.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""
|
||||
Unified OpenTelemetry Telemetry Setup
|
||||
|
||||
Provides a single entry point to configure all telemetry signals:
|
||||
- Traces: Distributed tracing across services
|
||||
- Metrics: OTLP metrics export + system metrics collection
|
||||
- Logs: Structured logs with trace correlation
|
||||
|
||||
All signals are exported to SigNoz via OTLP.
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .otel_config import OTelConfig
|
||||
from .tracing import setup_tracing
|
||||
from .metrics_exporter import setup_otel_metrics
|
||||
from .logs_exporter import setup_otel_logging
|
||||
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
|
||||
|
||||
# Module-level structlog logger used for all setup progress and failure messages in this module.
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@dataclass
class TelemetryProviders:
    """
    Container for all OpenTelemetry providers and collectors.

    Every field defaults to ``None``. A field that is still ``None`` after
    setup means the corresponding signal was disabled or failed to
    initialize, so callers should check a field before using it.

    Attributes:
        tracer_provider: Provider for distributed tracing
        meter_provider: Provider for metrics export
        logging_handler: Handler for structured logs
        system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
        app_metrics: Collector for application-level metrics (HTTP, DB)
    """

    tracer_provider: Optional[Any] = None
    meter_provider: Optional[Any] = None
    logging_handler: Optional[Any] = None
    system_metrics: Optional[SystemMetricsCollector] = None
    app_metrics: Optional[ApplicationMetricsCollector] = None
|
||||
|
||||
|
||||
def setup_telemetry(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    enable_traces: bool = True,
    enable_metrics: bool = True,
    enable_logs: bool = True,
    enable_system_metrics: bool = True,
    metrics_protocol: Optional[str] = None,  # "grpc" or "http", defaults to grpc
    export_interval_millis: int = 60000
) -> TelemetryProviders:
    """
    Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.

    This is the UNIFIED setup function that configures everything:
    - Distributed tracing (gRPC, port 4317)
    - Metrics export (gRPC by default, port 4317)
    - System metrics collection (CPU, memory, disk, network)
    - Application metrics (HTTP requests, DB queries)
    - Structured logs export (HTTP, port 4318)

    All signals use the centralized OTelConfig for endpoint management.

    Each signal is set up independently and is best-effort: a signal is only
    attempted when both its ``enable_*`` argument and
    ``OTelConfig.is_enabled(...)`` are true, and any exception raised while
    setting it up is logged and swallowed so that a telemetry failure never
    prevents the service from starting. The system and application metrics
    collectors are only created when the OTLP meter provider was initialized
    successfully.

    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        enable_traces: Enable distributed tracing (default: True)
        enable_metrics: Enable metrics export to OTLP (default: True)
        enable_logs: Enable logs export to OTLP (default: True)
        enable_system_metrics: Enable system metrics collection (default: True,
            can be disabled via the ENABLE_SYSTEM_METRICS env variable; any
            value other than "true", case-insensitive, disables it)
        metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
        export_interval_millis: How often to export metrics in milliseconds

    Returns:
        TelemetryProviders containing all initialized providers and collectors.
        Fields for signals that were disabled or failed to initialize are None.

    Example:
        from shared.monitoring.telemetry import setup_telemetry

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry(
            app,
            service_name="auth-service",
            service_version="1.0.0"
        )

        # All telemetry is now configured:
        # - Traces automatically captured for HTTP requests
        # - System metrics automatically collected
        # - Application metrics via providers.app_metrics
        # - Logs automatically correlated with traces
    """

    logger.info(
        "Setting up unified OpenTelemetry telemetry",
        service=service_name,
        version=service_version,
        traces=enable_traces,
        metrics=enable_metrics,
        logs=enable_logs,
        system_metrics=enable_system_metrics
    )

    providers = TelemetryProviders()

    # Setup distributed tracing
    if enable_traces and OTelConfig.is_enabled("traces"):
        try:
            providers.tracer_provider = setup_tracing(
                app,
                service_name=service_name,
                service_version=service_version
            )
            if providers.tracer_provider:
                logger.info("✓ Distributed tracing configured", service=service_name)
            else:
                logger.warning("✗ Distributed tracing setup returned None", service=service_name)
        except Exception as e:
            # Best-effort: telemetry problems must not take the service down.
            logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))

    # Setup OTLP metrics export
    if enable_metrics and OTelConfig.is_enabled("metrics"):
        try:
            providers.meter_provider = setup_otel_metrics(
                service_name=service_name,
                service_version=service_version,
                protocol=metrics_protocol,
                export_interval_millis=export_interval_millis
            )
            if providers.meter_provider:
                logger.info("✓ OTLP metrics export configured", service=service_name)

                # Setup system and application metrics collectors.
                # They are registered against the meter provider created
                # above, so they are only attempted once it exists.
                if enable_system_metrics:
                    # The environment variable acts as an operational kill
                    # switch on top of the function argument.
                    enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
                    if enable_system_env:
                        try:
                            providers.system_metrics, providers.app_metrics = setup_all_metrics(
                                service_name=service_name,
                                service_version=service_version,
                                meter_provider=providers.meter_provider
                            )
                            logger.info(
                                "✓ System and application metrics collectors initialized",
                                service=service_name,
                                system_metrics=["cpu", "memory", "disk", "network"],
                                app_metrics=["http_requests", "db_queries"]
                            )
                        except Exception as e:
                            # Collector failure is non-fatal: OTLP export
                            # itself is already configured at this point.
                            logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
                    else:
                        # Make the env-driven opt-out visible instead of
                        # skipping the collectors silently.
                        logger.info(
                            "System metrics collection disabled via ENABLE_SYSTEM_METRICS",
                            service=service_name
                        )
            else:
                logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))

    # Setup logs export
    if enable_logs and OTelConfig.is_enabled("logs"):
        try:
            providers.logging_handler = setup_otel_logging(
                service_name=service_name,
                service_version=service_version
            )
            if providers.logging_handler:
                logger.info("✓ Structured logs export configured", service=service_name)
            else:
                logger.warning("✗ Logs export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))

    # Log endpoint configuration summary
    try:
        endpoints = OTelConfig.get_endpoints()
        summary = {
            "service": service_name,
            "version": service_version,
            "traces": {
                "enabled": bool(providers.tracer_provider),
                "endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
            },
            "metrics": {
                "enabled": bool(providers.meter_provider),
                # NOTE(review): the endpoint shown here is chosen from the
                # metrics_protocol argument only. If the exporter resolves its
                # protocol elsewhere when the argument is None, this summary
                # could name the wrong endpoint - confirm against
                # setup_otel_metrics.
                "endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
                "system_metrics": bool(providers.system_metrics),
                "app_metrics": bool(providers.app_metrics)
            },
            "logs": {
                "enabled": bool(providers.logging_handler),
                "endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
            }
        }
        logger.info("🎉 Telemetry setup complete", **summary)
    except Exception as e:
        # The summary is purely informational; never fail setup because of it.
        logger.warning("Could not log endpoint summary", error=str(e))

    return providers
|
||||
|
||||
|
||||
def setup_telemetry_simple(
    app,
    service_name: str,
    service_version: str = "1.0.0"
) -> TelemetryProviders:
    """
    Simplified telemetry setup with all defaults.

    Thin convenience wrapper that forwards to ``setup_telemetry`` and leaves
    every optional switch at its default value.

    Uses:
    - gRPC for traces (port 4317)
    - gRPC for metrics (port 4317)
    - HTTP for logs (port 4318)

    All settings are read from environment variables and OTelConfig.

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        service_version: Version of the service

    Returns:
        TelemetryProviders containing all initialized providers

    Example:
        from shared.monitoring.telemetry import setup_telemetry_simple

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry_simple(app, "auth-service")
    """
    return setup_telemetry(
        app=app,
        service_name=service_name,
        service_version=service_version
    )
|
||||
|
||||
|
||||
def get_telemetry_status() -> Dict[str, Any]:
    """
    Get current telemetry configuration status.

    The result is built purely from ``OTelConfig`` (enabled flags, metrics
    protocol and endpoints). It describes what is configured; it does not
    report whether a previous setup call actually succeeded.

    Returns:
        Dictionary with telemetry status information, keyed by signal
        ("traces", "metrics", "logs"). Each entry has "enabled" and
        "protocol"; traces and logs carry a single "endpoint", while metrics
        carries both "grpc_endpoint" and "http_endpoint".

    Example:
        from shared.monitoring.telemetry import get_telemetry_status

        status = get_telemetry_status()
        print(f"Tracing enabled: {status['traces']['enabled']}")
    """
    endpoints = OTelConfig.get_endpoints()

    return {
        "traces": {
            "enabled": OTelConfig.is_enabled("traces"),
            "protocol": "grpc",
            "endpoint": endpoints.traces_grpc
        },
        "metrics": {
            "enabled": OTelConfig.is_enabled("metrics"),
            "protocol": OTelConfig.get_protocol("metrics"),
            "grpc_endpoint": endpoints.metrics_grpc,
            "http_endpoint": endpoints.metrics_http
        },
        "logs": {
            "enabled": OTelConfig.is_enabled("logs"),
            "protocol": "http",
            "endpoint": endpoints.logs_http
        }
    }
|
||||
Reference in New Issue
Block a user