Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -1,6 +1,6 @@
"""
OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
Exports metrics to SigNoz via the OpenTelemetry Collector using OTLP over gRPC or HTTP
"""
import os
@@ -9,8 +9,24 @@ from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.sdk.resources import Resource
# Import both gRPC and HTTP exporters
try:
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
GRPC_AVAILABLE = True
except ImportError:
GRPC_AVAILABLE = False
GrpcMetricExporter = None
try:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
HTTP_AVAILABLE = True
except ImportError:
HTTP_AVAILABLE = False
HttpMetricExporter = None
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -19,20 +35,21 @@ def setup_otel_metrics(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
export_interval_millis: int = 60000 # Export every 60 seconds
export_interval_millis: int = 60000, # Export every 60 seconds
protocol: Optional[str] = None # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
"""
Setup OpenTelemetry metrics to export to SigNoz.
This creates a dual-export strategy:
- Prometheus exposition format at /metrics (for Prometheus scraping)
- OTLP push to SigNoz collector (for direct ingestion)
Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
Default protocol is gRPC for better performance.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
export_interval_millis: How often to push metrics (default 60s)
otel_endpoint: Optional override for OTLP endpoint
export_interval_millis: How often to push metrics in milliseconds (default 60s)
protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"
Returns:
MeterProvider instance if successful, None otherwise
@@ -40,9 +57,12 @@ def setup_otel_metrics(
Example:
from shared.monitoring.metrics_exporter import setup_otel_metrics
# Setup during service initialization
# Setup with gRPC (default)
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
# Or with HTTP
meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")
# Create meters for your metrics
meter = meter_provider.get_meter(__name__)
request_counter = meter.create_counter(
@@ -56,8 +76,7 @@ def setup_otel_metrics(
"""
# Check if metrics export is enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if not enable_otel_metrics:
if not OTelConfig.is_enabled("metrics"):
logger.info(
"OpenTelemetry metrics export disabled",
service=service_name,
@@ -65,32 +84,66 @@ def setup_otel_metrics(
)
return None
# Get OTLP endpoint from environment or parameter
if otel_endpoint is None:
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
)
# Determine protocol to use
if protocol is None:
protocol = OTelConfig.get_protocol("metrics")
# Ensure endpoint has /v1/metrics path for HTTP
if not otel_endpoint.endswith("/v1/metrics"):
otel_endpoint = f"{otel_endpoint}/v1/metrics"
# Validate protocol is available
if protocol == "grpc" and not GRPC_AVAILABLE:
logger.warning(
"gRPC exporter not available, falling back to HTTP",
service=service_name
)
protocol = "http"
elif protocol == "http" and not HTTP_AVAILABLE:
logger.warning(
"HTTP exporter not available, falling back to gRPC",
service=service_name
)
protocol = "grpc"
if protocol not in ["grpc", "http"]:
logger.error(
"Invalid protocol specified",
service=service_name,
protocol=protocol
)
return None
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Configure OTLP exporter for metrics
otlp_exporter = OTLPMetricExporter(
endpoint=otel_endpoint,
timeout=10
)
# Determine which endpoint to use
if otel_endpoint:
# User provided override
if protocol == "grpc":
endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
else:
# Use config-determined endpoint
if protocol == "grpc":
endpoint = endpoints.metrics_grpc
else:
endpoint = endpoints.metrics_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure OTLP exporter based on protocol
if protocol == "grpc":
otlp_exporter = GrpcMetricExporter(
endpoint=endpoint,
insecure=True, # Set insecure=False in production with proper TLS
timeout=10
)
else: # http
otlp_exporter = HttpMetricExporter(
endpoint=endpoint,
timeout=10
)
# Create periodic metric reader
metric_reader = PeriodicExportingMetricReader(
@@ -108,9 +161,10 @@ def setup_otel_metrics(
metrics.set_meter_provider(meter_provider)
logger.info(
"OpenTelemetry metrics export configured",
"OpenTelemetry metrics export configured successfully",
service=service_name,
otel_endpoint=otel_endpoint,
endpoint=endpoint,
protocol=protocol,
export_interval_seconds=export_interval_millis / 1000
)
@@ -121,7 +175,7 @@ def setup_otel_metrics(
"Failed to setup OpenTelemetry metrics export",
service=service_name,
error=str(e),
reason="Will continue with Prometheus-only metrics"
protocol=protocol
)
return None