Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""
import os
import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
# Optional instrumentations (may not be installed in all services)
try:
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
try:
from opentelemetry.instrumentation.redis import RedisInstrumentor
REDIS_AVAILABLE = True
except ImportError:
REDIS_AVAILABLE = False
try:
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
SQLALCHEMY_AVAILABLE = True
except ImportError:
SQLALCHEMY_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -22,8 +43,8 @@ def setup_tracing(
app,
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
):
otel_endpoint: Optional[str] = None
) -> Optional[TracerProvider]:
"""
Setup OpenTelemetry distributed tracing for a FastAPI service.
@@ -33,35 +54,56 @@ def setup_tracing(
- Redis operations
- PostgreSQL/SQLAlchemy queries
Uses gRPC protocol (port 4317) for sending traces to SigNoz.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
Returns:
TracerProvider instance if successful, None otherwise
Example:
from shared.monitoring.tracing import setup_tracing
app = FastAPI(title="Auth Service")
setup_tracing(app, "auth-service")
tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
"""
# Check if tracing is enabled
if not OTelConfig.is_enabled("traces"):
logger.info(
"Distributed tracing disabled",
service=service_name,
reason="ENABLE_TRACING not set to 'true'"
)
return None
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": "production"
})
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Use provided endpoint or get from config
if otel_endpoint:
# Clean user-provided endpoint for gRPC
grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
grpc_endpoint = endpoints.traces_grpc
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure tracer provider
tracer_provider = TracerProvider(resource=resource)
trace.set_tracer_provider(tracer_provider)
# Configure OTLP exporter to send to SigNoz
# Configure OTLP gRPC exporter for traces
otlp_exporter = OTLPSpanExporter(
endpoint=otel_endpoint,
insecure=True # Use TLS in production
endpoint=grpc_endpoint,
insecure=True # Use insecure=False in production with proper TLS
)
# Add span processor with batching for performance
@@ -75,40 +117,46 @@ def setup_tracing(
excluded_urls="health,metrics" # Don't trace health/metrics endpoints
)
# Auto-instrument HTTPX (inter-service communication)
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
# Auto-instrument HTTPX (inter-service communication) if available
if HTTPX_AVAILABLE:
try:
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("HTTPX instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument HTTPX: {e}")
# Auto-instrument Redis
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument Redis if available
if REDIS_AVAILABLE:
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("Redis instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument PostgreSQL (psycopg2) - skip if not available
# Most services use asyncpg instead of psycopg2
# try:
# Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
# except Exception as e:
# logger.warning(f"Failed to instrument Psycopg2: {e}")
# Auto-instrument SQLAlchemy
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
# Auto-instrument SQLAlchemy if available
if SQLALCHEMY_AVAILABLE:
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("SQLAlchemy instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
logger.info(
"Distributed tracing configured",
"Distributed tracing configured successfully",
service=service_name,
otel_endpoint=otel_endpoint
grpc_endpoint=grpc_endpoint,
protocol="grpc"
)
return tracer_provider
except Exception as e:
logger.error(
"Failed to setup tracing - continuing without it",
service=service_name,
error=str(e)
)
return None
def get_current_trace_id() -> Optional[str]: