Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -1,14 +1,34 @@
"""
Shared monitoring package for microservices
Provides unified OpenTelemetry-based observability:
- Traces: Distributed tracing
- Metrics: System and application metrics
- Logs: Structured logging
All signals exported to SigNoz via OTLP.
"""
# Core setup - START HERE
from .logging import setup_logging
from .metrics import setup_metrics_early, get_metrics_collector, MetricsCollector
from .health_checks import (
HealthCheckManager,
FastAPIHealthChecker,
create_health_manager,
setup_fastapi_health_checks
from .telemetry import (
setup_telemetry,
setup_telemetry_simple,
get_telemetry_status,
TelemetryProviders
)
# Configuration
from .otel_config import OTelConfig, OTelEndpoints
# Individual signal setup (used by telemetry.py)
from .tracing import (
setup_tracing,
get_current_trace_id,
get_current_span_id,
add_trace_attributes,
add_trace_event,
record_exception
)
from .logs_exporter import (
setup_otel_logging,
@@ -27,23 +47,51 @@ from .system_metrics import (
setup_all_metrics
)
# Health checks
from .health_checks import (
HealthCheckManager,
FastAPIHealthChecker,
create_health_manager,
setup_fastapi_health_checks
)
__all__ = [
# CORE - Start with these
'setup_logging',
'setup_metrics_early',
'get_metrics_collector',
'MetricsCollector',
'HealthCheckManager',
'FastAPIHealthChecker',
'create_health_manager',
'setup_fastapi_health_checks',
'setup_telemetry',
'setup_telemetry_simple',
'get_telemetry_status',
'TelemetryProviders',
# Configuration
'OTelConfig',
'OTelEndpoints',
# Tracing
'setup_tracing',
'get_current_trace_id',
'get_current_span_id',
'add_trace_attributes',
'add_trace_event',
'record_exception',
# Logs
'setup_otel_logging',
'add_log_context',
'get_current_trace_context',
'StructlogOTELProcessor',
# Metrics
'setup_otel_metrics',
'OTelMetricsCollector',
'create_dual_metrics_collector',
'SystemMetricsCollector',
'ApplicationMetricsCollector',
'setup_all_metrics'
'setup_all_metrics',
# Health checks
'HealthCheckManager',
'FastAPIHealthChecker',
'create_health_manager',
'setup_fastapi_health_checks',
]

View File

@@ -1,6 +1,6 @@
"""
OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via OpenTelemetry Collector
Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
"""
import os
@@ -10,14 +10,21 @@ from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource
# Try to import HTTP log exporter (logs always use HTTP)
try:
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
try:
from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
OTLPLogExporter = None
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
HTTP_LOG_EXPORTER_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -31,13 +38,14 @@ def setup_otel_logging(
"""
Setup OpenTelemetry logging to export logs to SigNoz.
This integrates with Python's standard logging to automatically
export all log records to SigNoz via the OTLP protocol.
Uses HTTP protocol (port 4318) for sending logs to SigNoz.
Integrates with Python's standard logging to automatically export
all log records to SigNoz via the OTLP HTTP protocol.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
enable_console: Whether to also log to console (default: True)
Returns:
@@ -47,7 +55,7 @@ def setup_otel_logging(
from shared.monitoring.logs_exporter import setup_otel_logging
# Setup during service initialization
setup_otel_logging("auth-service", "1.0.0")
handler = setup_otel_logging("auth-service", "1.0.0")
# Now all standard logging calls will be exported to SigNoz
import logging
@@ -56,7 +64,7 @@ def setup_otel_logging(
"""
# Check if logging export is enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
if not OTelConfig.is_enabled("logs"):
logger.info(
"OpenTelemetry logs export disabled",
service=service_name,
@@ -64,59 +72,36 @@ def setup_otel_logging(
)
return None
# Get OTLP endpoint from environment or parameter
# For logs, we need to use the HTTP endpoint (port 4318), not the gRPC endpoint (port 4317)
if otel_endpoint is None:
# Try logs-specific endpoint first, then fall back to general OTLP endpoint
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_LOGS_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
# Check if HTTP log exporter is available
if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
logger.warning(
"OpenTelemetry HTTP log exporter not available",
service=service_name,
reason="opentelemetry-exporter-otlp-proto-http package not installed"
)
logger.info(f"Original OTLP endpoint for logs: {otel_endpoint}")
# If we got the tracing endpoint (4317), switch to logs endpoint (4318)
if otel_endpoint.endswith(":4317"):
logger.info("Converting tracing endpoint (4317) to logs endpoint (4318)")
otel_endpoint = otel_endpoint.replace(":4317", ":4318")
logger.info(f"Final OTLP endpoint for logs: {otel_endpoint}")
# Ensure endpoint has proper protocol prefix
if not otel_endpoint.startswith(("http://", "https://")):
# Default to HTTP for insecure connections
otel_endpoint = f"http://{otel_endpoint}"
# Ensure endpoint has /v1/logs path for HTTP
if not otel_endpoint.endswith("/v1/logs"):
otel_endpoint = f"{otel_endpoint}/v1/logs"
return None
try:
# Check if OTLPLogExporter is available
if OTLPLogExporter is None:
logger.warning(
"OpenTelemetry HTTP OTLP exporter not available",
service=service_name,
reason="opentelemetry-exporter-otlp-proto-http package not installed"
)
return None
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Use provided endpoint or get from config
if otel_endpoint:
http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
else:
http_endpoint = endpoints.logs_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure logger provider
logger_provider = LoggerProvider(resource=resource)
set_logger_provider(logger_provider)
# Configure OTLP exporter for logs
# Configure OTLP HTTP exporter for logs
otlp_exporter = OTLPLogExporter(
endpoint=otel_endpoint,
endpoint=http_endpoint,
timeout=10
)
@@ -135,9 +120,10 @@ def setup_otel_logging(
root_logger.addHandler(otel_handler)
logger.info(
"OpenTelemetry logs export configured",
"OpenTelemetry logs export configured successfully",
service=service_name,
otel_endpoint=otel_endpoint,
http_endpoint=http_endpoint,
protocol="http",
console_logging=enable_console
)
@@ -147,8 +133,7 @@ def setup_otel_logging(
logger.error(
"Failed to setup OpenTelemetry logs export",
service=service_name,
error=str(e),
reason="Will continue with standard logging only"
error=str(e)
)
return None

View File

@@ -1,6 +1,6 @@
"""
OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
"""
import os
@@ -9,8 +9,24 @@ from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.sdk.resources import Resource
# Import both gRPC and HTTP exporters
try:
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
GRPC_AVAILABLE = True
except ImportError:
GRPC_AVAILABLE = False
GrpcMetricExporter = None
try:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
HTTP_AVAILABLE = True
except ImportError:
HTTP_AVAILABLE = False
HttpMetricExporter = None
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -19,20 +35,21 @@ def setup_otel_metrics(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
export_interval_millis: int = 60000 # Export every 60 seconds
export_interval_millis: int = 60000, # Export every 60 seconds
protocol: Optional[str] = None # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
"""
Setup OpenTelemetry metrics to export to SigNoz.
This creates a dual-export strategy:
- Prometheus exposition format at /metrics (for Prometheus scraping)
- OTLP push to SigNoz collector (for direct ingestion)
Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
Default protocol is gRPC for better performance.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
export_interval_millis: How often to push metrics (default 60s)
otel_endpoint: Optional override for OTLP endpoint
export_interval_millis: How often to push metrics in milliseconds (default 60s)
protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"
Returns:
MeterProvider instance if successful, None otherwise
@@ -40,9 +57,12 @@ def setup_otel_metrics(
Example:
from shared.monitoring.metrics_exporter import setup_otel_metrics
# Setup during service initialization
# Setup with gRPC (default)
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
# Or with HTTP
meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")
# Create meters for your metrics
meter = meter_provider.get_meter(__name__)
request_counter = meter.create_counter(
@@ -56,8 +76,7 @@ def setup_otel_metrics(
"""
# Check if metrics export is enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if not enable_otel_metrics:
if not OTelConfig.is_enabled("metrics"):
logger.info(
"OpenTelemetry metrics export disabled",
service=service_name,
@@ -65,32 +84,66 @@ def setup_otel_metrics(
)
return None
# Get OTLP endpoint from environment or parameter
if otel_endpoint is None:
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
)
# Determine protocol to use
if protocol is None:
protocol = OTelConfig.get_protocol("metrics")
# Ensure endpoint has /v1/metrics path for HTTP
if not otel_endpoint.endswith("/v1/metrics"):
otel_endpoint = f"{otel_endpoint}/v1/metrics"
# Validate protocol is available
if protocol == "grpc" and not GRPC_AVAILABLE:
logger.warning(
"gRPC exporter not available, falling back to HTTP",
service=service_name
)
protocol = "http"
elif protocol == "http" and not HTTP_AVAILABLE:
logger.warning(
"HTTP exporter not available, falling back to gRPC",
service=service_name
)
protocol = "grpc"
if protocol not in ["grpc", "http"]:
logger.error(
"Invalid protocol specified",
service=service_name,
protocol=protocol
)
return None
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Configure OTLP exporter for metrics
otlp_exporter = OTLPMetricExporter(
endpoint=otel_endpoint,
timeout=10
)
# Determine which endpoint to use
if otel_endpoint:
# User provided override
if protocol == "grpc":
endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
else:
# Use config-determined endpoint
if protocol == "grpc":
endpoint = endpoints.metrics_grpc
else:
endpoint = endpoints.metrics_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure OTLP exporter based on protocol
if protocol == "grpc":
otlp_exporter = GrpcMetricExporter(
endpoint=endpoint,
insecure=True, # Use secure=False in production with proper TLS
timeout=10
)
else: # http
otlp_exporter = HttpMetricExporter(
endpoint=endpoint,
timeout=10
)
# Create periodic metric reader
metric_reader = PeriodicExportingMetricReader(
@@ -108,9 +161,10 @@ def setup_otel_metrics(
metrics.set_meter_provider(meter_provider)
logger.info(
"OpenTelemetry metrics export configured",
"OpenTelemetry metrics export configured successfully",
service=service_name,
otel_endpoint=otel_endpoint,
endpoint=endpoint,
protocol=protocol,
export_interval_seconds=export_interval_millis / 1000
)
@@ -121,7 +175,7 @@ def setup_otel_metrics(
"Failed to setup OpenTelemetry metrics export",
service=service_name,
error=str(e),
reason="Will continue with Prometheus-only metrics"
protocol=protocol
)
return None

View File

@@ -0,0 +1,286 @@
"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""
import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog
logger = structlog.get_logger()
@dataclass
class OTelEndpoints:
    """
    Resolved OpenTelemetry collector endpoints, one field per signal/protocol.

    SigNoz is reached over different protocols depending on the signal:
    - Traces: gRPC (port 4317)
    - Metrics: gRPC (port 4317) or HTTP (port 4318)
    - Logs: HTTP (port 4318)

    gRPC fields hold a bare "host:port" target; HTTP fields hold a full URL
    including scheme and signal path.
    """

    # gRPC target for traces, e.g. "host:4317"
    traces_grpc: str
    # gRPC target for metrics, e.g. "host:4317"
    metrics_grpc: str
    # HTTP URL for metrics, e.g. "http://host:4318/v1/metrics"
    metrics_http: str
    # HTTP URL for logs, e.g. "http://host:4318/v1/logs"
    logs_http: str
class OTelConfig:
    """
    Centralized configuration for OpenTelemetry exporters.

    Resolves collector endpoints and per-signal settings from environment
    variables and normalizes them for the protocol in use:
    - gRPC endpoints: host:port (no protocol prefix)
    - HTTP endpoints: http://host:port/path (with protocol and path)
    """

    # Default collector location (can be overridden by environment variables)
    DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
    DEFAULT_GRPC_PORT = 4317
    DEFAULT_HTTP_PORT = 4318

    @classmethod
    def get_endpoints(cls) -> "OTelEndpoints":
        """
        Resolve the OTLP endpoints for every signal from environment variables.

        Environment variables consulted:
        - OTEL_EXPORTER_OTLP_ENDPOINT: base endpoint. When unset or empty the
          default collector host on the default gRPC port is used.
        - OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: overrides the base for traces.
        - OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: overrides the base for metrics
          (the HTTP metrics URL is derived from the same host).
        - OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: overrides the base for logs.

        Returns:
            OTelEndpoints with all configured endpoints
        """
        # NOTE(review): an earlier docstring listed OTEL_COLLECTOR_ENDPOINT as
        # a legacy fallback, but this method has never read it. Confirm
        # whether that fallback should be restored before relying on it.
        base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
        if base_endpoint:
            base_grpc = cls._clean_grpc_endpoint(base_endpoint)
        else:
            base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"

        # Signal-specific endpoints win over the base endpoint
        traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
        metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
        logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")

        traces_grpc = cls._clean_grpc_endpoint(traces_endpoint)
        metrics_grpc = cls._clean_grpc_endpoint(metrics_endpoint)

        # The HTTP metrics URL targets the same host as the gRPC one
        metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")

        # Logs are always sent over HTTP
        if logs_endpoint:
            logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
        else:
            logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")

        endpoints = OTelEndpoints(
            traces_grpc=traces_grpc,
            metrics_grpc=metrics_grpc,
            metrics_http=metrics_http,
            logs_http=logs_http
        )

        logger.info(
            "OpenTelemetry endpoints configured",
            traces_grpc=endpoints.traces_grpc,
            metrics_grpc=endpoints.metrics_grpc,
            metrics_http=endpoints.metrics_http,
            logs_http=endpoints.logs_http
        )
        return endpoints

    @staticmethod
    def _clean_grpc_endpoint(endpoint: str) -> str:
        """
        Clean endpoint for gRPC usage (remove protocol, paths).

        Args:
            endpoint: Raw endpoint string

        Returns:
            Cleaned endpoint in format "host:port"; the default gRPC port is
            appended when the input carries no port.
        """
        # Remove protocol prefixes
        endpoint = endpoint.replace("http://", "").replace("https://", "")
        # Remove paths (gRPC doesn't use paths)
        endpoint = endpoint.split("/", 1)[0]
        # Ensure it has a port
        if ":" not in endpoint:
            endpoint = f"{endpoint}:{OTelConfig.DEFAULT_GRPC_PORT}"
        return endpoint

    @staticmethod
    def _extract_host(endpoint: str) -> str:
        """
        Extract host and convert to HTTP endpoint.

        Args:
            endpoint: Raw endpoint string

        Returns:
            HTTP endpoint without path (e.g., "http://host:4318")
        """
        # Drop the scheme, then any path, then any port
        clean = endpoint.replace("http://", "").replace("https://", "")
        clean = clean.split("/", 1)[0]
        host = clean.split(":")[0]
        return f"http://{host}:{OTelConfig.DEFAULT_HTTP_PORT}"

    @staticmethod
    def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
        """
        Convert gRPC endpoint to HTTP endpoint with path.

        Args:
            grpc_endpoint: gRPC endpoint (e.g., "host:4317")
            path: HTTP path (e.g., "/v1/metrics")

        Returns:
            HTTP endpoint (e.g., "http://host:4318/v1/metrics")
        """
        # split() returns the whole string when there is no ":" to split on
        host = grpc_endpoint.split(":")[0]
        return f"http://{host}:{OTelConfig.DEFAULT_HTTP_PORT}{path}"

    @staticmethod
    def _ensure_http_endpoint(endpoint: str, path: str) -> str:
        """
        Ensure endpoint is in HTTP format with the HTTP port and proper path.

        The scheme is preserved when it is http/https and defaults to http
        otherwise. Any port (or the absence of one) is normalized to the
        default HTTP port. An existing path is kept only if it already ends
        with ``path``; otherwise it is replaced by ``path``.

        Args:
            endpoint: Raw endpoint string
            path: Required path (e.g., "/v1/logs")

        Returns:
            HTTP endpoint with protocol, port and path
        """
        if endpoint.startswith(("http://", "https://")):
            scheme, _, remainder = endpoint.partition("://")
        else:
            scheme, remainder = "http", endpoint

        # Separate "host[:port]" from the path so the scheme's own colon can
        # never be mistaken for a port separator (this used to crash on
        # port-less endpoints such as "http://host").
        authority, slash, tail = remainder.partition("/")
        existing_path = slash + tail

        host = authority.rsplit(":", 1)[0] if ":" in authority else authority
        final_path = existing_path if existing_path.endswith(path) else path
        return f"{scheme}://{host}:{OTelConfig.DEFAULT_HTTP_PORT}{final_path}"

    @classmethod
    def get_resource_attributes(
        cls,
        service_name: str,
        service_version: str = "1.0.0"
    ) -> dict:
        """
        Get common resource attributes for all OTEL signals.

        Args:
            service_name: Name of the service
            service_version: Version of the service

        Returns:
            Dictionary of resource attributes
        """
        return {
            "service.name": service_name,
            "service.version": service_version,
            "deployment.environment": os.getenv("ENVIRONMENT", "development"),
            "k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
            "k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
            "k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
        }

    @classmethod
    def is_enabled(cls, signal: str) -> bool:
        """
        Check if a specific telemetry signal is enabled.

        Traces and metrics are on unless ENABLE_TRACING / ENABLE_OTEL_METRICS
        is set to something other than "true"; logs are on only when
        OTEL_LOGS_EXPORTER is "otlp".

        Args:
            signal: One of "traces", "metrics", "logs"

        Returns:
            True if signal is enabled, False otherwise (including for an
            unknown signal name)
        """
        signal = signal.lower()
        if signal == "traces":
            return os.getenv("ENABLE_TRACING", "true").lower() == "true"
        if signal == "metrics":
            return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
        if signal == "logs":
            return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
        return False

    @classmethod
    def get_protocol(cls, signal: str) -> str:
        """
        Get the preferred protocol for a signal.

        Reads OTEL_EXPORTER_OTLP_PROTOCOL and the signal-specific
        OTEL_EXPORTER_OTLP_TRACES_PROTOCOL / OTEL_EXPORTER_OTLP_METRICS_PROTOCOL
        override. Values are normalized so that the OpenTelemetry-standard
        spellings ("http/protobuf", "http/json") and differently-cased values
        map onto the two names the exporters in this package understand.

        Args:
            signal: One of "traces", "metrics", "logs"

        Returns:
            Protocol name ("grpc" or "http"). An unrecognized value is
            returned lower-cased so the caller's validation can report it.
        """
        # Logs always use HTTP in our setup
        if signal == "logs":
            return "http"

        protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")

        # Signal-specific overrides
        if signal == "traces":
            protocol = os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
        elif signal == "metrics":
            protocol = os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)

        # An empty variable is treated the same as an unset one
        protocol = protocol.strip().lower() or "grpc"
        if protocol.startswith("http"):
            return "http"
        return protocol

View File

@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup
Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation
All signals are exported to SigNoz via OTLP.
"""
import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass
from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
logger = structlog.get_logger()
@dataclass
class TelemetryProviders:
    """
    Bundle of everything `setup_telemetry` may initialize for a service.

    Every field defaults to None and stays None when the corresponding
    signal is disabled or its setup did not succeed.

    Attributes:
        tracer_provider: Provider for distributed tracing
        meter_provider: Provider for metrics export
        logging_handler: Handler for structured logs
        system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
        app_metrics: Collector for application-level metrics (HTTP, DB)
    """

    # --- OTLP signal providers ---
    tracer_provider: Optional[Any] = None
    meter_provider: Optional[Any] = None
    logging_handler: Optional[Any] = None

    # --- metric collectors built on top of the meter provider ---
    system_metrics: Optional[SystemMetricsCollector] = None
    app_metrics: Optional[ApplicationMetricsCollector] = None
def setup_telemetry(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    enable_traces: bool = True,
    enable_metrics: bool = True,
    enable_logs: bool = True,
    enable_system_metrics: bool = True,
    metrics_protocol: Optional[str] = None,  # "grpc" or "http", defaults to grpc
    export_interval_millis: int = 60000
) -> TelemetryProviders:
    """
    Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.

    This is the UNIFIED setup function that configures everything:
    - Distributed tracing (gRPC, port 4317)
    - Metrics export (gRPC by default, port 4317)
    - System metrics collection (CPU, memory, disk, network)
    - Application metrics (HTTP requests, DB queries)
    - Structured logs export (HTTP, port 4318)

    All signals use the centralized OTelConfig for endpoint management.
    Each signal is set up independently: a failure in one is logged and does
    not prevent the others from being configured, and this function does not
    raise for setup errors.

    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        enable_traces: Enable distributed tracing (default: True)
        enable_metrics: Enable metrics export to OTLP (default: True)
        enable_logs: Enable logs export to OTLP (default: True)
        enable_system_metrics: Enable system metrics collection (default: True,
            can be disabled via ENABLE_SYSTEM_METRICS env). Collectors are only
            created when the OTLP meter provider was set up successfully.
        metrics_protocol: Protocol for metrics ("grpc" or "http"). When None,
            the protocol is taken from OTelConfig.get_protocol("metrics").
        export_interval_millis: How often to export metrics in milliseconds

    Returns:
        TelemetryProviders containing all initialized providers and collectors

    Example:
        from shared.monitoring.telemetry import setup_telemetry

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry(
            app,
            service_name="auth-service",
            service_version="1.0.0"
        )
        # All telemetry is now configured:
        # - Traces automatically captured for HTTP requests
        # - System metrics automatically collected
        # - Application metrics via providers.app_metrics
        # - Logs automatically correlated with traces
    """
    logger.info(
        "Setting up unified OpenTelemetry telemetry",
        service=service_name,
        version=service_version,
        traces=enable_traces,
        metrics=enable_metrics,
        logs=enable_logs,
        system_metrics=enable_system_metrics
    )

    providers = TelemetryProviders()

    # Setup distributed tracing
    if enable_traces and OTelConfig.is_enabled("traces"):
        try:
            providers.tracer_provider = setup_tracing(
                app,
                service_name=service_name,
                service_version=service_version
            )
            if providers.tracer_provider:
                logger.info("✓ Distributed tracing configured", service=service_name)
            else:
                logger.warning("✗ Distributed tracing setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))

    # Setup OTLP metrics export
    if enable_metrics and OTelConfig.is_enabled("metrics"):
        try:
            providers.meter_provider = setup_otel_metrics(
                service_name=service_name,
                service_version=service_version,
                protocol=metrics_protocol,
                export_interval_millis=export_interval_millis
            )
            if providers.meter_provider:
                logger.info("✓ OTLP metrics export configured", service=service_name)

                # Setup system and application metrics collectors
                if enable_system_metrics:
                    enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
                    if enable_system_env:
                        try:
                            providers.system_metrics, providers.app_metrics = setup_all_metrics(
                                service_name=service_name,
                                service_version=service_version,
                                meter_provider=providers.meter_provider
                            )
                            logger.info(
                                "✓ System and application metrics collectors initialized",
                                service=service_name,
                                system_metrics=["cpu", "memory", "disk", "network"],
                                app_metrics=["http_requests", "db_queries"]
                            )
                        except Exception as e:
                            logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
            else:
                logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))

    # Setup logs export
    if enable_logs and OTelConfig.is_enabled("logs"):
        try:
            providers.logging_handler = setup_otel_logging(
                service_name=service_name,
                service_version=service_version
            )
            if providers.logging_handler:
                logger.info("✓ Structured logs export configured", service=service_name)
            else:
                logger.warning("✗ Logs export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))

    # Log endpoint configuration summary
    try:
        endpoints = OTelConfig.get_endpoints()

        # Resolve the metrics protocol the same way setup_otel_metrics is
        # asked to: an explicit argument wins, otherwise the environment
        # decides. Looking only at the raw argument reported the gRPC
        # endpoint whenever HTTP was selected via environment variables.
        # NOTE(review): the exporter may still switch protocol internally if
        # the requested exporter package is missing; that is not visible here.
        if metrics_protocol is not None:
            effective_metrics_protocol = metrics_protocol
        else:
            effective_metrics_protocol = OTelConfig.get_protocol("metrics")

        if not providers.meter_provider:
            metrics_endpoint = "disabled"
        elif effective_metrics_protocol == "http":
            metrics_endpoint = endpoints.metrics_http
        else:
            metrics_endpoint = endpoints.metrics_grpc

        summary = {
            "service": service_name,
            "version": service_version,
            "traces": {
                "enabled": bool(providers.tracer_provider),
                "endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
            },
            "metrics": {
                "enabled": bool(providers.meter_provider),
                "endpoint": metrics_endpoint,
                "system_metrics": bool(providers.system_metrics),
                "app_metrics": bool(providers.app_metrics)
            },
            "logs": {
                "enabled": bool(providers.logging_handler),
                "endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
            }
        }
        logger.info("🎉 Telemetry setup complete", **summary)
    except Exception as e:
        # The summary is purely informational; never fail setup because of it
        logger.warning("Could not log endpoint summary", error=str(e))

    return providers
def setup_telemetry_simple(
    app,
    service_name: str,
    service_version: str = "1.0.0"
) -> TelemetryProviders:
    """
    Convenience wrapper: run `setup_telemetry` with every option at its default.

    With the defaults this means:
    - gRPC for traces (port 4317)
    - gRPC for metrics (port 4317)
    - HTTP for logs (port 4318)

    Everything else is read from environment variables and OTelConfig.

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        service_version: Version of the service

    Returns:
        TelemetryProviders containing all initialized providers

    Example:
        from shared.monitoring.telemetry import setup_telemetry_simple

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry_simple(app, "auth-service")
    """
    # Forward only the identifying arguments; all toggles keep their defaults
    identity = {
        "app": app,
        "service_name": service_name,
        "service_version": service_version,
    }
    return setup_telemetry(**identity)
def get_telemetry_status() -> Dict[str, Any]:
    """
    Report the telemetry configuration as currently resolved by OTelConfig.

    This reflects configuration only (enabled flags, protocols, endpoints);
    it does not check whether any provider has actually been initialized.

    Returns:
        Dictionary with one entry per signal ("traces", "metrics", "logs"),
        each holding its enabled flag, protocol and endpoint(s)

    Example:
        from shared.monitoring.telemetry import get_telemetry_status

        status = get_telemetry_status()
        print(f"Tracing enabled: {status['traces']['enabled']}")
    """
    resolved = OTelConfig.get_endpoints()

    # Traces are always exported over gRPC
    traces_status = {
        "enabled": OTelConfig.is_enabled("traces"),
        "protocol": "grpc",
        "endpoint": resolved.traces_grpc,
    }

    # Metrics can use either protocol, so both endpoints are reported
    metrics_status = {
        "enabled": OTelConfig.is_enabled("metrics"),
        "protocol": OTelConfig.get_protocol("metrics"),
        "grpc_endpoint": resolved.metrics_grpc,
        "http_endpoint": resolved.metrics_http,
    }

    # Logs are always exported over HTTP
    logs_status = {
        "enabled": OTelConfig.is_enabled("logs"),
        "protocol": "http",
        "endpoint": resolved.logs_http,
    }

    return {
        "traces": traces_status,
        "metrics": metrics_status,
        "logs": logs_status,
    }

View File

@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""
import os
import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
# Optional instrumentations (may not be installed in all services)
try:
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
try:
from opentelemetry.instrumentation.redis import RedisInstrumentor
REDIS_AVAILABLE = True
except ImportError:
REDIS_AVAILABLE = False
try:
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
SQLALCHEMY_AVAILABLE = True
except ImportError:
SQLALCHEMY_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -22,8 +43,8 @@ def setup_tracing(
app,
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
):
otel_endpoint: Optional[str] = None
) -> Optional[TracerProvider]:
"""
Setup OpenTelemetry distributed tracing for a FastAPI service.
@@ -33,35 +54,56 @@ def setup_tracing(
- Redis operations
- PostgreSQL/SQLAlchemy queries
Uses gRPC protocol (port 4317) for sending traces to SigNoz.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
Returns:
TracerProvider instance if successful, None otherwise
Example:
from shared.monitoring.tracing import setup_tracing
app = FastAPI(title="Auth Service")
setup_tracing(app, "auth-service")
tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
"""
# Check if tracing is enabled
if not OTelConfig.is_enabled("traces"):
logger.info(
"Distributed tracing disabled",
service=service_name,
reason="ENABLE_TRACING not set to 'true'"
)
return None
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": "production"
})
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Use provided endpoint or get from config
if otel_endpoint:
# Clean user-provided endpoint for gRPC
grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
grpc_endpoint = endpoints.traces_grpc
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure tracer provider
tracer_provider = TracerProvider(resource=resource)
trace.set_tracer_provider(tracer_provider)
# Configure OTLP exporter to send to SigNoz
# Configure OTLP gRPC exporter for traces
otlp_exporter = OTLPSpanExporter(
endpoint=otel_endpoint,
insecure=True # Use TLS in production
endpoint=grpc_endpoint,
insecure=True # Plaintext gRPC; use insecure=False with proper TLS credentials in production
)
# Add span processor with batching for performance
@@ -75,40 +117,46 @@ def setup_tracing(
excluded_urls="health,metrics" # Don't trace health/metrics endpoints
)
# Auto-instrument HTTPX (inter-service communication)
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
# Auto-instrument HTTPX (inter-service communication) if available
if HTTPX_AVAILABLE:
try:
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("HTTPX instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument HTTPX: {e}")
# Auto-instrument Redis
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument Redis if available
if REDIS_AVAILABLE:
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("Redis instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument PostgreSQL (psycopg2) - skip if not available
# Most services use asyncpg instead of psycopg2
# try:
# Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
# except Exception as e:
# logger.warning(f"Failed to instrument Psycopg2: {e}")
# Auto-instrument SQLAlchemy
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
# Auto-instrument SQLAlchemy if available
if SQLALCHEMY_AVAILABLE:
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("SQLAlchemy instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
logger.info(
"Distributed tracing configured",
"Distributed tracing configured successfully",
service=service_name,
otel_endpoint=otel_endpoint
grpc_endpoint=grpc_endpoint,
protocol="grpc"
)
return tracer_provider
except Exception as e:
logger.error(
"Failed to setup tracing - continuing without it",
service=service_name,
error=str(e)
)
return None
def get_current_trace_id() -> Optional[str]:

View File

@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
from shared.monitoring.metrics import setup_metrics_early
from shared.monitoring import (
setup_logging,
setup_telemetry
)
from shared.monitoring.health_checks import setup_fastapi_health_checks
from shared.monitoring.tracing import setup_tracing
from shared.database.base import DatabaseManager
if TYPE_CHECKING:
@@ -77,24 +78,13 @@ class BaseFastAPIService:
# Initialize logging
setup_logging(service_name, log_level)
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
setup_otel_logging(service_name, version)
self.logger = structlog.get_logger()
self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
except Exception as e:
self.logger = structlog.get_logger()
self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
else:
self.logger = structlog.get_logger()
self.logger = structlog.get_logger()
# Will be set during app creation
self.app: Optional[FastAPI] = None
self.metrics_collector = None
self.health_manager = None
self.alert_service = None
self.telemetry_providers = None # Contains all OTEL providers and metrics collectors
def create_app(self, **fastapi_kwargs) -> FastAPI:
"""
@@ -116,49 +106,25 @@ class BaseFastAPIService:
# Create FastAPI app
self.app = FastAPI(**config)
# Setup metrics BEFORE middleware and lifespan
if self.enable_metrics:
self.metrics_collector = setup_metrics_early(self.app, self.service_name)
# Setup OpenTelemetry metrics export if enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if enable_otel_metrics:
try:
self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
if self.otel_meter_provider:
self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
# Setup system metrics collection (CPU, memory, disk, network)
enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_metrics:
try:
self.system_metrics, self.app_metrics = setup_all_metrics(
self.service_name,
self.version,
self.otel_meter_provider
)
self.logger.info(f"System metrics collection enabled for {self.service_name}")
except Exception as e:
self.logger.warning(f"Failed to setup system metrics: {e}")
except Exception as e:
self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
# Setup distributed tracing
# Check both constructor flag and environment variable
tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
if tracing_enabled:
try:
otel_endpoint = os.getenv(
"OTEL_COLLECTOR_ENDPOINT",
"http://signoz-otel-collector.bakery-ia:4318"
)
setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
self.logger.info(f"Distributed tracing enabled for {self.service_name}")
except Exception as e:
self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
else:
self.logger.info(f"Distributed tracing disabled for {self.service_name}")
# Setup unified OpenTelemetry telemetry
# This single call configures:
# - Distributed tracing (gRPC, port 4317)
# - OTLP metrics export (gRPC, port 4317)
# - System metrics collection (CPU, memory, disk, network)
# - Application metrics (HTTP requests, DB queries)
# - Structured logs export (HTTP, port 4318)
try:
self.telemetry_providers = setup_telemetry(
app=self.app,
service_name=self.service_name,
service_version=self.version,
enable_traces=self.enable_tracing,
enable_metrics=self.enable_metrics,
enable_logs=True, # Controlled by OTEL_LOGS_EXPORTER env var
enable_system_metrics=True # Controlled by ENABLE_SYSTEM_METRICS env var
)
except Exception as e:
self.logger.warning("Failed to setup telemetry", error=str(e))
# Setup lifespan
self.app.router.lifespan_context = self._create_lifespan()
@@ -361,10 +327,6 @@ class BaseFastAPIService:
method=request.method
)
# Record error metric if available
if self.metrics_collector:
self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
return JSONResponse(
status_code=500,
content={
@@ -409,7 +371,10 @@ class BaseFastAPIService:
def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
"""
Register custom metrics for the service
Register custom OTEL metrics for the service.
Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
are automatically created by setup_telemetry(). Use this for additional custom metrics.
Args:
metrics_config: Dict with metric name as key and config as value
@@ -417,25 +382,36 @@ class BaseFastAPIService:
"user_registrations": {
"type": "counter",
"description": "Total user registrations",
"labels": ["status"]
"unit": "registrations"
}
}
"""
if not self.metrics_collector:
self.logger.warning("Metrics collector not available")
if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
self.logger.warning("OTEL meter provider not available - metrics not registered")
return
from opentelemetry.metrics import get_meter
meter = get_meter(self.service_name)
for metric_name, config in metrics_config.items():
metric_type = config.get("type", "counter")
description = config.get("description", f"{metric_name} metric")
labels = config.get("labels", [])
unit = config.get("unit", "1")
if metric_type == "counter":
self.metrics_collector.register_counter(metric_name, description, labels=labels)
elif metric_type == "histogram":
self.metrics_collector.register_histogram(metric_name, description, labels=labels)
else:
self.logger.warning(f"Unsupported metric type: {metric_type}")
try:
if metric_type == "counter":
meter.create_counter(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom counter: {metric_name}")
elif metric_type == "histogram":
meter.create_histogram(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom histogram: {metric_name}")
elif metric_type == "gauge":
meter.create_up_down_counter(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom gauge: {metric_name}")
else:
self.logger.warning(f"Unsupported metric type: {metric_type}")
except Exception as e:
self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
"""