Improve monitoring 5
This commit is contained in:
@@ -1,14 +1,34 @@
|
||||
"""
|
||||
Shared monitoring package for microservices
|
||||
|
||||
Provides unified OpenTelemetry-based observability:
|
||||
- Traces: Distributed tracing
|
||||
- Metrics: System and application metrics
|
||||
- Logs: Structured logging
|
||||
|
||||
All signals exported to SigNoz via OTLP.
|
||||
"""
|
||||
|
||||
# Core setup - START HERE
|
||||
from .logging import setup_logging
|
||||
from .metrics import setup_metrics_early, get_metrics_collector, MetricsCollector
|
||||
from .health_checks import (
|
||||
HealthCheckManager,
|
||||
FastAPIHealthChecker,
|
||||
create_health_manager,
|
||||
setup_fastapi_health_checks
|
||||
from .telemetry import (
|
||||
setup_telemetry,
|
||||
setup_telemetry_simple,
|
||||
get_telemetry_status,
|
||||
TelemetryProviders
|
||||
)
|
||||
|
||||
# Configuration
|
||||
from .otel_config import OTelConfig, OTelEndpoints
|
||||
|
||||
# Individual signal setup (used by telemetry.py)
|
||||
from .tracing import (
|
||||
setup_tracing,
|
||||
get_current_trace_id,
|
||||
get_current_span_id,
|
||||
add_trace_attributes,
|
||||
add_trace_event,
|
||||
record_exception
|
||||
)
|
||||
from .logs_exporter import (
|
||||
setup_otel_logging,
|
||||
@@ -27,23 +47,51 @@ from .system_metrics import (
|
||||
setup_all_metrics
|
||||
)
|
||||
|
||||
# Health checks
|
||||
from .health_checks import (
|
||||
HealthCheckManager,
|
||||
FastAPIHealthChecker,
|
||||
create_health_manager,
|
||||
setup_fastapi_health_checks
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# CORE - Start with these
|
||||
'setup_logging',
|
||||
'setup_metrics_early',
|
||||
'get_metrics_collector',
|
||||
'MetricsCollector',
|
||||
'HealthCheckManager',
|
||||
'FastAPIHealthChecker',
|
||||
'create_health_manager',
|
||||
'setup_fastapi_health_checks',
|
||||
'setup_telemetry',
|
||||
'setup_telemetry_simple',
|
||||
'get_telemetry_status',
|
||||
'TelemetryProviders',
|
||||
|
||||
# Configuration
|
||||
'OTelConfig',
|
||||
'OTelEndpoints',
|
||||
|
||||
# Tracing
|
||||
'setup_tracing',
|
||||
'get_current_trace_id',
|
||||
'get_current_span_id',
|
||||
'add_trace_attributes',
|
||||
'add_trace_event',
|
||||
'record_exception',
|
||||
|
||||
# Logs
|
||||
'setup_otel_logging',
|
||||
'add_log_context',
|
||||
'get_current_trace_context',
|
||||
'StructlogOTELProcessor',
|
||||
|
||||
# Metrics
|
||||
'setup_otel_metrics',
|
||||
'OTelMetricsCollector',
|
||||
'create_dual_metrics_collector',
|
||||
'SystemMetricsCollector',
|
||||
'ApplicationMetricsCollector',
|
||||
'setup_all_metrics'
|
||||
'setup_all_metrics',
|
||||
|
||||
# Health checks
|
||||
'HealthCheckManager',
|
||||
'FastAPIHealthChecker',
|
||||
'create_health_manager',
|
||||
'setup_fastapi_health_checks',
|
||||
]
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
OpenTelemetry Logs Integration for SigNoz
|
||||
Exports structured logs to SigNoz via OpenTelemetry Collector
|
||||
Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -10,14 +10,21 @@ from typing import Optional
|
||||
from opentelemetry._logs import set_logger_provider
|
||||
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
|
||||
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
|
||||
# Try to import HTTP log exporter (logs always use HTTP)
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
|
||||
HTTP_LOG_EXPORTER_AVAILABLE = True
|
||||
except ImportError:
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
|
||||
HTTP_LOG_EXPORTER_AVAILABLE = True
|
||||
except ImportError:
|
||||
OTLPLogExporter = None
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
HTTP_LOG_EXPORTER_AVAILABLE = False
|
||||
|
||||
from .otel_config import OTelConfig
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
@@ -31,13 +38,14 @@ def setup_otel_logging(
|
||||
"""
|
||||
Setup OpenTelemetry logging to export logs to SigNoz.
|
||||
|
||||
This integrates with Python's standard logging to automatically
|
||||
export all log records to SigNoz via the OTLP protocol.
|
||||
Uses HTTP protocol (port 4318) for sending logs to SigNoz.
|
||||
Integrates with Python's standard logging to automatically export
|
||||
all log records to SigNoz via the OTLP HTTP protocol.
|
||||
|
||||
Args:
|
||||
service_name: Name of the service (e.g., "auth-service")
|
||||
service_version: Version of the service
|
||||
otel_endpoint: OpenTelemetry collector endpoint (default from env)
|
||||
otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
|
||||
enable_console: Whether to also log to console (default: True)
|
||||
|
||||
Returns:
|
||||
@@ -47,7 +55,7 @@ def setup_otel_logging(
|
||||
from shared.monitoring.logs_exporter import setup_otel_logging
|
||||
|
||||
# Setup during service initialization
|
||||
setup_otel_logging("auth-service", "1.0.0")
|
||||
handler = setup_otel_logging("auth-service", "1.0.0")
|
||||
|
||||
# Now all standard logging calls will be exported to SigNoz
|
||||
import logging
|
||||
@@ -56,7 +64,7 @@ def setup_otel_logging(
|
||||
"""
|
||||
|
||||
# Check if logging export is enabled
|
||||
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
|
||||
if not OTelConfig.is_enabled("logs"):
|
||||
logger.info(
|
||||
"OpenTelemetry logs export disabled",
|
||||
service=service_name,
|
||||
@@ -64,59 +72,36 @@ def setup_otel_logging(
|
||||
)
|
||||
return None
|
||||
|
||||
# Get OTLP endpoint from environment or parameter
|
||||
# For logs, we need to use the HTTP endpoint (port 4318), not the gRPC endpoint (port 4317)
|
||||
if otel_endpoint is None:
|
||||
# Try logs-specific endpoint first, then fall back to general OTLP endpoint
|
||||
otel_endpoint = os.getenv(
|
||||
"OTEL_EXPORTER_OTLP_LOGS_ENDPOINT",
|
||||
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
|
||||
# Check if HTTP log exporter is available
|
||||
if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
|
||||
logger.warning(
|
||||
"OpenTelemetry HTTP log exporter not available",
|
||||
service=service_name,
|
||||
reason="opentelemetry-exporter-otlp-proto-http package not installed"
|
||||
)
|
||||
|
||||
logger.info(f"Original OTLP endpoint for logs: {otel_endpoint}")
|
||||
|
||||
# If we got the tracing endpoint (4317), switch to logs endpoint (4318)
|
||||
if otel_endpoint.endswith(":4317"):
|
||||
logger.info("Converting tracing endpoint (4317) to logs endpoint (4318)")
|
||||
otel_endpoint = otel_endpoint.replace(":4317", ":4318")
|
||||
|
||||
logger.info(f"Final OTLP endpoint for logs: {otel_endpoint}")
|
||||
|
||||
# Ensure endpoint has proper protocol prefix
|
||||
if not otel_endpoint.startswith(("http://", "https://")):
|
||||
# Default to HTTP for insecure connections
|
||||
otel_endpoint = f"http://{otel_endpoint}"
|
||||
|
||||
# Ensure endpoint has /v1/logs path for HTTP
|
||||
if not otel_endpoint.endswith("/v1/logs"):
|
||||
otel_endpoint = f"{otel_endpoint}/v1/logs"
|
||||
return None
|
||||
|
||||
try:
|
||||
# Check if OTLPLogExporter is available
|
||||
if OTLPLogExporter is None:
|
||||
logger.warning(
|
||||
"OpenTelemetry HTTP OTLP exporter not available",
|
||||
service=service_name,
|
||||
reason="opentelemetry-exporter-otlp-proto-http package not installed"
|
||||
)
|
||||
return None
|
||||
# Get endpoints from centralized config
|
||||
endpoints = OTelConfig.get_endpoints()
|
||||
|
||||
# Create resource with service information
|
||||
resource = Resource(attributes={
|
||||
SERVICE_NAME: service_name,
|
||||
SERVICE_VERSION: service_version,
|
||||
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
|
||||
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
|
||||
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
|
||||
})
|
||||
# Use provided endpoint or get from config
|
||||
if otel_endpoint:
|
||||
http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
|
||||
else:
|
||||
http_endpoint = endpoints.logs_http
|
||||
|
||||
# Get resource attributes
|
||||
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
|
||||
resource = Resource(attributes=resource_attrs)
|
||||
|
||||
# Configure logger provider
|
||||
logger_provider = LoggerProvider(resource=resource)
|
||||
set_logger_provider(logger_provider)
|
||||
|
||||
# Configure OTLP exporter for logs
|
||||
# Configure OTLP HTTP exporter for logs
|
||||
otlp_exporter = OTLPLogExporter(
|
||||
endpoint=otel_endpoint,
|
||||
endpoint=http_endpoint,
|
||||
timeout=10
|
||||
)
|
||||
|
||||
@@ -135,9 +120,10 @@ def setup_otel_logging(
|
||||
root_logger.addHandler(otel_handler)
|
||||
|
||||
logger.info(
|
||||
"OpenTelemetry logs export configured",
|
||||
"OpenTelemetry logs export configured successfully",
|
||||
service=service_name,
|
||||
otel_endpoint=otel_endpoint,
|
||||
http_endpoint=http_endpoint,
|
||||
protocol="http",
|
||||
console_logging=enable_console
|
||||
)
|
||||
|
||||
@@ -147,8 +133,7 @@ def setup_otel_logging(
|
||||
logger.error(
|
||||
"Failed to setup OpenTelemetry logs export",
|
||||
service=service_name,
|
||||
error=str(e),
|
||||
reason="Will continue with standard logging only"
|
||||
error=str(e)
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""
|
||||
OpenTelemetry Metrics Integration for SigNoz
|
||||
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
|
||||
Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -9,8 +9,24 @@ from typing import Optional
|
||||
from opentelemetry import metrics
|
||||
from opentelemetry.sdk.metrics import MeterProvider
|
||||
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
|
||||
# Import both gRPC and HTTP exporters
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
|
||||
GRPC_AVAILABLE = True
|
||||
except ImportError:
|
||||
GRPC_AVAILABLE = False
|
||||
GrpcMetricExporter = None
|
||||
|
||||
try:
|
||||
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
|
||||
HTTP_AVAILABLE = True
|
||||
except ImportError:
|
||||
HTTP_AVAILABLE = False
|
||||
HttpMetricExporter = None
|
||||
|
||||
from .otel_config import OTelConfig
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
@@ -19,20 +35,21 @@ def setup_otel_metrics(
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
otel_endpoint: Optional[str] = None,
|
||||
export_interval_millis: int = 60000 # Export every 60 seconds
|
||||
export_interval_millis: int = 60000, # Export every 60 seconds
|
||||
protocol: Optional[str] = None # "grpc" or "http", defaults to grpc
|
||||
) -> Optional[MeterProvider]:
|
||||
"""
|
||||
Setup OpenTelemetry metrics to export to SigNoz.
|
||||
|
||||
This creates a dual-export strategy:
|
||||
- Prometheus exposition format at /metrics (for Prometheus scraping)
|
||||
- OTLP push to SigNoz collector (for direct ingestion)
|
||||
Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
|
||||
Default protocol is gRPC for better performance.
|
||||
|
||||
Args:
|
||||
service_name: Name of the service (e.g., "auth-service")
|
||||
service_version: Version of the service
|
||||
otel_endpoint: OpenTelemetry collector endpoint (default from env)
|
||||
export_interval_millis: How often to push metrics (default 60s)
|
||||
otel_endpoint: Optional override for OTLP endpoint
|
||||
export_interval_millis: How often to push metrics in milliseconds (default 60s)
|
||||
protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"
|
||||
|
||||
Returns:
|
||||
MeterProvider instance if successful, None otherwise
|
||||
@@ -40,9 +57,12 @@ def setup_otel_metrics(
|
||||
Example:
|
||||
from shared.monitoring.metrics_exporter import setup_otel_metrics
|
||||
|
||||
# Setup during service initialization
|
||||
# Setup with gRPC (default)
|
||||
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
|
||||
|
||||
# Or with HTTP
|
||||
meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")
|
||||
|
||||
# Create meters for your metrics
|
||||
meter = meter_provider.get_meter(__name__)
|
||||
request_counter = meter.create_counter(
|
||||
@@ -56,8 +76,7 @@ def setup_otel_metrics(
|
||||
"""
|
||||
|
||||
# Check if metrics export is enabled
|
||||
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
|
||||
if not enable_otel_metrics:
|
||||
if not OTelConfig.is_enabled("metrics"):
|
||||
logger.info(
|
||||
"OpenTelemetry metrics export disabled",
|
||||
service=service_name,
|
||||
@@ -65,32 +84,66 @@ def setup_otel_metrics(
|
||||
)
|
||||
return None
|
||||
|
||||
# Get OTLP endpoint from environment or parameter
|
||||
if otel_endpoint is None:
|
||||
otel_endpoint = os.getenv(
|
||||
"OTEL_EXPORTER_OTLP_ENDPOINT",
|
||||
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
|
||||
)
|
||||
# Determine protocol to use
|
||||
if protocol is None:
|
||||
protocol = OTelConfig.get_protocol("metrics")
|
||||
|
||||
# Ensure endpoint has /v1/metrics path for HTTP
|
||||
if not otel_endpoint.endswith("/v1/metrics"):
|
||||
otel_endpoint = f"{otel_endpoint}/v1/metrics"
|
||||
# Validate protocol is available
|
||||
if protocol == "grpc" and not GRPC_AVAILABLE:
|
||||
logger.warning(
|
||||
"gRPC exporter not available, falling back to HTTP",
|
||||
service=service_name
|
||||
)
|
||||
protocol = "http"
|
||||
elif protocol == "http" and not HTTP_AVAILABLE:
|
||||
logger.warning(
|
||||
"HTTP exporter not available, falling back to gRPC",
|
||||
service=service_name
|
||||
)
|
||||
protocol = "grpc"
|
||||
|
||||
if protocol not in ["grpc", "http"]:
|
||||
logger.error(
|
||||
"Invalid protocol specified",
|
||||
service=service_name,
|
||||
protocol=protocol
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
# Create resource with service information
|
||||
resource = Resource(attributes={
|
||||
SERVICE_NAME: service_name,
|
||||
SERVICE_VERSION: service_version,
|
||||
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
|
||||
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
|
||||
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
|
||||
})
|
||||
# Get endpoints from centralized config
|
||||
endpoints = OTelConfig.get_endpoints()
|
||||
|
||||
# Configure OTLP exporter for metrics
|
||||
otlp_exporter = OTLPMetricExporter(
|
||||
endpoint=otel_endpoint,
|
||||
timeout=10
|
||||
)
|
||||
# Determine which endpoint to use
|
||||
if otel_endpoint:
|
||||
# User provided override
|
||||
if protocol == "grpc":
|
||||
endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
|
||||
else:
|
||||
endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
|
||||
else:
|
||||
# Use config-determined endpoint
|
||||
if protocol == "grpc":
|
||||
endpoint = endpoints.metrics_grpc
|
||||
else:
|
||||
endpoint = endpoints.metrics_http
|
||||
|
||||
# Get resource attributes
|
||||
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
|
||||
resource = Resource(attributes=resource_attrs)
|
||||
|
||||
# Configure OTLP exporter based on protocol
|
||||
if protocol == "grpc":
|
||||
otlp_exporter = GrpcMetricExporter(
|
||||
endpoint=endpoint,
|
||||
insecure=True, # Use secure=False in production with proper TLS
|
||||
timeout=10
|
||||
)
|
||||
else: # http
|
||||
otlp_exporter = HttpMetricExporter(
|
||||
endpoint=endpoint,
|
||||
timeout=10
|
||||
)
|
||||
|
||||
# Create periodic metric reader
|
||||
metric_reader = PeriodicExportingMetricReader(
|
||||
@@ -108,9 +161,10 @@ def setup_otel_metrics(
|
||||
metrics.set_meter_provider(meter_provider)
|
||||
|
||||
logger.info(
|
||||
"OpenTelemetry metrics export configured",
|
||||
"OpenTelemetry metrics export configured successfully",
|
||||
service=service_name,
|
||||
otel_endpoint=otel_endpoint,
|
||||
endpoint=endpoint,
|
||||
protocol=protocol,
|
||||
export_interval_seconds=export_interval_millis / 1000
|
||||
)
|
||||
|
||||
@@ -121,7 +175,7 @@ def setup_otel_metrics(
|
||||
"Failed to setup OpenTelemetry metrics export",
|
||||
service=service_name,
|
||||
error=str(e),
|
||||
reason="Will continue with Prometheus-only metrics"
|
||||
protocol=protocol
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
286
shared/monitoring/otel_config.py
Normal file
286
shared/monitoring/otel_config.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""
|
||||
Centralized OpenTelemetry Configuration
|
||||
Manages OTEL endpoints and settings for traces, metrics, and logs
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@dataclass
class OTelEndpoints:
    """
    Resolved OTLP collector endpoints, one per signal/protocol combination.

    gRPC endpoints are stored as bare "host:port" strings, while HTTP
    endpoints are stored as full URLs including the signal path:
    - Traces:  gRPC (port 4317)
    - Metrics: gRPC (port 4317) or HTTP (port 4318)
    - Logs:    HTTP (port 4318)
    """

    # gRPC endpoint for traces, e.g. "host:4317"
    traces_grpc: str
    # gRPC endpoint for metrics, e.g. "host:4317"
    metrics_grpc: str
    # HTTP endpoint for metrics, e.g. "http://host:4318/v1/metrics"
    metrics_http: str
    # HTTP endpoint for logs, e.g. "http://host:4318/v1/logs"
    logs_http: str
|
||||
|
||||
|
||||
class OTelConfig:
    """
    Centralized configuration for OpenTelemetry exporters.

    Resolves endpoint strings from environment variables and normalizes
    them into the two shapes the exporters expect:
    - gRPC endpoints: "host:port" (no scheme, no path)
    - HTTP endpoints: "http://host:port/path" (scheme and signal path)
    """

    # Default collector location (used when no environment override is set)
    DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
    DEFAULT_GRPC_PORT = 4317
    DEFAULT_HTTP_PORT = 4318

    @classmethod
    def get_endpoints(cls) -> "OTelEndpoints":
        """
        Build the endpoints for every signal from environment variables.

        Environment variables read:
        - OTEL_EXPORTER_OTLP_ENDPOINT: base endpoint; when unset or empty the
          default collector host with the default gRPC port is used.
        - OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: overrides the base for traces.
        - OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: overrides the base for metrics
          (the HTTP metrics endpoint is derived from the same host).
        - OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: overrides the base for logs.

        The legacy OTEL_COLLECTOR_ENDPOINT variable is not consulted here.

        Returns:
            OTelEndpoints with all configured endpoints
        """
        base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
        if base_endpoint:
            base_grpc = cls._clean_grpc_endpoint(base_endpoint)
        else:
            base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"

        # `or` (rather than a getenv default) so that a variable that is set
        # but empty falls back to the base endpoint instead of yielding ":4317".
        traces_grpc = cls._clean_grpc_endpoint(
            os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") or base_grpc
        )
        metrics_grpc = cls._clean_grpc_endpoint(
            os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") or base_grpc
        )

        # Metrics over HTTP target the same host as metrics over gRPC.
        metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")

        # Logs are always sent over HTTP.
        logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")
        if logs_endpoint:
            logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
        else:
            logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")

        endpoints = OTelEndpoints(
            traces_grpc=traces_grpc,
            metrics_grpc=metrics_grpc,
            metrics_http=metrics_http,
            logs_http=logs_http
        )

        logger.info(
            "OpenTelemetry endpoints configured",
            traces_grpc=endpoints.traces_grpc,
            metrics_grpc=endpoints.metrics_grpc,
            metrics_http=endpoints.metrics_http,
            logs_http=endpoints.logs_http
        )

        return endpoints

    @staticmethod
    def _split_endpoint(endpoint: str) -> Tuple[str, str, str, str]:
        """
        Split a raw endpoint string into (scheme, host, port, path).

        Any part that is absent is returned as an empty string. The path,
        when present, keeps its leading "/". A bracketed IPv6 literal
        without a port (e.g. "[::1]") is treated as host only.
        """
        scheme, sep, rest = endpoint.partition("://")
        if not sep:
            # No "://" at all: the whole string is authority (+ optional path)
            scheme, rest = "", endpoint

        authority, slash, tail = rest.partition("/")
        path = slash + tail

        if authority.endswith("]"):
            host, port = authority, ""
        else:
            host, colon, port = authority.rpartition(":")
            if not colon:
                host, port = authority, ""

        return scheme, host, port, path

    @staticmethod
    def _clean_grpc_endpoint(endpoint: str) -> str:
        """
        Clean endpoint for gRPC usage (remove scheme and path).

        Args:
            endpoint: Raw endpoint string

        Returns:
            Endpoint in "host:port" format; the default gRPC port is
            appended when the input carries no port.
        """
        _scheme, host, port, _path = OTelConfig._split_endpoint(endpoint)
        return f"{host}:{port or OTelConfig.DEFAULT_GRPC_PORT}"

    @staticmethod
    def _extract_host(endpoint: str) -> str:
        """
        Extract the host and convert it to an HTTP base endpoint.

        Args:
            endpoint: Raw endpoint string

        Returns:
            HTTP endpoint without path (e.g., "http://host:4318"); any port
            in the input is replaced by the default HTTP port.
        """
        _scheme, host, _port, _path = OTelConfig._split_endpoint(endpoint)
        return f"http://{host}:{OTelConfig.DEFAULT_HTTP_PORT}"

    @staticmethod
    def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
        """
        Convert a gRPC endpoint to an HTTP endpoint with path.

        Args:
            grpc_endpoint: gRPC endpoint (e.g., "host:4317")
            path: HTTP path (e.g., "/v1/metrics")

        Returns:
            HTTP endpoint (e.g., "http://host:4318/v1/metrics")
        """
        _scheme, host, _port, _path = OTelConfig._split_endpoint(grpc_endpoint)
        return f"http://{host}:{OTelConfig.DEFAULT_HTTP_PORT}{path}"

    @staticmethod
    def _ensure_http_endpoint(endpoint: str, path: str) -> str:
        """
        Ensure endpoint is in HTTP format with the default HTTP port and path.

        Args:
            endpoint: Raw endpoint string (scheme, port and path all optional)
            path: Required path (e.g., "/v1/logs")

        Returns:
            HTTP endpoint with scheme, port and path

        NOTE: the port is always forced to the default HTTP port (4318); an
        explicit port in the input, including the gRPC port 4317, is replaced.
        """
        scheme, host, _port, existing_path = OTelConfig._split_endpoint(endpoint)

        # Only http/https are meaningful here; anything else defaults to http
        if scheme not in ("http", "https"):
            scheme = "http"

        # Keep an existing path only when it already ends with the required
        # one; otherwise discard it and use the required path.
        final_path = existing_path if existing_path.endswith(path) else path

        return f"{scheme}://{host}:{OTelConfig.DEFAULT_HTTP_PORT}{final_path}"

    @classmethod
    def get_resource_attributes(
        cls,
        service_name: str,
        service_version: str = "1.0.0"
    ) -> dict:
        """
        Get common resource attributes for all OTEL signals.

        Args:
            service_name: Name of the service
            service_version: Version of the service

        Returns:
            Dictionary of resource attributes; deployment and Kubernetes
            values come from the ENVIRONMENT, K8S_NAMESPACE, HOSTNAME and
            K8S_CLUSTER_NAME environment variables (with defaults).
        """
        return {
            "service.name": service_name,
            "service.version": service_version,
            "deployment.environment": os.getenv("ENVIRONMENT", "development"),
            "k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
            "k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
            "k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
        }

    @classmethod
    def is_enabled(cls, signal: str) -> bool:
        """
        Check if a specific telemetry signal is enabled.

        Traces (ENABLE_TRACING) and metrics (ENABLE_OTEL_METRICS) are enabled
        unless the variable is set to something other than "true"; logs are
        enabled only when OTEL_LOGS_EXPORTER is "otlp".

        Args:
            signal: One of "traces", "metrics", "logs" (case-insensitive)

        Returns:
            True if signal is enabled, False otherwise (including for
            unknown signal names)
        """
        signal = signal.lower()

        if signal == "traces":
            return os.getenv("ENABLE_TRACING", "true").lower() == "true"
        if signal == "metrics":
            return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
        if signal == "logs":
            return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
        return False

    @classmethod
    def get_protocol(cls, signal: str) -> str:
        """
        Get the preferred protocol for a signal.

        Reads OTEL_EXPORTER_OTLP_PROTOCOL (default "grpc"), overridden for
        traces/metrics by OTEL_EXPORTER_OTLP_TRACES_PROTOCOL /
        OTEL_EXPORTER_OTLP_METRICS_PROTOCOL. Values such as "http/protobuf"
        or "http/json" are normalized to "http" so callers that only accept
        "grpc" or "http" do not reject them.

        Args:
            signal: One of "traces", "metrics", "logs" (case-insensitive)

        Returns:
            "grpc" or "http"; an unrecognized configured value is returned
            unchanged so the caller can report it.
        """
        signal = signal.lower()

        # Logs always use HTTP in our setup
        if signal == "logs":
            return "http"

        protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL") or "grpc"

        # Signal-specific overrides
        if signal == "traces":
            protocol = os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL") or protocol
        elif signal == "metrics":
            protocol = os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL") or protocol

        normalized = protocol.strip().lower()
        if normalized == "grpc":
            return "grpc"
        if normalized.startswith("http"):
            return "http"
        return protocol
|
||||
271
shared/monitoring/telemetry.py
Normal file
271
shared/monitoring/telemetry.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""
|
||||
Unified OpenTelemetry Telemetry Setup
|
||||
|
||||
Provides a single entry point to configure all telemetry signals:
|
||||
- Traces: Distributed tracing across services
|
||||
- Metrics: OTLP metrics export + system metrics collection
|
||||
- Logs: Structured logs with trace correlation
|
||||
|
||||
All signals are exported to SigNoz via OTLP.
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
from .otel_config import OTelConfig
|
||||
from .tracing import setup_tracing
|
||||
from .metrics_exporter import setup_otel_metrics
|
||||
from .logs_exporter import setup_otel_logging
|
||||
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@dataclass
class TelemetryProviders:
    """
    Bundle of the OpenTelemetry objects produced by telemetry setup.

    Every field defaults to None, so an instance can be created empty and
    filled in as each signal is configured.

    Attributes:
        tracer_provider: Provider for distributed tracing
        meter_provider: Provider for metrics export
        logging_handler: Handler for structured logs
        system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
        app_metrics: Collector for application-level metrics (HTTP, DB)
    """

    # Signal providers / handlers
    tracer_provider: Optional[Any] = None
    meter_provider: Optional[Any] = None
    logging_handler: Optional[Any] = None

    # Metric collectors
    system_metrics: Optional[SystemMetricsCollector] = None
    app_metrics: Optional[ApplicationMetricsCollector] = None
|
||||
|
||||
|
||||
def setup_telemetry(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    enable_traces: bool = True,
    enable_metrics: bool = True,
    enable_logs: bool = True,
    enable_system_metrics: bool = True,
    metrics_protocol: Optional[str] = None,  # "grpc" or "http"; None is forwarded as-is
    export_interval_millis: int = 60000
) -> TelemetryProviders:
    """
    Configure traces, metrics and logs for one service in a single call.

    Each signal is set up independently and on a best-effort basis: a failure
    in one signal is logged and leaves the corresponding field of the returned
    ``TelemetryProviders`` unset, without aborting the remaining signals.

    A signal is only attempted when BOTH its ``enable_*`` argument is true and
    ``OTelConfig.is_enabled(<signal>)`` agrees. The system/application metric
    collectors are additionally gated on a successfully created meter provider
    and on the ``ENABLE_SYSTEM_METRICS`` environment variable (default "true").

    Args:
        app: FastAPI application instance, handed to ``setup_tracing``
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        enable_traces: Attempt distributed tracing setup (default: True)
        enable_metrics: Attempt OTLP metrics export setup (default: True)
        enable_logs: Attempt OTLP logs export setup (default: True)
        enable_system_metrics: Attempt system/application collectors (default: True)
        metrics_protocol: Passed to ``setup_otel_metrics`` as ``protocol``
        export_interval_millis: Metrics export interval in milliseconds

    Returns:
        TelemetryProviders holding whatever was successfully initialised

    Example:
        from shared.monitoring.telemetry import setup_telemetry

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry(app, service_name="auth-service")
        # providers.app_metrics / providers.system_metrics are None when
        # metrics export could not be configured.
    """
    logger.info(
        "Setting up unified OpenTelemetry telemetry",
        service=service_name,
        version=service_version,
        traces=enable_traces,
        metrics=enable_metrics,
        logs=enable_logs,
        system_metrics=enable_system_metrics
    )

    providers = TelemetryProviders()

    # --- Traces -------------------------------------------------------------
    if enable_traces and OTelConfig.is_enabled("traces"):
        try:
            providers.tracer_provider = setup_tracing(
                app,
                service_name=service_name,
                service_version=service_version
            )
            if not providers.tracer_provider:
                logger.warning("✗ Distributed tracing setup returned None", service=service_name)
            else:
                logger.info("✓ Distributed tracing configured", service=service_name)
        except Exception as exc:
            logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(exc))

    # --- Metrics (exporter first, collectors only if the exporter exists) ----
    if enable_metrics and OTelConfig.is_enabled("metrics"):
        try:
            providers.meter_provider = setup_otel_metrics(
                service_name=service_name,
                service_version=service_version,
                protocol=metrics_protocol,
                export_interval_millis=export_interval_millis
            )
            if not providers.meter_provider:
                logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
            else:
                logger.info("✓ OTLP metrics export configured", service=service_name)

                # The env var acts as a kill switch on top of the argument.
                if enable_system_metrics and os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true":
                    try:
                        system_collector, app_collector = setup_all_metrics(
                            service_name=service_name,
                            service_version=service_version,
                            meter_provider=providers.meter_provider
                        )
                        providers.system_metrics = system_collector
                        providers.app_metrics = app_collector
                        logger.info(
                            "✓ System and application metrics collectors initialized",
                            service=service_name,
                            system_metrics=["cpu", "memory", "disk", "network"],
                            app_metrics=["http_requests", "db_queries"]
                        )
                    except Exception as exc:
                        # Collector failure is non-fatal: the exporter stays configured.
                        logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(exc))
        except Exception as exc:
            logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(exc))

    # --- Logs ---------------------------------------------------------------
    if enable_logs and OTelConfig.is_enabled("logs"):
        try:
            providers.logging_handler = setup_otel_logging(
                service_name=service_name,
                service_version=service_version
            )
            if not providers.logging_handler:
                logger.warning("✗ Logs export setup returned None", service=service_name)
            else:
                logger.info("✓ Structured logs export configured", service=service_name)
        except Exception as exc:
            logger.error("✗ Failed to setup logs export", service=service_name, error=str(exc))

    # --- Summary (purely informational; never allowed to fail the setup) ----
    try:
        endpoints = OTelConfig.get_endpoints()

        if providers.tracer_provider:
            traces_endpoint = endpoints.traces_grpc
        else:
            traces_endpoint = "disabled"

        # NOTE(review): anything other than the literal "http" (including None)
        # is reported as the gRPC endpoint here - confirm this matches what
        # setup_otel_metrics actually selects when protocol is None.
        if not providers.meter_provider:
            metrics_endpoint = "disabled"
        elif metrics_protocol == "http":
            metrics_endpoint = endpoints.metrics_http
        else:
            metrics_endpoint = endpoints.metrics_grpc

        if providers.logging_handler:
            logs_endpoint = endpoints.logs_http
        else:
            logs_endpoint = "disabled"

        summary = {
            "service": service_name,
            "version": service_version,
            "traces": {
                "enabled": bool(providers.tracer_provider),
                "endpoint": traces_endpoint
            },
            "metrics": {
                "enabled": bool(providers.meter_provider),
                "endpoint": metrics_endpoint,
                "system_metrics": bool(providers.system_metrics),
                "app_metrics": bool(providers.app_metrics)
            },
            "logs": {
                "enabled": bool(providers.logging_handler),
                "endpoint": logs_endpoint
            }
        }
        logger.info("🎉 Telemetry setup complete", **summary)
    except Exception as exc:
        logger.warning("Could not log endpoint summary", error=str(exc))

    return providers
|
||||
|
||||
|
||||
def setup_telemetry_simple(
    app,
    service_name: str,
    service_version: str = "1.0.0"
) -> TelemetryProviders:
    """
    Convenience wrapper: call ``setup_telemetry`` with every optional
    argument left at its default.

    Only ``app``, ``service_name`` and ``service_version`` are forwarded, so
    all signals are requested and the metrics protocol / export interval fall
    back to the defaults declared on ``setup_telemetry``.

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        service_version: Version of the service

    Returns:
        The TelemetryProviders returned by ``setup_telemetry``

    Example:
        from shared.monitoring.telemetry import setup_telemetry_simple

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry_simple(app, "auth-service")
    """
    providers = setup_telemetry(
        app=app,
        service_name=service_name,
        service_version=service_version
    )
    return providers
|
||||
|
||||
|
||||
def get_telemetry_status() -> Dict[str, Any]:
    """
    Report the telemetry configuration as seen by ``OTelConfig``.

    This reflects configuration only (enabled flags, protocols, endpoints);
    it does not inspect whether any provider was actually initialised.

    Returns:
        Dictionary with one entry per signal ("traces", "metrics", "logs").
        Traces and logs carry "enabled", "protocol" and "endpoint"; metrics
        carries "enabled", "protocol", "grpc_endpoint" and "http_endpoint".

    Example:
        from shared.monitoring.telemetry import get_telemetry_status

        status = get_telemetry_status()
        print(f"Tracing enabled: {status['traces']['enabled']}")
    """
    endpoints = OTelConfig.get_endpoints()

    # Protocols for traces and logs are fixed literals here; only the metrics
    # protocol is looked up from the config.
    traces_status = {
        "enabled": OTelConfig.is_enabled("traces"),
        "protocol": "grpc",
        "endpoint": endpoints.traces_grpc
    }
    metrics_status = {
        "enabled": OTelConfig.is_enabled("metrics"),
        "protocol": OTelConfig.get_protocol("metrics"),
        "grpc_endpoint": endpoints.metrics_grpc,
        "http_endpoint": endpoints.metrics_http
    }
    logs_status = {
        "enabled": OTelConfig.is_enabled("logs"),
        "protocol": "http",
        "endpoint": endpoints.logs_http
    }

    status: Dict[str, Any] = {}
    status["traces"] = traces_status
    status["metrics"] = metrics_status
    status["logs"] = logs_status
    return status
|
||||
@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
|
||||
Provides end-to-end request tracking across all services
|
||||
"""
|
||||
|
||||
import os
|
||||
import structlog
|
||||
from typing import Optional
|
||||
from opentelemetry import trace
|
||||
from opentelemetry.sdk.trace import TracerProvider
|
||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
||||
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
||||
from opentelemetry.sdk.resources import Resource
|
||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
||||
|
||||
# Core instrumentations (should always be available)
|
||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
||||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
||||
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
|
||||
|
||||
# Optional instrumentations (may not be installed in all services)
|
||||
try:
|
||||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
||||
HTTPX_AVAILABLE = True
|
||||
except ImportError:
|
||||
HTTPX_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
||||
REDIS_AVAILABLE = True
|
||||
except ImportError:
|
||||
REDIS_AVAILABLE = False
|
||||
|
||||
try:
|
||||
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
|
||||
SQLALCHEMY_AVAILABLE = True
|
||||
except ImportError:
|
||||
SQLALCHEMY_AVAILABLE = False
|
||||
|
||||
from .otel_config import OTelConfig
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
@@ -22,8 +43,8 @@ def setup_tracing(
|
||||
app,
|
||||
service_name: str,
|
||||
service_version: str = "1.0.0",
|
||||
otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
|
||||
):
|
||||
otel_endpoint: Optional[str] = None
|
||||
) -> Optional[TracerProvider]:
|
||||
"""
|
||||
Setup OpenTelemetry distributed tracing for a FastAPI service.
|
||||
|
||||
@@ -33,35 +54,56 @@ def setup_tracing(
|
||||
- Redis operations
|
||||
- PostgreSQL/SQLAlchemy queries
|
||||
|
||||
Uses gRPC protocol (port 4317) for sending traces to SigNoz.
|
||||
|
||||
Args:
|
||||
app: FastAPI application instance
|
||||
service_name: Name of the service (e.g., "auth-service")
|
||||
service_version: Version of the service
|
||||
otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
|
||||
otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
|
||||
|
||||
Returns:
|
||||
TracerProvider instance if successful, None otherwise
|
||||
|
||||
Example:
|
||||
from shared.monitoring.tracing import setup_tracing
|
||||
|
||||
app = FastAPI(title="Auth Service")
|
||||
setup_tracing(app, "auth-service")
|
||||
tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
|
||||
"""
|
||||
|
||||
# Check if tracing is enabled
|
||||
if not OTelConfig.is_enabled("traces"):
|
||||
logger.info(
|
||||
"Distributed tracing disabled",
|
||||
service=service_name,
|
||||
reason="ENABLE_TRACING not set to 'true'"
|
||||
)
|
||||
return None
|
||||
|
||||
try:
|
||||
# Create resource with service information
|
||||
resource = Resource(attributes={
|
||||
SERVICE_NAME: service_name,
|
||||
SERVICE_VERSION: service_version,
|
||||
"deployment.environment": "production"
|
||||
})
|
||||
# Get endpoints from centralized config
|
||||
endpoints = OTelConfig.get_endpoints()
|
||||
|
||||
# Use provided endpoint or get from config
|
||||
if otel_endpoint:
|
||||
# Clean user-provided endpoint for gRPC
|
||||
grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
|
||||
else:
|
||||
grpc_endpoint = endpoints.traces_grpc
|
||||
|
||||
# Get resource attributes
|
||||
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
|
||||
resource = Resource(attributes=resource_attrs)
|
||||
|
||||
# Configure tracer provider
|
||||
tracer_provider = TracerProvider(resource=resource)
|
||||
trace.set_tracer_provider(tracer_provider)
|
||||
|
||||
# Configure OTLP exporter to send to SigNoz
|
||||
# Configure OTLP gRPC exporter for traces
|
||||
otlp_exporter = OTLPSpanExporter(
|
||||
endpoint=otel_endpoint,
|
||||
insecure=True # Use TLS in production
|
||||
endpoint=grpc_endpoint,
|
||||
insecure=True # Use secure=False in production with proper TLS
|
||||
)
|
||||
|
||||
# Add span processor with batching for performance
|
||||
@@ -75,40 +117,46 @@ def setup_tracing(
|
||||
excluded_urls="health,metrics" # Don't trace health/metrics endpoints
|
||||
)
|
||||
|
||||
# Auto-instrument HTTPX (inter-service communication)
|
||||
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
# Auto-instrument HTTPX (inter-service communication) if available
|
||||
if HTTPX_AVAILABLE:
|
||||
try:
|
||||
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
logger.debug("HTTPX instrumentation enabled")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to instrument HTTPX: {e}")
|
||||
|
||||
# Auto-instrument Redis
|
||||
try:
|
||||
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to instrument Redis: {e}")
|
||||
# Auto-instrument Redis if available
|
||||
if REDIS_AVAILABLE:
|
||||
try:
|
||||
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
logger.debug("Redis instrumentation enabled")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to instrument Redis: {e}")
|
||||
|
||||
# Auto-instrument PostgreSQL (psycopg2) - skip if not available
|
||||
# Most services use asyncpg instead of psycopg2
|
||||
# try:
|
||||
# Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
|
||||
# except Exception as e:
|
||||
# logger.warning(f"Failed to instrument Psycopg2: {e}")
|
||||
|
||||
# Auto-instrument SQLAlchemy
|
||||
try:
|
||||
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
|
||||
# Auto-instrument SQLAlchemy if available
|
||||
if SQLALCHEMY_AVAILABLE:
|
||||
try:
|
||||
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
|
||||
logger.debug("SQLAlchemy instrumentation enabled")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
|
||||
|
||||
logger.info(
|
||||
"Distributed tracing configured",
|
||||
"Distributed tracing configured successfully",
|
||||
service=service_name,
|
||||
otel_endpoint=otel_endpoint
|
||||
grpc_endpoint=grpc_endpoint,
|
||||
protocol="grpc"
|
||||
)
|
||||
|
||||
return tracer_provider
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to setup tracing - continuing without it",
|
||||
service=service_name,
|
||||
error=str(e)
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
def get_current_trace_id() -> Optional[str]:
|
||||
|
||||
@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.routing import APIRouter
|
||||
|
||||
from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
|
||||
from shared.monitoring.metrics import setup_metrics_early
|
||||
from shared.monitoring import (
|
||||
setup_logging,
|
||||
setup_telemetry
|
||||
)
|
||||
from shared.monitoring.health_checks import setup_fastapi_health_checks
|
||||
from shared.monitoring.tracing import setup_tracing
|
||||
from shared.database.base import DatabaseManager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -77,24 +78,13 @@ class BaseFastAPIService:
|
||||
|
||||
# Initialize logging
|
||||
setup_logging(service_name, log_level)
|
||||
|
||||
# Setup OpenTelemetry logging export if enabled
|
||||
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
|
||||
try:
|
||||
setup_otel_logging(service_name, version)
|
||||
self.logger = structlog.get_logger()
|
||||
self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
|
||||
except Exception as e:
|
||||
self.logger = structlog.get_logger()
|
||||
self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
|
||||
else:
|
||||
self.logger = structlog.get_logger()
|
||||
self.logger = structlog.get_logger()
|
||||
|
||||
# Will be set during app creation
|
||||
self.app: Optional[FastAPI] = None
|
||||
self.metrics_collector = None
|
||||
self.health_manager = None
|
||||
self.alert_service = None
|
||||
self.telemetry_providers = None # Contains all OTEL providers and metrics collectors
|
||||
|
||||
def create_app(self, **fastapi_kwargs) -> FastAPI:
|
||||
"""
|
||||
@@ -116,49 +106,25 @@ class BaseFastAPIService:
|
||||
# Create FastAPI app
|
||||
self.app = FastAPI(**config)
|
||||
|
||||
# Setup metrics BEFORE middleware and lifespan
|
||||
if self.enable_metrics:
|
||||
self.metrics_collector = setup_metrics_early(self.app, self.service_name)
|
||||
|
||||
# Setup OpenTelemetry metrics export if enabled
|
||||
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
|
||||
if enable_otel_metrics:
|
||||
try:
|
||||
self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
|
||||
if self.otel_meter_provider:
|
||||
self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
|
||||
|
||||
# Setup system metrics collection (CPU, memory, disk, network)
|
||||
enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
|
||||
if enable_system_metrics:
|
||||
try:
|
||||
self.system_metrics, self.app_metrics = setup_all_metrics(
|
||||
self.service_name,
|
||||
self.version,
|
||||
self.otel_meter_provider
|
||||
)
|
||||
self.logger.info(f"System metrics collection enabled for {self.service_name}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to setup system metrics: {e}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
|
||||
|
||||
# Setup distributed tracing
|
||||
# Check both constructor flag and environment variable
|
||||
tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
|
||||
|
||||
if tracing_enabled:
|
||||
try:
|
||||
otel_endpoint = os.getenv(
|
||||
"OTEL_COLLECTOR_ENDPOINT",
|
||||
"http://signoz-otel-collector.bakery-ia:4318"
|
||||
)
|
||||
setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
|
||||
self.logger.info(f"Distributed tracing enabled for {self.service_name}")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
|
||||
else:
|
||||
self.logger.info(f"Distributed tracing disabled for {self.service_name}")
|
||||
# Setup unified OpenTelemetry telemetry
|
||||
# This single call configures:
|
||||
# - Distributed tracing (gRPC, port 4317)
|
||||
# - OTLP metrics export (gRPC, port 4317)
|
||||
# - System metrics collection (CPU, memory, disk, network)
|
||||
# - Application metrics (HTTP requests, DB queries)
|
||||
# - Structured logs export (HTTP, port 4318)
|
||||
try:
|
||||
self.telemetry_providers = setup_telemetry(
|
||||
app=self.app,
|
||||
service_name=self.service_name,
|
||||
service_version=self.version,
|
||||
enable_traces=self.enable_tracing,
|
||||
enable_metrics=self.enable_metrics,
|
||||
enable_logs=True, # Controlled by OTEL_LOGS_EXPORTER env var
|
||||
enable_system_metrics=True # Controlled by ENABLE_SYSTEM_METRICS env var
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.warning("Failed to setup telemetry", error=str(e))
|
||||
|
||||
# Setup lifespan
|
||||
self.app.router.lifespan_context = self._create_lifespan()
|
||||
@@ -361,10 +327,6 @@ class BaseFastAPIService:
|
||||
method=request.method
|
||||
)
|
||||
|
||||
# Record error metric if available
|
||||
if self.metrics_collector:
|
||||
self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
|
||||
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content={
|
||||
@@ -409,7 +371,10 @@ class BaseFastAPIService:
|
||||
|
||||
def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
|
||||
"""
|
||||
Register custom metrics for the service
|
||||
Register custom OTEL metrics for the service.
|
||||
|
||||
Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
|
||||
are automatically created by setup_telemetry(). Use this for additional custom metrics.
|
||||
|
||||
Args:
|
||||
metrics_config: Dict with metric name as key and config as value
|
||||
@@ -417,25 +382,36 @@ class BaseFastAPIService:
|
||||
"user_registrations": {
|
||||
"type": "counter",
|
||||
"description": "Total user registrations",
|
||||
"labels": ["status"]
|
||||
"unit": "registrations"
|
||||
}
|
||||
}
|
||||
"""
|
||||
if not self.metrics_collector:
|
||||
self.logger.warning("Metrics collector not available")
|
||||
if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
|
||||
self.logger.warning("OTEL meter provider not available - metrics not registered")
|
||||
return
|
||||
|
||||
from opentelemetry.metrics import get_meter
|
||||
meter = get_meter(self.service_name)
|
||||
|
||||
for metric_name, config in metrics_config.items():
|
||||
metric_type = config.get("type", "counter")
|
||||
description = config.get("description", f"{metric_name} metric")
|
||||
labels = config.get("labels", [])
|
||||
unit = config.get("unit", "1")
|
||||
|
||||
if metric_type == "counter":
|
||||
self.metrics_collector.register_counter(metric_name, description, labels=labels)
|
||||
elif metric_type == "histogram":
|
||||
self.metrics_collector.register_histogram(metric_name, description, labels=labels)
|
||||
else:
|
||||
self.logger.warning(f"Unsupported metric type: {metric_type}")
|
||||
try:
|
||||
if metric_type == "counter":
|
||||
meter.create_counter(metric_name, description=description, unit=unit)
|
||||
self.logger.info(f"Registered custom counter: {metric_name}")
|
||||
elif metric_type == "histogram":
|
||||
meter.create_histogram(metric_name, description=description, unit=unit)
|
||||
self.logger.info(f"Registered custom histogram: {metric_name}")
|
||||
elif metric_type == "gauge":
|
||||
meter.create_up_down_counter(metric_name, description=description, unit=unit)
|
||||
self.logger.info(f"Registered custom gauge: {metric_name}")
|
||||
else:
|
||||
self.logger.warning(f"Unsupported metric type: {metric_type}")
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
|
||||
|
||||
def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user