Files
bakery-ia/shared/monitoring/otel_config.py

294 lines
9.8 KiB
Python

"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""
import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog
# Module-level structlog logger; OTelConfig.get_endpoints() uses it to report
# the resolved endpoints and any rejected endpoint values.
logger = structlog.get_logger()
@dataclass
class OTelEndpoints:
    """
    Resolved OpenTelemetry exporter endpoints, one field per signal/transport.

    SigNoz uses different protocols for different signals:
    - Traces: gRPC (port 4317)
    - Metrics: gRPC (port 4317) or HTTP (port 4318)
    - Logs: HTTP (port 4318)

    Attributes:
        traces_grpc: gRPC endpoint for traces, e.g. "host:4317".
        metrics_grpc: gRPC endpoint for metrics, e.g. "host:4317".
        metrics_http: HTTP endpoint for metrics,
            e.g. "http://host:4318/v1/metrics".
        logs_http: HTTP endpoint for logs, e.g. "http://host:4318/v1/logs".
    """

    # gRPC targets are written as "host:port" (no scheme, no path).
    traces_grpc: str
    metrics_grpc: str
    # HTTP targets are full URLs including scheme and signal path.
    metrics_http: str
    logs_http: str
class OTelConfig:
    """
    Centralized configuration for OpenTelemetry exporters.

    This class resolves endpoint URLs and ensures proper protocol usage:
    - gRPC endpoints: host:port (no protocol prefix)
    - HTTP endpoints: http://host:port/path (with protocol and path)
    """

    # Default collector location, used whenever no usable endpoint is configured.
    DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
    DEFAULT_GRPC_PORT = 4317
    DEFAULT_HTTP_PORT = 4318

    # Lower-case substrings treated as an unresolved placeholder / secret
    # reference (e.g. "$(VAR)", "${VAR}", "{{ .Values.x }}", a literal
    # "secretKeyRef") instead of a real hostname.
    _SECRET_REFERENCE_MARKERS = ("$(", "${", "{{", "secretkeyref")

    @classmethod
    def get_endpoints(cls) -> "OTelEndpoints":
        """
        Get OpenTelemetry endpoints from environment variables with fallbacks.

        Environment variables read by this method:
            OTEL_EXPORTER_OTLP_TRACES_ENDPOINT  - traces endpoint (overrides base)
            OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - metrics endpoint (overrides base)
            OTEL_EXPORTER_OTLP_LOGS_ENDPOINT    - logs endpoint (overrides base)
            OTEL_EXPORTER_OTLP_ENDPOINT         - base endpoint for every signal

        Unset, empty, malformed or placeholder values fall back to the next
        level: signal-specific -> base -> built-in default collector.
        OTEL_COLLECTOR_ENDPOINT (legacy) is not consulted by this method.

        Returns:
            OTelEndpoints with all configured endpoints
        """
        # Base endpoint: validated value, or the default collector when the
        # variable is unset/empty/unusable.
        base_grpc = cls._clean_and_validate_grpc_endpoint(
            os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") or ""
        )

        # "or base_grpc" (rather than a getenv default) so that a variable
        # that is set but empty also falls back to the base endpoint.
        traces_grpc = cls._clean_and_validate_grpc_endpoint(
            os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") or base_grpc,
            fallback=base_grpc,
        )
        metrics_grpc = cls._clean_and_validate_grpc_endpoint(
            os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") or base_grpc,
            fallback=base_grpc,
        )

        # The HTTP metrics endpoint targets the same host as the gRPC one.
        metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")

        # Logs are exported over HTTP.
        logs_endpoint = (os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT") or "").strip()
        if logs_endpoint and cls._contains_secret_reference(logs_endpoint):
            logger.error(
                "OTEL endpoint contains secret reference, falling back to default",
                malformed_endpoint=logs_endpoint,
            )
            logs_endpoint = ""
        if logs_endpoint:
            logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
        else:
            logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")

        endpoints = OTelEndpoints(
            traces_grpc=traces_grpc,
            metrics_grpc=metrics_grpc,
            metrics_http=metrics_http,
            logs_http=logs_http,
        )
        logger.info(
            "OpenTelemetry endpoints configured",
            traces_grpc=endpoints.traces_grpc,
            metrics_grpc=endpoints.metrics_grpc,
            metrics_http=endpoints.metrics_http,
            logs_http=endpoints.logs_http,
        )
        return endpoints

    @staticmethod
    def _authority(endpoint: str) -> str:
        """Return the ``host[:port]`` part of *endpoint* (scheme and path removed)."""
        remainder = endpoint.strip()
        if "://" in remainder:
            remainder = remainder.partition("://")[2]
        return remainder.partition("/")[0]

    @staticmethod
    def _strip_port(authority: str) -> str:
        """Return the host part of ``host[:port]``; bracketed IPv6 literals are kept whole."""
        if authority.startswith("["):
            closing = authority.find("]")
            if closing != -1:
                return authority[: closing + 1]
        return authority.split(":", 1)[0]

    @classmethod
    def _contains_secret_reference(cls, endpoint: str) -> bool:
        """
        Tell whether an endpoint still contains an unresolved placeholder.

        Args:
            endpoint: Raw endpoint string

        Returns:
            True if any marker from ``_SECRET_REFERENCE_MARKERS`` occurs in the
            endpoint (case-insensitive), False otherwise
        """
        lowered = endpoint.lower()
        return any(marker in lowered for marker in cls._SECRET_REFERENCE_MARKERS)

    @classmethod
    def _is_valid_grpc_endpoint(cls, endpoint: str) -> bool:
        """Check that *endpoint* is ``host:port`` with a non-empty host and a port in 1-65535."""
        host = cls._strip_port(endpoint)
        port_part = endpoint[len(host):]
        if not host or any(ch.isspace() for ch in host):
            return False
        if not port_part.startswith(":"):
            return False
        port = port_part[1:]
        # isdecimal() + length guard keeps int() from raising on odd input.
        if not port.isdecimal() or len(port) > 5:
            return False
        return 0 < int(port) <= 65535

    @classmethod
    def _clean_and_validate_grpc_endpoint(
        cls,
        endpoint: str,
        fallback: Optional[str] = None
    ) -> str:
        """
        Clean an endpoint for gRPC usage and reject unusable values.

        Args:
            endpoint: Raw endpoint string (may be empty)
            fallback: Endpoint returned when ``endpoint`` is unusable; defaults
                to the built-in collector on the default gRPC port

        Returns:
            Cleaned endpoint in format "host:port", or ``fallback``
        """
        if fallback is None:
            fallback = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
        # Nothing configured: use the fallback silently.
        if not endpoint or not endpoint.strip():
            return fallback
        if cls._contains_secret_reference(endpoint):
            logger.error(
                "OTEL endpoint contains secret reference, falling back to default",
                malformed_endpoint=endpoint,
            )
            return fallback
        cleaned = cls._clean_grpc_endpoint(endpoint)
        if not cls._is_valid_grpc_endpoint(cleaned):
            logger.error(
                "OTEL endpoint is malformed, falling back to default",
                malformed_endpoint=endpoint,
            )
            return fallback
        return cleaned

    @classmethod
    def _clean_grpc_endpoint(cls, endpoint: str) -> str:
        """
        Clean endpoint for gRPC usage (remove protocol, paths).

        Args:
            endpoint: Raw endpoint string

        Returns:
            Cleaned endpoint in format "host:port"; the default gRPC port is
            appended when the endpoint carries no port
        """
        authority = cls._authority(endpoint)
        # No port present when stripping the port changes nothing.
        if cls._strip_port(authority) == authority:
            authority = f"{authority}:{cls.DEFAULT_GRPC_PORT}"
        return authority

    @classmethod
    def _extract_host(cls, endpoint: str) -> str:
        """
        Extract host and convert to HTTP endpoint.

        Args:
            endpoint: Raw endpoint string

        Returns:
            HTTP endpoint without path (e.g., "http://host:4318"); any port in
            the input is replaced by the default HTTP port
        """
        host = cls._strip_port(cls._authority(endpoint))
        return f"http://{host}:{cls.DEFAULT_HTTP_PORT}"

    @classmethod
    def _grpc_to_http_endpoint(cls, grpc_endpoint: str, path: str) -> str:
        """
        Convert gRPC endpoint to HTTP endpoint with path.

        Args:
            grpc_endpoint: gRPC endpoint (e.g., "host:4317")
            path: HTTP path (e.g., "/v1/metrics")

        Returns:
            HTTP endpoint (e.g., "http://host:4318/v1/metrics")
        """
        host = cls._strip_port(cls._authority(grpc_endpoint))
        return f"http://{host}:{cls.DEFAULT_HTTP_PORT}{path}"

    @classmethod
    def _ensure_http_endpoint(cls, endpoint: str, path: str) -> str:
        """
        Ensure endpoint is in HTTP format with proper port and path.

        The scheme is kept when it is https and is http otherwise. The port is
        always set to the default HTTP port, whether or not the input had one.
        An existing path is kept only when it already ends with ``path``.

        Args:
            endpoint: Raw endpoint string
            path: Required path (e.g., "/v1/logs")

        Returns:
            HTTP endpoint with protocol, port and path
        """
        remainder = endpoint.strip()
        scheme = "http"
        if "://" in remainder:
            declared, _, remainder = remainder.partition("://")
            if declared.lower() == "https":
                scheme = "https"
        # Split structurally, so the scheme's own ":" is never mistaken for a
        # port separator and a missing port cannot corrupt the URL.
        authority, slash, tail = remainder.partition("/")
        existing_path = slash + tail
        host = cls._strip_port(authority)
        final_path = existing_path if existing_path.endswith(path) else path
        return f"{scheme}://{host}:{cls.DEFAULT_HTTP_PORT}{final_path}"

    @classmethod
    def get_resource_attributes(
        cls,
        service_name: str,
        service_version: str = "1.0.0"
    ) -> dict:
        """
        Get common resource attributes for all OTEL signals.

        Args:
            service_name: Name of the service
            service_version: Version of the service

        Returns:
            Dictionary of resource attributes; environment-derived entries use
            the defaults shown below when the variable is unset
        """
        return {
            "service.name": service_name,
            "service.version": service_version,
            "deployment.environment": os.getenv("ENVIRONMENT", "development"),
            "k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
            "k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
            "k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
        }

    @classmethod
    def is_enabled(cls, signal: str) -> bool:
        """
        Check if a specific telemetry signal is enabled.

        Args:
            signal: One of "traces", "metrics", "logs" (case-insensitive)

        Returns:
            True if signal is enabled, False otherwise (including for unknown
            signal names)
        """
        signal = signal.lower()
        if signal == "traces":
            return os.getenv("ENABLE_TRACING", "true").lower() == "true"
        if signal == "metrics":
            return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
        if signal == "logs":
            # Logs are opt-in: only enabled when the exporter is set to "otlp".
            return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
        return False

    @classmethod
    def get_protocol(cls, signal: str) -> str:
        """
        Get the preferred protocol for a signal.

        Args:
            signal: One of "traces", "metrics", "logs" (case-insensitive)

        Returns:
            "http" for logs; otherwise the value of the signal-specific
            OTEL_EXPORTER_OTLP_<SIGNAL>_PROTOCOL variable, then
            OTEL_EXPORTER_OTLP_PROTOCOL, then "grpc"
        """
        # Normalise case so "Traces" and "traces" resolve identically.
        signal = signal.lower()
        protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")
        if signal == "logs":
            # Logs always use HTTP in our setup
            return "http"
        if signal == "traces":
            return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
        if signal == "metrics":
            return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
        return protocol