Imporve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -4,90 +4,28 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "alert-processor"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("alert-processor")
# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("alert-processor", settings.VERSION)
if result:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export enabled for alert-processor")
else:
logger = structlog.get_logger()
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger = structlog.get_logger()
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
# Initialize logger
logger = structlog.get_logger()
# Global consumer instance
consumer: EventConsumer = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
Application lifecycle manager.
class AlertProcessorService(StandardFastAPIService):
"""Alert Processor Service with standardized monitoring setup and RabbitMQ consumer"""
Startup: Initialize Redis and RabbitMQ consumer
Shutdown: Close consumer and Redis connections
"""
global consumer
async def on_startup(self, app):
"""Custom startup logic for Alert Processor"""
global consumer
logger.info("alert_processor_starting", version=settings.VERSION)
# Startup: Initialize Redis and start consumer
try:
# Initialize Redis connection
await initialize_redis(
settings.REDIS_URL,
@@ -96,69 +34,48 @@ async def lifespan(app: FastAPI):
)
logger.info("redis_initialized")
# Start RabbitMQ consumer
consumer = EventConsumer()
await consumer.start()
logger.info("alert_processor_started")
logger.info("rabbitmq_consumer_started")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("alert-processor")
logger.info("System metrics collection started")
await super().on_startup(app)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
except Exception as e:
logger.error("alert_processor_startup_failed", error=str(e))
raise
async def on_shutdown(self, app):
"""Custom shutdown logic for Alert Processor"""
global consumer
yield
await super().on_shutdown(app)
# Shutdown: Stop consumer and close Redis
try:
# Stop RabbitMQ consumer
if consumer:
await consumer.stop()
logger.info("rabbitmq_consumer_stopped")
# Close Redis
await close_redis()
logger.info("alert_processor_shutdown")
except Exception as e:
logger.error("alert_processor_shutdown_failed", error=str(e))
logger.info("redis_closed")
# Create FastAPI app
app = FastAPI(
title="Alert Processor Service",
# Create service instance
service = AlertProcessorService(
service_name="alert-processor",
app_name="Alert Processor Service",
description="Event processing, enrichment, and alert management system",
version=settings.VERSION,
lifespan=lifespan,
debug=settings.DEBUG
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=["*"], # Configure appropriately for production
api_prefix="/api/v1",
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("alert-processor")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
# Add service-specific routers
app.include_router(
alerts.router,
prefix="/api/v1/tenants/{tenant_id}",
@@ -172,34 +89,6 @@ app.include_router(
)
@app.get("/health")
async def health_check():
"""
Health check endpoint.
Returns service status and version.
"""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.VERSION
}
@app.get("/")
async def root():
"""Root endpoint with service info"""
return {
"service": settings.SERVICE_NAME,
"version": settings.VERSION,
"description": "Event processing, enrichment, and alert management system"
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn