Improve monitoring 5

Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -1,160 +1,61 @@
"""Main FastAPI application for AI Insights Service."""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.core.database import init_db, close_db
from app.api import insights
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("ai-insights")
# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Initialize logger
logger = structlog.get_logger()
# Setup OpenTelemetry logging export if enabled
logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}")
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
logger.info("Attempting to setup OpenTelemetry logging")
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION)
if result:
logger.info("OpenTelemetry logs export enabled for ai-insights")
else:
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
class AIInsightsService(StandardFastAPIService):
"""AI Insights Service with standardized monitoring setup"""
async def on_startup(self, app):
"""Custom startup logic for AI Insights"""
# Initialize database
await init_db()
logger.info("Database initialized")
await super().on_startup(app)
async def on_shutdown(self, app):
"""Custom shutdown logic for AI Insights"""
await super().on_shutdown(app)
# Close database
await close_db()
logger.info("Database connections closed")
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Lifespan event handler for startup and shutdown."""
# Startup
logger.info("Starting AI Insights Service", service=settings.SERVICE_NAME, version=settings.SERVICE_VERSION)
await init_db()
logger.info("Database initialized")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("ai-insights")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
yield
# Shutdown
logger.info("Shutting down AI Insights Service")
await close_db()
logger.info("Database connections closed")
# Create FastAPI app
app = FastAPI(
title="AI Insights Service",
# Create service instance
service = AIInsightsService(
service_name="ai-insights",
app_name="AI Insights Service",
description="Intelligent insights and recommendations for bakery operations",
version=settings.SERVICE_VERSION,
lifespan=lifespan
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=getattr(settings, 'ALLOWED_ORIGINS', ["*"]),
api_prefix=settings.API_V1_PREFIX,
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app()
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=settings.ALLOWED_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(
# Add service-specific routers
service.add_router(
insights.router,
prefix=settings.API_V1_PREFIX,
tags=["insights"]
)
@app.get("/")
async def root():
"""Root endpoint."""
return {
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION,
"status": "running"
}
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn
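
Across these files the hand-rolled OpenTelemetry, metrics, CORS and health-check wiring is replaced by a shared StandardFastAPIService base class. That class lives in shared/service_base and is not included in this commit's hunks, so the sketch below is only an inference from the call sites above: the constructor arguments, create_app(), add_router() and the on_startup/on_shutdown hooks. The tracing and metrics wiring the real class performs is omitted, and every default shown is an assumption.

# Hypothetical sketch of shared/service_base.py, inferred from usage; not the real implementation.
from contextlib import asynccontextmanager
from typing import List, Optional

from fastapi import APIRouter, FastAPI
from fastapi.middleware.cors import CORSMiddleware


class StandardFastAPIService:
    """Bundles logging, tracing, metrics, CORS and health checks behind one constructor."""

    def __init__(self, service_name: str, app_name: str, description: str = "",
                 version: str = "0.0.0", log_level: str = "INFO",
                 cors_origins: Optional[List[str]] = None, api_prefix: str = "/api/v1",
                 enable_metrics: bool = True, enable_health_checks: bool = True,
                 enable_tracing: bool = True, enable_cors: bool = True):
        self.service_name = service_name
        self.app_name = app_name
        self.description = description
        self.version = version
        self.log_level = log_level
        self.cors_origins = cors_origins or ["*"]
        self.api_prefix = api_prefix
        self.enable_metrics = enable_metrics
        self.enable_health_checks = enable_health_checks
        self.enable_tracing = enable_tracing
        self.enable_cors = enable_cors
        self.app: Optional[FastAPI] = None

    async def on_startup(self, app: FastAPI) -> None:
        """Subclass hook; the real base presumably starts metrics/tracing collectors here."""

    async def on_shutdown(self, app: FastAPI) -> None:
        """Subclass hook; the real base presumably flushes exporters here."""

    def create_app(self, debug: bool = False) -> FastAPI:
        @asynccontextmanager
        async def lifespan(app: FastAPI):
            await self.on_startup(app)
            yield
            await self.on_shutdown(app)

        app = FastAPI(title=self.app_name, description=self.description,
                      version=self.version, debug=debug, lifespan=lifespan)
        if self.enable_cors:
            app.add_middleware(CORSMiddleware, allow_origins=self.cors_origins,
                               allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
        if self.enable_health_checks:
            @app.get("/health")
            async def health():
                return {"status": "healthy", "service": self.service_name, "version": self.version}
        self.app = app
        return app

    def add_router(self, router: APIRouter, prefix: str = "", tags: Optional[List[str]] = None) -> None:
        assert self.app is not None, "create_app() must be called before add_router()"
        self.app.include_router(router, prefix=prefix, tags=tags)

Under this reading, each service subclass only overrides on_startup/on_shutdown for its own database, Redis and RabbitMQ resources and leaves the shared instrumentation to the base class.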

View File

@@ -4,90 +4,28 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "alert-processor"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("alert-processor")
# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("alert-processor", settings.VERSION)
if result:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export enabled for alert-processor")
else:
logger = structlog.get_logger()
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger = structlog.get_logger()
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
# Initialize logger
logger = structlog.get_logger()
# Global consumer instance
consumer: EventConsumer = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
Application lifecycle manager.
class AlertProcessorService(StandardFastAPIService):
"""Alert Processor Service with standardized monitoring setup and RabbitMQ consumer"""
Startup: Initialize Redis and RabbitMQ consumer
Shutdown: Close consumer and Redis connections
"""
global consumer
async def on_startup(self, app):
"""Custom startup logic for Alert Processor"""
global consumer
logger.info("alert_processor_starting", version=settings.VERSION)
# Startup: Initialize Redis and start consumer
try:
# Initialize Redis connection
await initialize_redis(
settings.REDIS_URL,
@@ -96,69 +34,48 @@ async def lifespan(app: FastAPI):
)
logger.info("redis_initialized")
# Start RabbitMQ consumer
consumer = EventConsumer()
await consumer.start()
logger.info("alert_processor_started")
logger.info("rabbitmq_consumer_started")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("alert-processor")
logger.info("System metrics collection started")
await super().on_startup(app)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
except Exception as e:
logger.error("alert_processor_startup_failed", error=str(e))
raise
async def on_shutdown(self, app):
"""Custom shutdown logic for Alert Processor"""
global consumer
yield
await super().on_shutdown(app)
# Shutdown: Stop consumer and close Redis
try:
# Stop RabbitMQ consumer
if consumer:
await consumer.stop()
logger.info("rabbitmq_consumer_stopped")
# Close Redis
await close_redis()
logger.info("alert_processor_shutdown")
except Exception as e:
logger.error("alert_processor_shutdown_failed", error=str(e))
logger.info("redis_closed")
# Create FastAPI app
app = FastAPI(
title="Alert Processor Service",
# Create service instance
service = AlertProcessorService(
service_name="alert-processor",
app_name="Alert Processor Service",
description="Event processing, enrichment, and alert management system",
version=settings.VERSION,
lifespan=lifespan,
debug=settings.DEBUG
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=["*"], # Configure appropriately for production
api_prefix="/api/v1",
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("alert-processor")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
# Add service-specific routers
app.include_router(
alerts.router,
prefix="/api/v1/tenants/{tenant_id}",
@@ -172,34 +89,6 @@ app.include_router(
)
@app.get("/health")
async def health_check():
"""
Health check endpoint.
Returns service status and version.
"""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.VERSION
}
@app.get("/")
async def root():
"""Root endpoint with service info"""
return {
"service": settings.SERVICE_NAME,
"version": settings.VERSION,
"description": "Event processing, enrichment, and alert management system"
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn
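
Each service also repeats the note that metrics are pushed to SigNoz over OTLP, so no Prometheus-style /metrics endpoint is exposed. The shared MetricsCollector is not shown in this commit; the snippet below is a generic opentelemetry-python push setup, with the endpoint, interval, meter and metric names chosen for illustration rather than taken from this repo.

# Generic OTLP push-metrics sketch; names and endpoint are illustrative assumptions.
import os

from opentelemetry import metrics
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

exporter = OTLPMetricExporter(
    endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317"),
    insecure=True,
)
# The reader pushes on a fixed interval, which is why no /metrics endpoint is needed.
reader = PeriodicExportingMetricReader(exporter, export_interval_millis=60_000)
provider = MeterProvider(
    resource=Resource.create({"service.name": "alert-processor"}),
    metric_readers=[reader],
)
metrics.set_meter_provider(provider)

meter = metrics.get_meter("alert-processor")
events_processed = meter.create_counter(
    "events_processed_total", description="Events consumed from RabbitMQ"
)
events_processed.add(1, {"queue": "alerts"})  # example measurement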

View File

@@ -3,192 +3,74 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import structlog
from contextlib import asynccontextmanager
import os
from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource
# Initialize logger
logger = structlog.get_logger()
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "demo-session"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("demo-session")
# Setup logging
setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("demo-session", settings.VERSION)
if result:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export enabled for demo-session")
else:
logger = structlog.get_logger()
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger = structlog.get_logger()
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
# Initialize database
# Initialize database manager
db_manager = DatabaseManager()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan handler"""
logger.info("Starting Demo Session Service", version=settings.VERSION)
class DemoSessionService(StandardFastAPIService):
"""Demo Session Service with standardized monitoring setup"""
# Initialize database
db_manager.initialize()
async def on_startup(self, app):
"""Custom startup logic for Demo Session"""
# Initialize database
db_manager.initialize()
logger.info("Database initialized")
# Initialize Redis using shared implementation
await initialize_redis(
redis_url=settings.REDIS_URL,
db=0,
max_connections=50
)
# Initialize Redis
await initialize_redis(
redis_url=settings.REDIS_URL,
db=0,
max_connections=50
)
logger.info("Redis initialized")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("demo-session")
logger.info("System metrics collection started")
await super().on_startup(app)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
async def on_shutdown(self, app):
"""Custom shutdown logic for Demo Session"""
await super().on_shutdown(app)
logger.info("Demo Session Service started successfully")
yield
# Cleanup on shutdown
await db_manager.close()
await close_redis()
logger.info("Demo Session Service stopped")
# Cleanup
await db_manager.close()
await close_redis()
logger.info("Database and Redis connections closed")
app = FastAPI(
title="Demo Session Service",
# Create service instance
service = DemoSessionService(
service_name="demo-session",
app_name="Demo Session Service",
description="Manages isolated demo sessions for prospect users",
version=settings.VERSION,
lifespan=lifespan
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=["*"], # Configure appropriately for production
api_prefix="/api/v1",
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("demo-session")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
"""Global exception handler"""
logger.error(
"Unhandled exception",
path=request.url.path,
method=request.method,
error=str(exc)
)
return JSONResponse(
status_code=500,
content={"detail": "Internal server error"}
)
# Include routers
# Add service-specific routers
app.include_router(demo_sessions.router)
app.include_router(demo_accounts.router)
app.include_router(demo_operations.router)
app.include_router(internal.router)
@app.get("/")
async def root():
"""Root endpoint"""
return {
"service": "demo-session",
"version": settings.VERSION,
"status": "running"
}
@app.get("/health")
async def health():
"""Health check endpoint"""
from shared.redis_utils import get_redis_manager
redis_manager = await get_redis_manager()
redis_ok = await redis_manager.health_check()
return {
"status": "healthy" if redis_ok else "degraded",
"service": "demo-session",
"version": settings.VERSION,
"redis": "connected" if redis_ok else "disconnected"
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn
uvicorn.run(