New alert service

Urtzi Alfaro
2025-12-05 20:07:01 +01:00
parent 1fe3a73549
commit 667e6e0404
393 changed files with 26002 additions and 61033 deletions


@@ -1,559 +1,137 @@
# services/alert_processor/app/main.py
"""
Alert Processor Service v2.0 - central hub for processing alerts and recommendations.
Consumes events from RabbitMQ, enriches and stores them in the database, and routes
them to the notification service. Main FastAPI application with RabbitMQ consumer
lifecycle management.
"""
import asyncio
import json
import signal
import sys
from datetime import datetime
from typing import Dict, Any
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
from shared.redis_utils import initialize_redis, close_redis, get_redis_client
from aio_pika import connect_robust, IncomingMessage, ExchangeType
from app.config import AlertProcessorConfig
from shared.database.base import create_database_manager
from shared.clients.base_service_client import BaseServiceClient
from shared.config.rabbitmq_config import RABBITMQ_CONFIG
from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
# Import enrichment services
from app.services.enrichment import (
PriorityScoringService,
ContextEnrichmentService,
TimingIntelligenceService,
OrchestratorClient
)
from shared.schemas.alert_types import RawAlert
# Setup logging
import logging
# Configure Python's standard logging first (required for structlog.stdlib.LoggerFactory)
logging.basicConfig(
format="%(message)s",
stream=sys.stdout,
level=logging.INFO,
)
# Configure structlog to emit structured JSON via the standard logging backend
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)
logger = structlog.get_logger()
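# Example (illustrative): with the JSON renderer configured above, a call such as
#   logger.info("item_processed", tenant_id="tenant-1", priority_score=87)
# emits one JSON object per line, roughly:
#   {"event": "item_processed", "tenant_id": "tenant-1", "priority_score": 87,
#    "level": "info", "timestamp": "2025-12-05T19:07:01Z", ...}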
# Global consumer instance
consumer: EventConsumer | None = None
class NotificationServiceClient(BaseServiceClient):
"""Client for notification service"""
def __init__(self, config: AlertProcessorConfig):
super().__init__("notification-service", config)
self.config = config
def get_service_base_path(self) -> str:
"""Return the base path for notification service APIs"""
return "/api/v1"
async def send_notification(self, tenant_id: str, notification: Dict[str, Any], channels: list) -> Dict[str, Any]:
"""Send notification via notification service"""
try:
response = await self.post(
"notifications/send",
data={
"tenant_id": tenant_id,
"notification": notification,
"channels": channels
}
)
return response if response else {"status": "failed", "error": "No response from notification service"}
except Exception as e:
logger.error("Failed to send notification", error=str(e), tenant_id=tenant_id)
return {"status": "failed", "error": str(e)}
class AlertProcessorService:
    """
    Central service for processing and routing alerts and recommendations.
    Integrates with the notification service for multi-channel delivery.
    """
def __init__(self, config: AlertProcessorConfig):
self.config = config
self.db_manager = create_database_manager(config.DATABASE_URL, "alert-processor")
self.notification_client = NotificationServiceClient(config)
self.redis = None
self.connection = None
self.channel = None
self.running = False
# Initialize enrichment services (context_enrichment initialized after Redis connection)
self.orchestrator_client = OrchestratorClient(config.ORCHESTRATOR_SERVICE_URL)
self.context_enrichment = None # Initialized in start() after Redis connection
self.priority_scoring = PriorityScoringService(config)
self.timing_intelligence = TimingIntelligenceService(config)
# Metrics
self.items_processed = 0
self.items_stored = 0
self.notifications_sent = 0
self.errors_count = 0
self.enrichments_count = 0
async def start(self):
"""Start the alert processor service"""
try:
logger.info("Starting Alert Processor Service")
# Initialize shared Redis connection for SSE publishing
await initialize_redis(self.config.REDIS_URL, db=0, max_connections=20)
self.redis = await get_redis_client()
logger.info("Connected to Redis")
# Initialize context enrichment service now that Redis is available
self.context_enrichment = ContextEnrichmentService(self.config, self.db_manager, self.redis)
logger.info("Initialized context enrichment service")
# Connect to RabbitMQ
await self._setup_rabbitmq()
# Start consuming messages
await self._start_consuming()
self.running = True
logger.info("Alert Processor Service started successfully")
except Exception as e:
logger.error("Failed to start Alert Processor Service", error=str(e))
raise
async def _setup_rabbitmq(self):
"""Setup RabbitMQ connection and configuration"""
self.connection = await connect_robust(
self.config.RABBITMQ_URL,
heartbeat=30,
connection_attempts=5
)
self.channel = await self.connection.channel()
        await self.channel.set_qos(prefetch_count=10)  # Allow up to 10 unacknowledged messages in flight
# Setup exchange and queue based on config
exchange_config = RABBITMQ_CONFIG["exchanges"]["alerts"]
self.exchange = await self.channel.declare_exchange(
exchange_config["name"],
getattr(ExchangeType, exchange_config["type"].upper()),
durable=exchange_config["durable"]
)
queue_config = RABBITMQ_CONFIG["queues"]["alert_processing"]
self.queue = await self.channel.declare_queue(
queue_config["name"],
durable=queue_config["durable"],
arguments=queue_config["arguments"]
)
# Bind to all alert and recommendation routing keys
await self.queue.bind(self.exchange, routing_key="*.*.*")
logger.info("RabbitMQ setup completed")
async def _start_consuming(self):
"""Start consuming messages from RabbitMQ"""
await self.queue.consume(self.process_item)
logger.info("Started consuming alert messages")
async def process_item(self, message: IncomingMessage):
"""Process incoming alert or recommendation"""
async with message.process():
try:
# Parse message
item = json.loads(message.body.decode())
logger.info("Processing item",
item_type=item.get('item_type'),
alert_type=item.get('type'),
priority_level=item.get('priority_level', 'standard'),
tenant_id=item.get('tenant_id'))
# ENRICH ALERT BEFORE STORAGE
enriched_item = await self.enrich_alert(item)
self.enrichments_count += 1
# Store enriched alert in database
stored_item = await self.store_enriched_item(enriched_item)
self.items_stored += 1
# Determine delivery channels based on priority score (not severity)
channels = self.get_channels_by_priority(enriched_item['priority_score'])
# Send via notification service if channels are specified
if channels:
notification_result = await self.notification_client.send_notification(
tenant_id=enriched_item['tenant_id'],
notification={
'type': enriched_item['item_type'],
'id': enriched_item['id'],
'title': enriched_item['title'],
'message': enriched_item['message'],
'priority_score': enriched_item['priority_score'],
'priority_level': enriched_item['priority_level'],
'type_class': enriched_item['type_class'],
'metadata': enriched_item.get('metadata', {}),
'actions': enriched_item.get('smart_actions', []),
'ai_reasoning_summary': enriched_item.get('ai_reasoning_summary'),
'email': enriched_item.get('email'),
'phone': enriched_item.get('phone'),
'user_id': enriched_item.get('user_id')
},
channels=channels
)
if notification_result and notification_result.get('status') == 'success':
self.notifications_sent += 1
# Stream enriched alert to SSE for real-time dashboard (always)
await self.stream_to_sse(enriched_item['tenant_id'], stored_item)
self.items_processed += 1
logger.info("Item processed successfully",
item_id=enriched_item['id'],
priority_score=enriched_item['priority_score'],
priority_level=enriched_item['priority_level'],
channels=len(channels))
except Exception as e:
self.errors_count += 1
logger.error("Item processing failed", error=str(e))
raise
async def enrich_alert(self, item: dict) -> dict:
"""
Enrich alert with priority scoring, context, and smart actions.
All alerts MUST be enriched - no legacy support.
"""
try:
# Convert dict to RawAlert model
# Map 'type' to 'alert_type' and 'metadata' to 'alert_metadata'
raw_alert = RawAlert(
tenant_id=item['tenant_id'],
alert_type=item.get('type', item.get('alert_type', 'unknown')),
title=item['title'],
message=item['message'],
service=item['service'],
actions=item.get('actions', []),
alert_metadata=item.get('metadata', item.get('alert_metadata', {})),
item_type=item.get('item_type', 'alert')
)
# Enrich with orchestrator context (AI actions, business impact)
enriched = await self.context_enrichment.enrich_alert(raw_alert)
# Convert EnrichedAlert back to dict and merge with original item
# Use mode='json' to properly serialize datetime objects to ISO strings
enriched_dict = enriched.model_dump(mode='json') if hasattr(enriched, 'model_dump') else dict(enriched)
enriched_dict['id'] = item['id'] # Preserve original ID
enriched_dict['item_type'] = item.get('item_type', 'alert') # Preserve item_type
enriched_dict['type'] = enriched_dict.get('alert_type', item.get('type', 'unknown')) # Preserve type field
enriched_dict['timestamp'] = item.get('timestamp', datetime.utcnow().isoformat())
enriched_dict['timing_decision'] = enriched_dict.get('timing_decision', 'send_now') # Default timing decision
# Map 'actions' to 'smart_actions' for database storage
if 'actions' in enriched_dict and 'smart_actions' not in enriched_dict:
enriched_dict['smart_actions'] = enriched_dict['actions']
logger.info("Alert enriched successfully",
alert_id=enriched_dict['id'],
alert_type=enriched_dict.get('alert_type'),
priority_score=enriched_dict['priority_score'],
priority_level=enriched_dict['priority_level'],
type_class=enriched_dict['type_class'],
actions_count=len(enriched_dict.get('actions', [])),
smart_actions_count=len(enriched_dict.get('smart_actions', [])))
return enriched_dict
except Exception as e:
logger.error("Alert enrichment failed, using fallback", error=str(e), alert_id=item.get('id'))
# Fallback: basic enrichment with defaults
return self._create_fallback_enrichment(item)
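    # Illustrative mapping (hypothetical input): a raw item such as
    #   {"id": "a1", "tenant_id": "t1", "type": "low_stock", "service": "inventory",
    #    "title": "...", "message": "...", "metadata": {...}}
    # becomes a RawAlert with alert_type="low_stock" and alert_metadata={...};
    # after enrichment the dict regains 'id', 'item_type', 'type', and 'timestamp',
    # and 'actions' is mirrored into 'smart_actions' for database storage.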
def _create_fallback_enrichment(self, item: dict) -> dict:
"""
Create fallback enrichment when enrichment services fail.
Ensures all alerts have required enrichment fields.
"""
return {
**item,
'item_type': item.get('item_type', 'alert'), # Ensure item_type is preserved
'type': item.get('type', 'unknown'), # Ensure type field is preserved
'alert_type': item.get('type', item.get('alert_type', 'unknown')), # Ensure alert_type exists
'priority_score': 50,
'priority_level': 'standard',
'type_class': 'action_needed',
'orchestrator_context': None,
'business_impact': None,
'urgency_context': None,
'user_agency': None,
'trend_context': None,
'smart_actions': item.get('actions', []),
'ai_reasoning_summary': None,
'confidence_score': 0.5,
'timing_decision': 'send_now',
'scheduled_send_time': None,
'placement': ['dashboard']
}
async def store_enriched_item(self, enriched_item: dict) -> dict:
"""Store enriched alert in database with all enrichment fields"""
from app.models.events import Alert, AlertStatus
async with self.db_manager.get_session() as session:
# Create enriched alert instance
alert = Alert(
id=enriched_item['id'],
tenant_id=enriched_item['tenant_id'],
item_type=enriched_item['item_type'],
alert_type=enriched_item['type'],
            status=AlertStatus.ACTIVE,
service=enriched_item['service'],
title=enriched_item['title'],
message=enriched_item['message'],
# Enrichment fields (REQUIRED)
priority_score=enriched_item['priority_score'],
priority_level=enriched_item['priority_level'],
type_class=enriched_item['type_class'],
# Context enrichment (JSONB)
orchestrator_context=enriched_item.get('orchestrator_context'),
business_impact=enriched_item.get('business_impact'),
urgency_context=enriched_item.get('urgency_context'),
user_agency=enriched_item.get('user_agency'),
trend_context=enriched_item.get('trend_context'),
# Smart actions
smart_actions=enriched_item.get('smart_actions', []),
# AI reasoning
ai_reasoning_summary=enriched_item.get('ai_reasoning_summary'),
confidence_score=enriched_item.get('confidence_score', 0.8),
# Timing intelligence
timing_decision=enriched_item.get('timing_decision', 'send_now'),
scheduled_send_time=enriched_item.get('scheduled_send_time'),
# Placement
placement=enriched_item.get('placement', ['dashboard']),
# Metadata (legacy)
alert_metadata=enriched_item.get('metadata', {}),
# Timestamp
created_at=datetime.fromisoformat(enriched_item['timestamp']) if isinstance(enriched_item['timestamp'], str) else enriched_item['timestamp']
)
session.add(alert)
await session.commit()
await session.refresh(alert)
logger.debug("Enriched item stored in database",
item_id=enriched_item['id'],
priority_score=alert.priority_score,
type_class=alert.type_class)
# Convert to enriched dict for return
alert_dict = alert.to_dict()
# Cache active alerts in Redis for SSE initial_items
await self._cache_active_alerts(str(alert.tenant_id))
return alert_dict
async def _cache_active_alerts(self, tenant_id: str):
"""
Cache today's active alerts for a tenant in Redis for quick SSE access
Only caches alerts from today (00:00 UTC onwards) to avoid flooding
the dashboard with historical alerts on initial connection.
Analytics endpoints should query the database directly for historical data.
"""
try:
from app.models.events import Alert, AlertStatus
from sqlalchemy import select
async with self.db_manager.get_session() as session:
# Calculate start of today (UTC) to filter only today's alerts
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
# Query only today's active alerts for this tenant
# This prevents showing yesterday's alerts on dashboard initial load
query = select(Alert).where(
Alert.tenant_id == tenant_id,
Alert.status == AlertStatus.ACTIVE,
Alert.created_at >= today_start # Only today's alerts
).order_by(Alert.created_at.desc()).limit(50)
result = await session.execute(query)
alerts = result.scalars().all()
# Convert to enriched JSON-serializable format
active_items = []
for alert in alerts:
active_items.append(alert.to_dict())
# Cache in Redis with 1 hour TTL
cache_key = f"active_alerts:{tenant_id}"
await self.redis.setex(
cache_key,
3600, # 1 hour TTL
json.dumps(active_items)
)
logger.debug("Cached today's active alerts in Redis",
tenant_id=tenant_id,
count=len(active_items),
filter_date=today_start.isoformat())
except Exception as e:
logger.error("Failed to cache active alerts",
tenant_id=tenant_id,
error=str(e))
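    # Assumption: the SSE endpoint (app.api.sse) reads "active_alerts:{tenant_id}"
    # on client connect to seed the dashboard's initial_items; once the 1-hour TTL
    # expires, a new connection simply starts from an empty initial list.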
async def stream_to_sse(self, tenant_id: str, item: dict):
"""Publish enriched item to Redis for SSE streaming"""
channel = f"alerts:{tenant_id}"
# Item is already enriched dict from store_enriched_item
# Just ensure timestamp is serializable
sse_message = {
**item,
'timestamp': item['created_at'].isoformat() if hasattr(item['created_at'], 'isoformat') else item['created_at']
}
# Publish to Redis channel for SSE
await self.redis.publish(channel, json.dumps(sse_message))
logger.debug("Enriched item published to SSE",
tenant_id=tenant_id,
item_id=item['id'],
priority_score=item.get('priority_score'))
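    # Quick way to observe the live stream (illustrative):
    #   redis-cli SUBSCRIBE alerts:<tenant_id>
    # Each published message is the enriched alert serialized as a JSON object.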
def get_channels_by_priority(self, priority_score: int) -> list:
"""
Determine notification channels based on priority score and timing.
Uses multi-factor priority score (0-100) instead of legacy severity.
"""
current_hour = datetime.now().hour
channels = ['dashboard'] # Always include dashboard (SSE)
# Critical priority (90-100): All channels immediately
if priority_score >= self.config.CRITICAL_THRESHOLD:
channels.extend(['whatsapp', 'email', 'push'])
# Important priority (70-89): WhatsApp and email during extended hours
elif priority_score >= self.config.IMPORTANT_THRESHOLD:
if 6 <= current_hour <= 22:
channels.extend(['whatsapp', 'email'])
else:
channels.append('email') # Email only during night
# Standard priority (50-69): Email during business hours
elif priority_score >= self.config.STANDARD_THRESHOLD:
if 7 <= current_hour <= 20:
channels.append('email')
# Info priority (0-49): Dashboard only
return channels
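    # Example routing (thresholds are config-driven; the values below assume the
    # ranges stated in the comments above -- CRITICAL=90, IMPORTANT=70, STANDARD=50):
    #   score 95 at 14:00 -> ['dashboard', 'whatsapp', 'email', 'push']
    #   score 75 at 23:30 -> ['dashboard', 'email']
    #   score 55 at 10:00 -> ['dashboard', 'email']
    #   score 30 any time -> ['dashboard']
    # Note: current_hour is server-local time (datetime.now()), not UTC.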
async def stop(self):
"""Stop the alert processor service"""
self.running = False
logger.info("Stopping Alert Processor Service")
try:
# Close RabbitMQ connection
if self.connection and not self.connection.is_closed:
await self.connection.close()
# Close shared Redis connection
await close_redis()
logger.info("Alert Processor Service stopped")
except Exception as e:
logger.error("Error stopping service", error=str(e))
def get_metrics(self) -> Dict[str, Any]:
"""Get service metrics"""
return {
"items_processed": self.items_processed,
"items_stored": self.items_stored,
"enrichments_count": self.enrichments_count,
"notifications_sent": self.notifications_sent,
"errors_count": self.errors_count,
"running": self.running
}
async def main():
    """Standalone entry point: run the processor service without the HTTP API."""
    config = AlertProcessorConfig()
    service = AlertProcessorService(config)

    # Setup signal handlers for graceful shutdown
    async def shutdown():
        logger.info("Received shutdown signal")
        await service.stop()
        sys.exit(0)

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, lambda s, f: asyncio.create_task(shutdown()))

    try:
        # Start the service and keep running until stopped
        await service.start()
        while service.running:
            await asyncio.sleep(1)
    except KeyboardInterrupt:
        logger.info("Received keyboard interrupt")
    except Exception as e:
        logger.error("Service failed", error=str(e))
    finally:
        await service.stop()

@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifecycle manager.
    Startup: initialize Redis and the RabbitMQ event consumer.
    Shutdown: stop the consumer and close Redis connections.
    """
    global consumer
    logger.info("alert_processor_starting", version=settings.VERSION)
    # Startup: initialize Redis and start the consumer
    try:
        await initialize_redis(
            settings.REDIS_URL,
            db=settings.REDIS_DB,
            max_connections=settings.REDIS_MAX_CONNECTIONS
        )
        logger.info("redis_initialized")
        consumer = EventConsumer()
        await consumer.start()
        logger.info("alert_processor_started")
    except Exception as e:
        logger.error("alert_processor_startup_failed", error=str(e))
        raise
    yield
    # Shutdown: stop consumer and close Redis
    try:
        if consumer:
            await consumer.stop()
        await close_redis()
        logger.info("alert_processor_shutdown")
    except Exception as e:
        logger.error("alert_processor_shutdown_failed", error=str(e))
# Create FastAPI app
app = FastAPI(
title="Alert Processor Service",
description="Event processing, enrichment, and alert management system",
version=settings.VERSION,
lifespan=lifespan,
debug=settings.DEBUG
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
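# Example (illustrative): for production, replace the wildcard with explicit
# origins, e.g. allow_origins=["https://dashboard.example.com"], and narrow
# allow_methods/allow_headers to what the dashboard actually uses.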
# Include routers
app.include_router(
alerts.router,
prefix="/api/v1/tenants/{tenant_id}",
tags=["alerts"]
)
app.include_router(
sse.router,
prefix="/api/v1",
tags=["sse"]
)
@app.get("/health")
async def health_check():
"""
Health check endpoint.
Returns service status and version.
"""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.VERSION
}
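# Example (illustrative, assuming the default port from the __main__ block):
#   curl http://localhost:8000/health
#   -> {"status": "healthy", "service": "<name>", "version": "<version>"}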
@app.get("/")
async def root():
"""Root endpoint with service info"""
return {
"service": settings.SERVICE_NAME,
"version": settings.VERSION,
"description": "Event processing, enrichment, and alert management system"
}
if __name__ == "__main__":
print("STARTUP: Entering main block", file=sys.stderr, flush=True)
try:
print("STARTUP: About to run main()", file=sys.stderr, flush=True)
asyncio.run(main())
print("STARTUP: main() completed", file=sys.stderr, flush=True)
except Exception as e:
print(f"STARTUP: FATAL ERROR: {e}", file=sys.stderr, flush=True)
import traceback
traceback.print_exc(file=sys.stderr)
raise
import uvicorn
uvicorn.run(
"app.main:app",
host="0.0.0.0",
port=8000,
reload=settings.DEBUG
)