Add new alert architecture

This commit is contained in:
Urtzi Alfaro
2025-08-23 10:19:58 +02:00
parent 1a9839240e
commit 4b4268d640
45 changed files with 6518 additions and 1590 deletions

View File

@@ -0,0 +1 @@
# shared/alerts/__init__.py

View File

@@ -0,0 +1,353 @@
# shared/alerts/base_service.py
"""
Base alert service pattern for all microservices
Supports both alerts and recommendations through unified detection patterns
"""
import asyncio
import json
import uuid
from typing import List, Dict, Any, Optional
from uuid import UUID
from datetime import datetime, timedelta
import structlog
from redis.asyncio import Redis
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from shared.messaging.rabbitmq import RabbitMQClient
from shared.database.base import DatabaseManager
from shared.config.rabbitmq_config import get_routing_key
# Module-level structlog logger used for all log output in this module.
logger = structlog.get_logger()
class BaseAlertService:
    """
    Base class for service-specific alert and recommendation detection.

    Implements hybrid detection patterns: scheduled jobs (run only on the
    instance that holds the Redis leadership lock), event-driven detection and
    database notifications (both started on every instance).

    Subclasses must implement ``setup_scheduled_checks`` and may override
    ``start_event_listener`` and ``register_db_listeners``.
    """

    def __init__(self, config):
        """Create the clients used by the service; nothing is connected yet.

        Args:
            config: Settings object. This class reads ``DATABASE_URL``,
                ``RABBITMQ_URL``, ``REDIS_URL``, ``SERVICE_NAME`` and the
                optional ``INSTANCE_ID`` from it.
        """
        self.config = config
        self.db_manager = DatabaseManager(config.DATABASE_URL)
        self.rabbitmq_client = RabbitMQClient(config.RABBITMQ_URL, config.SERVICE_NAME)
        self.redis = None
        self.scheduler = AsyncIOScheduler()
        self.is_leader = False
        self.exchange = "alerts.exchange"

        # Background resources owned by this instance; released in stop().
        self._leadership_task = None
        self._db_listener_conn = None
        # Used as the lock value when the config has no INSTANCE_ID. It must be
        # unique per process: a value shared by all replicas would let a
        # replica whose lock expired mistake another replica's lock for its own.
        self._fallback_instance_id = f"{config.SERVICE_NAME}-{uuid.uuid4()}"

        # Metrics
        self._items_published = 0
        self._checks_performed = 0
        self._errors_count = 0

    async def start(self):
        """Initialize all detection mechanisms.

        Raises:
            Exception: whatever the failing step raised, after logging it.
        """
        try:
            # Connect to Redis for leader election and deduplication
            self.redis = await Redis.from_url(self.config.REDIS_URL)
            logger.info("Connected to Redis", service=self.config.SERVICE_NAME)

            # Connect to RabbitMQ
            await self.rabbitmq_client.connect()
            logger.info("Connected to RabbitMQ", service=self.config.SERVICE_NAME)

            # Start leader election for scheduled jobs. The task is stored so
            # that it stays referenced while running and can be cancelled in
            # stop().
            self._leadership_task = asyncio.create_task(self.maintain_leadership())

            # Setup scheduled checks (runs only on leader)
            self.setup_scheduled_checks()

            # Start database listener (runs on all instances)
            await self.start_database_listener()

            # Start event listener (runs on all instances)
            await self.start_event_listener()

            logger.info("Alert service started", service=self.config.SERVICE_NAME)
        except Exception as e:
            logger.error("Failed to start alert service", service=self.config.SERVICE_NAME, error=str(e))
            raise

    async def stop(self):
        """Clean shutdown; errors are logged instead of raised."""
        try:
            # Stop the election loop first so it cannot restart the scheduler.
            await self._cancel_leadership_task()

            # Stop scheduler
            if self.scheduler.running:
                self.scheduler.shutdown()
            self.is_leader = False

            # Close connections
            if self._db_listener_conn is not None:
                await self._db_listener_conn.close()
                self._db_listener_conn = None
            if self.redis:
                await self.redis.aclose()  # Use aclose() for modern Redis client
            await self.rabbitmq_client.disconnect()

            logger.info("Alert service stopped", service=self.config.SERVICE_NAME)
        except Exception as e:
            logger.error("Error stopping alert service", service=self.config.SERVICE_NAME, error=str(e))

    async def _cancel_leadership_task(self):
        """Cancel the leader-election task, if one is running, and wait for it."""
        task = self._leadership_task
        self._leadership_task = None
        if task is None or task.done():
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            # Expected result of the cancellation requested above.
            pass

    # PATTERN 1: Scheduled Background Jobs
    def setup_scheduled_checks(self):
        """Configure scheduled alert checks - Override in service"""
        raise NotImplementedError("Subclasses must implement setup_scheduled_checks")

    async def maintain_leadership(self):
        """Leader election for scheduled jobs.

        Loops until cancelled. A Redis key with a 60 second TTL acts as the
        lock; the holder refreshes it every 30 seconds. The scheduler is
        started when the lock is acquired and shut down when it is lost or
        when an iteration fails.
        """
        lock_key = f"scheduler_lock:{self.config.SERVICE_NAME}"
        lock_ttl = 60
        instance_id = getattr(self.config, 'INSTANCE_ID', None) or self._fallback_instance_id

        while True:
            try:
                was_leader = self.is_leader

                if not was_leader:
                    # Try to acquire new leadership if not currently leader
                    acquired = await self.redis.set(
                        lock_key,
                        instance_id,
                        ex=lock_ttl,
                        nx=True
                    )
                    self.is_leader = bool(acquired)
                else:
                    # Already leader - try to extend the lock
                    holder = await self.redis.get(lock_key)
                    if isinstance(holder, bytes):
                        # The value is already a str when the client was
                        # configured to decode responses.
                        holder = holder.decode()
                    if holder == instance_id:
                        # Still our lock, extend it
                        await self.redis.expire(lock_key, lock_ttl)
                        self.is_leader = True
                    else:
                        # Lock expired or taken by someone else
                        self.is_leader = False

                # Handle leadership changes
                if self.is_leader and not was_leader:
                    if not self.scheduler.running:
                        self.scheduler.start()
                    logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME)
                elif was_leader and not self.is_leader:
                    if self.scheduler.running:
                        self.scheduler.shutdown()
                    logger.info("Lost scheduler leadership", service=self.config.SERVICE_NAME)

                await asyncio.sleep(lock_ttl // 2)
            except Exception as e:
                logger.error("Leadership error", service=self.config.SERVICE_NAME, error=str(e))
                self.is_leader = False
                # Leadership can no longer be confirmed, so the jobs must stop
                # as well; otherwise they keep running without the lock and a
                # later re-acquire would start an already running scheduler.
                try:
                    if self.scheduler.running:
                        self.scheduler.shutdown()
                except Exception as shutdown_error:
                    logger.error("Error stopping scheduler after leadership error",
                                 service=self.config.SERVICE_NAME,
                                 error=str(shutdown_error))
                await asyncio.sleep(5)

    # PATTERN 2: Event-Driven Detection
    async def start_event_listener(self):
        """Listen for business events - Override in service"""
        pass

    # PATTERN 3: Database Triggers
    async def start_database_listener(self):
        """Open a dedicated asyncpg connection and register the listeners on it.

        Failures are logged and swallowed, so the service still starts when
        the listener cannot be set up.
        """
        conn = None
        try:
            import asyncpg

            # Convert SQLAlchemy URL format to plain PostgreSQL for asyncpg
            database_url = self.config.DATABASE_URL
            sqlalchemy_prefix = 'postgresql+asyncpg://'
            if database_url.startswith(sqlalchemy_prefix):
                database_url = 'postgresql://' + database_url[len(sqlalchemy_prefix):]

            conn = await asyncpg.connect(database_url)

            # Register listeners based on service
            await self.register_db_listeners(conn)

            # Keep the connection referenced for the service lifetime;
            # stop() closes it.
            self._db_listener_conn = conn
            logger.info("Database listeners registered", service=self.config.SERVICE_NAME)
        except Exception as e:
            logger.error("Failed to setup database listeners", service=self.config.SERVICE_NAME, error=str(e))
            if conn is not None:
                # Registration failed after connecting: do not leak the connection.
                try:
                    await conn.close()
                except Exception as close_error:
                    logger.debug("Error closing database listener connection",
                                 service=self.config.SERVICE_NAME,
                                 error=str(close_error))

    async def register_db_listeners(self, conn):
        """Register database listeners - Override in service"""
        pass

    # Publishing (Updated for type)
    async def publish_item(self, tenant_id: UUID, item: Dict[str, Any], item_type: str = 'alert'):
        """Publish alert or recommendation to RabbitMQ with deduplication.

        ``item`` is modified in place: ``id``, ``tenant_id``, ``service``,
        ``timestamp`` and ``item_type`` are added before publishing.

        Args:
            tenant_id: Tenant the item belongs to.
            item: Payload; must contain ``type`` and ``severity``. The
                deduplication key also uses ``metadata['id']`` when present.
            item_type: ``'alert'`` or ``'recommendation'``.

        Returns:
            The publish result on an attempt, ``False`` when the item was
            skipped as a duplicate or an error occurred.
        """
        claimed_key = None
        try:
            # Check for duplicate
            item_key = f"{tenant_id}:{item_type}:{item['type']}:{item.get('metadata', {}).get('id', '')}"
            if await self.is_duplicate_item(item_key):
                logger.debug("Duplicate item skipped",
                             service=self.config.SERVICE_NAME,
                             item_type=item_type,
                             alert_type=item['type'])
                return False
            # From here on the dedup marker is set and must be released again
            # if the item does not actually go out.
            claimed_key = item_key

            # Add metadata
            item['id'] = str(uuid.uuid4())
            item['tenant_id'] = str(tenant_id)
            item['service'] = self.config.SERVICE_NAME
            item['timestamp'] = datetime.utcnow().isoformat()
            item['item_type'] = item_type  # 'alert' or 'recommendation'

            # Determine routing key based on severity and type
            routing_key = get_routing_key(item_type, item['severity'], self.config.SERVICE_NAME)

            # Publish to RabbitMQ
            success = await self.rabbitmq_client.publish_event(
                exchange_name=self.exchange,
                routing_key=routing_key,
                event_data=item
            )

            if success:
                self._items_published += 1
                logger.info("Item published successfully",
                            service=self.config.SERVICE_NAME,
                            item_type=item_type,
                            alert_type=item['type'],
                            severity=item['severity'],
                            routing_key=routing_key)
            else:
                self._errors_count += 1
                logger.error("Failed to publish item",
                             service=self.config.SERVICE_NAME,
                             item_type=item_type,
                             alert_type=item['type'])
                await self._release_dedup_marker(claimed_key)
            return success
        except Exception as e:
            self._errors_count += 1
            logger.error("Error publishing item",
                         service=self.config.SERVICE_NAME,
                         error=str(e),
                         item_type=item_type)
            if claimed_key is not None:
                await self._release_dedup_marker(claimed_key)
            return False

    async def _release_dedup_marker(self, item_key: str) -> None:
        """Delete the marker written by ``is_duplicate_item`` so a retry is not suppressed."""
        try:
            await self.redis.delete(f"item_sent:{item_key}")
        except Exception as e:
            logger.error("Error releasing duplicate marker", error=str(e))

    async def is_duplicate_item(self, item_key: str, window_minutes: int = 15) -> bool:
        """Prevent duplicate items within time window.

        Atomically sets a marker key that expires after ``window_minutes``.

        Returns:
            ``True`` when the marker already existed, ``False`` when it was
            just created or when the check itself failed.
        """
        key = f"item_sent:{item_key}"
        try:
            result = await self.redis.set(
                key, "1",
                ex=window_minutes * 60,
                nx=True
            )
            return result is None  # None means duplicate
        except Exception as e:
            logger.error("Error checking duplicate", error=str(e))
            return False  # Allow publishing if check fails

    # Helper methods
    async def get_active_tenants(self) -> List[UUID]:
        """Get list of active tenant IDs; returns an empty list on any error."""
        try:
            from sqlalchemy import text
            query = text("SELECT DISTINCT tenant_id FROM tenants WHERE status = 'active'")
            async with self.db_manager.get_session() as session:
                result = await session.execute(query)
                return [row.tenant_id for row in result.fetchall()]
        except Exception as e:
            # If tenants table doesn't exist, skip tenant-based processing
            if "does not exist" in str(e):
                logger.debug("Tenants table not found, skipping tenant-based alert processing")
            else:
                logger.error("Error fetching active tenants", error=str(e))
            return []

    async def get_tenant_config(self, tenant_id: UUID) -> Dict[str, Any]:
        """Get tenant-specific configuration.

        Returns:
            The tenant's ``config`` value as a dict, or ``{}`` when the tenant
            has no config or the lookup fails.
        """
        try:
            from sqlalchemy import text
            query = text("SELECT config FROM tenants WHERE tenant_id = :tenant_id")
            async with self.db_manager.get_session() as session:
                result = await session.execute(query, {"tenant_id": tenant_id})
                row = result.fetchone()
                raw_config = row.config if row else None
                if not raw_config:
                    return {}
                if isinstance(raw_config, (str, bytes, bytearray)):
                    return json.loads(raw_config)
                # The driver may hand back an already decoded mapping
                # (e.g. for a JSON/JSONB column); json.loads would reject it.
                return dict(raw_config)
        except Exception as e:
            logger.error("Error fetching tenant config", tenant_id=str(tenant_id), error=str(e))
            return {}

    def _redis_available(self) -> bool:
        """Return True when a Redis client exists and does not report itself closed."""
        # Not every client class exposes ``closed``; a missing attribute is
        # treated as "not closed" instead of raising AttributeError.
        return self.redis is not None and not getattr(self.redis, "closed", False)

    # Health and metrics
    def get_metrics(self) -> Dict[str, Any]:
        """Get service metrics"""
        return {
            "items_published": self._items_published,
            "checks_performed": self._checks_performed,
            "errors_count": self._errors_count,
            "is_leader": self.is_leader,
            "scheduler_running": self.scheduler.running,
            "redis_connected": self._redis_available(),
            "rabbitmq_connected": self.rabbitmq_client.connected if self.rabbitmq_client else False
        }

    async def health_check(self) -> Dict[str, Any]:
        """Comprehensive health check.

        Returns:
            A dict with an overall ``status`` (``healthy`` / ``unhealthy``),
            per-component states and the current metrics, or a dict with
            ``status`` ``error`` when the check itself fails.
        """
        try:
            # Check Redis
            redis_healthy = False
            if self._redis_available():
                await self.redis.ping()
                redis_healthy = True

            # Check RabbitMQ
            rabbitmq_healthy = self.rabbitmq_client.connected if self.rabbitmq_client else False

            # Check database
            db_healthy = False
            try:
                from sqlalchemy import text
                async with self.db_manager.get_session() as session:
                    await session.execute(text("SELECT 1"))
                    db_healthy = True
            except Exception:
                # Any database failure simply marks the component unhealthy.
                # ``Exception`` (not a bare except) lets task cancellation
                # propagate.
                pass

            status = "healthy" if all([redis_healthy, rabbitmq_healthy, db_healthy]) else "unhealthy"

            return {
                "status": status,
                "service": self.config.SERVICE_NAME,
                "components": {
                    "redis": "healthy" if redis_healthy else "unhealthy",
                    "rabbitmq": "healthy" if rabbitmq_healthy else "unhealthy",
                    "database": "healthy" if db_healthy else "unhealthy",
                    "scheduler": "running" if self.scheduler.running else "stopped"
                },
                "metrics": self.get_metrics()
            }
        except Exception as e:
            return {
                "status": "error",
                "service": self.config.SERVICE_NAME,
                "error": str(e)
            }
class AlertServiceMixin:
    """Mixin providing common alert helper methods"""

    def format_spanish_message(self, template_key: str, **kwargs) -> Dict[str, Any]:
        """Format Spanish alert message.

        Args:
            template_key: Key of the template to render.
            **kwargs: Values for the template placeholders.

        Returns:
            The result of ``format_item_message`` for language ``'es'``.
        """
        # Imported lazily, at call time rather than at module import.
        from shared.alerts.templates import format_item_message
        return format_item_message(template_key, 'es', **kwargs)

    def get_business_hours_severity(self, base_severity: str, current_hour: Optional[int] = None) -> str:
        """Adjust severity based on business hours.

        Inside business hours (7-20 inclusive) the severity is returned
        unchanged. Outside them ``medium`` becomes ``low``, and ``high``
        becomes ``medium`` only late at night (before 6 or after 22). Every
        other severity is never changed.

        Args:
            base_severity: Severity before adjustment.
            current_hour: Hour of day (0-23) to evaluate; defaults to the
                current local hour.

        Returns:
            The adjusted severity.
        """
        if current_hour is None:
            current_hour = datetime.now().hour

        if 7 <= current_hour <= 20:
            return base_severity

        if base_severity == 'medium':
            return 'low'
        # The parentheses matter: without them the ``or`` branch applied to
        # every severity, turning e.g. 'urgent' into 'medium' after 22:00.
        if base_severity == 'high' and (current_hour < 6 or current_hour > 22):
            return 'medium'
        return base_severity

    def should_send_recommendation(self, tenant_id: UUID, rec_type: str) -> bool:
        """Check if recommendation should be sent based on tenant preferences.

        Simplified placeholder: currently always returns ``True`` and ignores
        both arguments.
        """
        # Implement tenant-specific recommendation frequency limits
        return True

218
shared/alerts/templates.py Normal file
View File

@@ -0,0 +1,218 @@
# shared/alerts/templates.py
"""
Alert and recommendation templates in Spanish for the bakery platform
"""
from typing import Dict, Any
# Message templates, keyed first by template key and then by language code.
# Each language entry holds a 'title' and a 'message' (both may contain
# {placeholder} fields) plus a list of suggested 'actions'.
# Only a few entries define an 'en' variant; the others are Spanish-only.
ITEM_TEMPLATES = {
# ALERTS - Critical Issues Requiring Immediate Action
'critical_stock_shortage': {
'es': {
'title': '🚨 Stock Crítico: {ingredient_name}',
'message': 'Solo {current_stock}kg disponibles, necesarios {required_stock}kg para producción de mañana. Acción inmediata requerida.',
'actions': ['Realizar pedido de emergencia', 'Contactar proveedor', 'Ajustar plan de producción']
},
'en': {
'title': '🚨 Critical Stock: {ingredient_name}',
'message': 'Only {current_stock}kg available, {required_stock}kg needed for tomorrow\'s production. Immediate action required.',
'actions': ['Place emergency order', 'Contact supplier', 'Adjust production plan']
}
},
'temperature_breach': {
'es': {
'title': '🌡️ ALERTA TEMPERATURA',
'message': '{location}: {temperature}°C durante {duration} minutos. Revisar productos inmediatamente para evitar deterioro.',
'actions': ['Verificar productos', 'Llamar técnico refrigeración', 'Documentar incidencia', 'Mover productos']
},
'en': {
'title': '🌡️ TEMPERATURE ALERT',
'message': '{location}: {temperature}°C for {duration} minutes. Check products immediately to prevent spoilage.',
'actions': ['Check products', 'Call refrigeration technician', 'Document incident', 'Move products']
}
},
'production_delay': {
'es': {
'title': '⏰ Retraso en Producción',
'message': 'Lote {batch_name} con {delay_minutes} minutos de retraso. Impacto en entregas del día.',
'actions': ['Acelerar producción', 'Notificar clientes', 'Reorganizar horarios', 'Buscar capacidad adicional']
}
},
'expired_products': {
'es': {
'title': '📅 Productos Caducados',
'message': '{product_count} productos han caducado hoy. Retirar inmediatamente por seguridad alimentaria.',
'actions': ['Retirar productos', 'Revisar inventario', 'Ajustar pedidos', 'Documentar pérdidas']
}
},
'equipment_failure': {
'es': {
'title': '⚙️ Fallo de Equipo',
'message': '{equipment_name} no está funcionando correctamente. Producción afectada.',
'actions': ['Parar producción', 'Llamar mantenimiento', 'Usar equipo alternativo', 'Documentar fallo']
}
},
'order_overload': {
'es': {
'title': '📋 Sobrecarga de Pedidos',
'message': 'Capacidad excedida en {percentage}%. Riesgo de no cumplir entregas.',
'actions': ['Priorizar pedidos', 'Aumentar turnos', 'Rechazar nuevos pedidos', 'Buscar ayuda externa']
}
},
'supplier_delay': {
'es': {
'title': '🚚 Retraso de Proveedor',
'message': 'Entrega de {supplier_name} retrasada {hours} horas. Impacto en producción de {products}.',
'actions': ['Contactar proveedor', 'Buscar alternativas', 'Ajustar producción', 'Usar stock reserva']
}
},
# RECOMMENDATIONS - Proactive Suggestions for Optimization
'inventory_optimization': {
'es': {
'title': '📈 Optimización de Stock: {ingredient_name}',
'message': 'Basado en tendencias de {period} días, sugerimos aumentar stock mínimo en {suggested_increase}kg para reducir costos.',
'actions': ['Revisar niveles mínimos', 'Analizar proveedores', 'Actualizar configuración', 'Programar pedido mayor']
},
'en': {
'title': '📈 Stock Optimization: {ingredient_name}',
'message': 'Based on {period} day trends, suggest increasing minimum stock by {suggested_increase}kg to reduce costs.',
'actions': ['Review minimum levels', 'Analyze suppliers', 'Update configuration', 'Schedule larger order']
}
},
'production_efficiency': {
'es': {
'title': '⚙️ Mejora de Eficiencia',
'message': 'Cambiar horarios de horneado a {suggested_time} puede reducir costos energéticos en {savings_percent}%.',
'actions': ['Revisar horarios', 'Consultar personal', 'Probar nuevo horario', 'Medir resultados']
}
},
'sales_opportunity': {
'es': {
'title': '💰 Oportunidad de Venta',
'message': '{product_name} tiene alta demanda los {days}. Incrementar producción puede aumentar ventas {increase_percent}%.',
'actions': ['Aumentar producción', 'Promocionar producto', 'Revisar precios', 'Planificar ingredientes']
}
},
'seasonal_adjustment': {
'es': {
'title': '🍂 Ajuste Estacional',
'message': 'Época de {season}: ajustar producción de {products} según patrones históricos.',
'actions': ['Revisar recetas estacionales', 'Ajustar inventario', 'Planificar promociones', 'Entrenar personal']
}
},
'cost_reduction': {
'es': {
'title': '💡 Reducción de Costos',
'message': 'Cambiar a proveedor {supplier_name} para {ingredient} puede ahorrar {savings_euros}€/mes.',
'actions': ['Evaluar calidad', 'Negociar precios', 'Probar muestras', 'Cambiar proveedor gradualmente']
}
},
'waste_reduction': {
'es': {
'title': '♻️ Reducción de Desperdicio',
'message': 'Ajustar tamaños de lote de {product} puede reducir desperdicio en {waste_reduction_percent}%.',
'actions': ['Analizar ventas', 'Ajustar recetas', 'Cambiar lotes', 'Monitorear resultados']
}
},
'quality_improvement': {
'es': {
'title': '⭐ Mejora de Calidad',
'message': 'Temperatura de horneado de {product} puede optimizarse para mejor textura y sabor.',
'actions': ['Probar temperaturas', 'Documentar cambios', 'Entrenar panaderos', 'Obtener feedback']
}
},
'customer_satisfaction': {
'es': {
'title': '😊 Satisfacción del Cliente',
'message': 'Clientes solicitan más {product} los {days}. Considerar aumentar disponibilidad.',
'actions': ['Revisar comentarios', 'Aumentar producción', 'Crear promociones', 'Mejorar exhibición']
}
},
'energy_optimization': {
'es': {
'title': '⚡ Optimización Energética',
'message': 'Consolidar horneado entre {start_time} y {end_time} puede reducir costos energéticos {savings_euros}€/día.',
'actions': ['Revisar horarios energía', 'Reorganizar producción', 'Optimizar hornos', 'Medir consumo']
}
},
'staff_optimization': {
'es': {
'title': '👥 Optimización de Personal',
'message': 'Picos de trabajo los {days} a las {hours}. Considerar ajustar turnos para mejor eficiencia.',
'actions': ['Analizar cargas trabajo', 'Reorganizar turnos', 'Entrenar polivalencia', 'Contratar temporal']
}
}
}
def format_item_message(template_key: str, language: str, **kwargs) -> Dict[str, Any]:
    """Format item message using template with validation.

    Looks up ``template_key`` in ``ITEM_TEMPLATES`` and fills the title and
    message placeholders from ``kwargs``.

    Args:
        template_key: Key of the template to render.
        language: Preferred language code. When the template has no entry for
            it, the Spanish (``'es'``) entry is used instead.
        **kwargs: Values for the template placeholders.

    Returns:
        A dict with ``title``, ``message`` and ``actions``. An unknown
        template yields a generic notice listing ``kwargs``; a missing
        placeholder value yields a message that names the missing parameter.
        ``actions`` is always a new list, so callers may modify it freely.
    """
    translations = ITEM_TEMPLATES.get(template_key, {})
    # Many templates only exist in Spanish; prefer that over the generic
    # notice when the requested language is not available.
    template = translations.get(language) or translations.get('es') or {}

    if not template:
        # Fallback for missing templates
        return {
            'title': f'Notificación: {template_key}',
            'message': f'Información: {", ".join([f"{k}: {v}" for k, v in kwargs.items()])}',
            'actions': ['Revisar', 'Documentar']
        }

    missing_param = None

    # Title and message are rendered independently so that a parameter missing
    # from one of them does not discard the other.
    raw_title = template.get('title', f'Notificación: {template_key}')
    try:
        formatted_title = raw_title.format(**kwargs)
    except KeyError as e:
        formatted_title = raw_title
        missing_param = e

    formatted_message = None
    try:
        formatted_message = template['message'].format(**kwargs)
    except KeyError as e:
        if missing_param is None:
            missing_param = e

    if missing_param is not None:
        # Handle missing format parameters
        return {
            'title': formatted_title,
            'message': f"Error en plantilla - parámetro faltante: {missing_param}. Datos: {kwargs}",
            'actions': list(template.get('actions', ['Revisar configuración']))
        }

    return {
        'title': formatted_title,
        'message': formatted_message,
        # Copy so callers cannot mutate the shared template table.
        'actions': list(template.get('actions', []))
    }
def get_severity_emoji(severity: str) -> str:
    """Return the emoji associated with a severity level.

    ``urgent``, ``high`` and ``medium`` each have their own emoji, ``low``
    maps to an empty string, and any other value yields the generic
    clipboard emoji.
    """
    by_severity = dict(urgent='🚨', high='⚠️', medium='💡', low='')
    try:
        return by_severity[severity]
    except KeyError:
        # Unknown severity level.
        return '📋'
def get_item_type_emoji(item_type: str) -> str:
    """Return the emoji for an item type (``alert`` or ``recommendation``).

    Any other value yields the generic clipboard emoji.
    """
    known_types = {'alert': '🚨', 'recommendation': '💡'}
    if item_type in known_types:
        return known_types[item_type]
    return '📋'
def format_business_time(hour: int) -> str:
    """Render an hour of the day as a 12-hour clock label.

    Hour 0 is rendered as "medianoche" and hour 12 carries a "(mediodía)"
    suffix; other hours below 12 get "AM", the rest get "PM" after
    subtracting 12.
    """
    # The two named hours are handled first; everything else is AM/PM.
    if hour == 0:
        return "medianoche"
    if hour == 12:
        return "12:00 PM (mediodía)"
    if hour < 12:
        return f"{hour}:00 AM"
    afternoon_hour = hour - 12
    return f"{afternoon_hour}:00 PM"
def get_spanish_day_name(day_number: int) -> str:
    """Return the Spanish weekday name for ``day_number`` (0 = Monday, 6 = Sunday).

    Values outside 0-6 yield "día desconocido".
    """
    if not 0 <= day_number <= 6:
        return "día desconocido"
    week = ("lunes", "martes", "miércoles", "jueves", "viernes", "sábado", "domingo")
    return week[day_number]
def format_currency(amount: float) -> str:
    """Format an amount with exactly two decimal places.

    The result uses a dot as decimal separator and carries no currency
    symbol or thousands separator.
    """
    return "{:.2f}".format(amount)
def format_percentage(value: float) -> str:
    """Format a value with one decimal place followed by a percent sign.

    The value is used as given (it is not multiplied by 100) and a dot is
    used as decimal separator.
    """
    return format(value, ".1f") + "%"