# shared/alerts/base_service.py
"""
Base alert service pattern for all microservices
Supports both alerts and recommendations through unified detection patterns
"""
import asyncio
import hashlib
import json
import random
import uuid
from typing import List, Dict, Any, Optional
from uuid import UUID
from datetime import datetime, timedelta, timezone
import structlog
from redis.asyncio import Redis
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from shared.messaging.rabbitmq import RabbitMQClient
from shared.database.base import DatabaseManager
from shared.config.rabbitmq_config import get_routing_key
logger = structlog.get_logger()
class BaseAlertService:
"""
Base class for service-specific alert and recommendation detection
Implements hybrid detection patterns: scheduled jobs, event-driven, and database triggers
"""
def __init__(self, config):
self.config = config
self.db_manager = DatabaseManager(config.DATABASE_URL)
self.rabbitmq_client = RabbitMQClient(config.RABBITMQ_URL, config.SERVICE_NAME)
self.redis = None
self.scheduler = AsyncIOScheduler()
self.is_leader = False
self.exchange = "alerts.exchange"
# Metrics
self._items_published = 0
self._checks_performed = 0
self._errors_count = 0
async def start(self):
"""Initialize all detection mechanisms"""
try:
# Connect to Redis for leader election and deduplication
# Use the shared Redis URL which includes TLS configuration
from redis.asyncio import from_url
redis_url = self.config.REDIS_URL
# Create Redis client from URL (supports TLS via rediss:// protocol)
# For self-signed certificates, disable SSL verification
redis_kwargs = {
'decode_responses': True,
'max_connections': 20
}
# If using SSL/TLS, add SSL parameters to handle self-signed certificates
if redis_url.startswith('rediss://'):
redis_kwargs.update({
'ssl_cert_reqs': None, # Disable certificate verification
'ssl_ca_certs': None, # Don't require CA certificates
'ssl_certfile': None, # Don't require client cert
'ssl_keyfile': None # Don't require client key
})
            self.redis = from_url(redis_url, **redis_kwargs)  # from_url returns a client directly; it is not awaitable
logger.info("Connected to Redis", service=self.config.SERVICE_NAME, redis_url=redis_url.split("@")[-1])
# Connect to RabbitMQ
await self.rabbitmq_client.connect()
logger.info("Connected to RabbitMQ", service=self.config.SERVICE_NAME)
# Start leader election for scheduled jobs
            self._leadership_task = asyncio.create_task(self.maintain_leadership())  # keep a reference so the task isn't GC'd
# Setup scheduled checks (runs only on leader)
self.setup_scheduled_checks()
# Start database listener (runs on all instances)
await self.start_database_listener()
# Start event listener (runs on all instances)
await self.start_event_listener()
logger.info("Alert service started", service=self.config.SERVICE_NAME)
except Exception as e:
logger.error("Failed to start alert service", service=self.config.SERVICE_NAME, error=str(e))
raise
async def stop(self):
"""Clean shutdown"""
try:
# Stop scheduler
if self.scheduler.running:
self.scheduler.shutdown()
# Close connections
if self.redis:
await self.redis.aclose() # Use aclose() for modern Redis client
await self.rabbitmq_client.disconnect()
logger.info("Alert service stopped", service=self.config.SERVICE_NAME)
except Exception as e:
logger.error("Error stopping alert service", service=self.config.SERVICE_NAME, error=str(e))
# PATTERN 1: Scheduled Background Jobs
def setup_scheduled_checks(self):
"""Configure scheduled alert checks - Override in service"""
raise NotImplementedError("Subclasses must implement setup_scheduled_checks")
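    # A minimal sketch of a concrete override, assuming a hypothetical
    # `check_low_stock` coroutine defined on the subclass:
    #
    #     def setup_scheduled_checks(self):
    #         self.scheduler.add_job(
    #             self.check_low_stock,
    #             CronTrigger(minute="*/15"),
    #             id="low_stock_check",
    #             replace_existing=True,
    #         )
    #
    # Jobs registered here run only on the instance that currently holds
    # scheduler leadership (see maintain_leadership below).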
async def maintain_leadership(self):
"""Leader election for scheduled jobs"""
lock_key = f"scheduler_lock:{self.config.SERVICE_NAME}"
lock_ttl = 60
# Generate instance_id once for the lifetime of this leadership loop
# IMPORTANT: Don't regenerate on each iteration or lock extension will always fail!
instance_id = getattr(self.config, 'INSTANCE_ID', str(uuid.uuid4()))
logger.info("DEBUG: maintain_leadership starting",
service=self.config.SERVICE_NAME,
instance_id=instance_id,
redis_client_type=str(type(self.redis)))
while True:
try:
was_leader = self.is_leader
# Add jitter to avoid thundering herd when multiple instances start
if not was_leader:
await asyncio.sleep(random.uniform(0.1, 0.5)) # Small random delay before attempting to acquire
# Try to acquire new leadership if not currently leader
if not self.is_leader:
# Use atomic Redis operation to acquire lock
result = await self.redis.set(
lock_key,
instance_id,
ex=lock_ttl,
nx=True # Only set if key doesn't exist
)
acquired = result is not None
self.is_leader = acquired
                else:
                    # Already leader - extend the lock only if we still own it.
                    # A plain SET ... GET would overwrite another instance's lock
                    # before we noticed we had lost ownership, so a small Lua
                    # script performs an atomic compare-and-extend instead.
                    extend_script = (
                        "if redis.call('get', KEYS[1]) == ARGV[1] then "
                        "return redis.call('expire', KEYS[1], ARGV[2]) "
                        "else return 0 end"
                    )
                    try:
                        extended = await self.redis.eval(
                            extend_script, 1, lock_key, instance_id, lock_ttl
                        )
                        self.is_leader = bool(extended)
                        if self.is_leader:
                            logger.debug("Lock extended successfully",
                                         service=self.config.SERVICE_NAME,
                                         instance_id=instance_id,
                                         ttl=lock_ttl)
                        else:
                            # Lock was taken by another instance or expired
                            logger.info("Lost lock ownership during extension",
                                        service=self.config.SERVICE_NAME,
                                        instance_id=instance_id)
                    except Exception as e:
                        # If the extension fails, verify whether we still hold the lock
                        logger.warning("Failed to extend lock, verifying ownership",
                                       service=self.config.SERVICE_NAME,
                                       error=str(e))
                        current_check = await self.redis.get(lock_key)
                        self.is_leader = current_check == instance_id
# Handle leadership changes
if self.is_leader and not was_leader:
# Add a small delay to allow other instances to detect leadership change
await asyncio.sleep(0.1)
if self.is_leader: # Double-check we're still the leader
self.scheduler.start()
logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME)
elif not self.is_leader and was_leader:
if self.scheduler.running:
self.scheduler.shutdown()
logger.info("Lost scheduler leadership", service=self.config.SERVICE_NAME)
# Add jitter to reduce contention between instances
await asyncio.sleep(lock_ttl // 2 + random.uniform(0, 2))
except Exception as e:
import traceback
logger.error("Leadership error",
service=self.config.SERVICE_NAME,
error=str(e),
error_type=type(e).__name__,
traceback=traceback.format_exc())
self.is_leader = False
await asyncio.sleep(5)
# PATTERN 2: Event-Driven Detection
async def start_event_listener(self):
"""Listen for business events - Override in service"""
pass
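    # A minimal sketch of an event-driven override. The consume API below is
    # an assumption for illustration; the real interface lives in
    # shared.messaging.rabbitmq:
    #
    #     async def start_event_listener(self):
    #         async def on_order_created(event: dict):
    #             tenant_id = UUID(event["tenant_id"])
    #             await self.check_stock_impact(tenant_id, event)  # hypothetical check
    #
    #         await self.rabbitmq_client.subscribe("orders.created", on_order_created)  # hypothetical API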
# PATTERN 3: Database Triggers
async def start_database_listener(self):
"""Listen for database notifications with connection management"""
try:
import asyncpg
# Convert SQLAlchemy URL format to plain PostgreSQL for asyncpg
database_url = self.config.DATABASE_URL
if database_url.startswith('postgresql+asyncpg://'):
database_url = database_url.replace('postgresql+asyncpg://', 'postgresql://')
# Add connection timeout and retry logic
max_retries = 3
retry_count = 0
conn = None
while retry_count < max_retries and not conn:
try:
conn = await asyncio.wait_for(
asyncpg.connect(database_url),
timeout=10.0
)
break
                except Exception as e:  # asyncio.TimeoutError is already an Exception subclass
retry_count += 1
if retry_count < max_retries:
logger.warning(f"DB listener connection attempt {retry_count} failed, retrying...",
service=self.config.SERVICE_NAME, error=str(e))
await asyncio.sleep(2)
else:
raise
if conn:
# Register listeners based on service
await self.register_db_listeners(conn)
logger.info("Database listeners registered", service=self.config.SERVICE_NAME)
# Keep connection alive with periodic ping
                self._db_listener_task = asyncio.create_task(self._maintain_db_connection(conn))  # keep a reference so the task isn't GC'd
except Exception as e:
logger.error("Failed to setup database listeners", service=self.config.SERVICE_NAME, error=str(e))
async def _maintain_db_connection(self, conn):
"""Maintain database connection for listeners"""
try:
while not conn.is_closed():
# Use a timeout to avoid hanging indefinitely
try:
await asyncio.wait_for(
conn.fetchval("SELECT 1"),
timeout=5.0
)
await asyncio.sleep(30) # Check every 30 seconds
except asyncio.TimeoutError:
logger.warning("DB ping timed out, connection may be dead", service=self.config.SERVICE_NAME)
break
except Exception as e:
logger.error("DB listener connection lost", service=self.config.SERVICE_NAME, error=str(e))
break
except Exception as e:
logger.error("Error maintaining DB connection", service=self.config.SERVICE_NAME, error=str(e))
async def register_db_listeners(self, conn):
"""Register database listeners - Override in service"""
pass
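    # A minimal sketch of an override using asyncpg's LISTEN/NOTIFY support
    # (coroutine callbacks require asyncpg >= 0.24). The channel name and
    # payload shape are assumptions; NOTIFY payloads arrive as strings, so
    # JSON is decoded manually:
    #
    #     async def register_db_listeners(self, conn):
    #         async def on_stock_change(connection, pid, channel, payload):
    #             data = json.loads(payload)
    #             await self.publish_item(
    #                 UUID(data["tenant_id"]),
    #                 data["alert"],
    #                 item_type="alert",
    #             )
    #
    #         await conn.add_listener("stock_changes", on_stock_change)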
# Publishing (Updated for type)
async def publish_item(self, tenant_id: UUID, item: Dict[str, Any], item_type: str = 'alert'):
"""Publish alert or recommendation to RabbitMQ with deduplication and validation"""
try:
# Validate alert structure before publishing
from shared.schemas.alert_types import RawAlert
try:
raw_alert = RawAlert(
tenant_id=str(tenant_id),
alert_type=item.get('type'),
title=item.get('title'),
message=item.get('message'),
service=self.config.SERVICE_NAME,
actions=item.get('actions', []),
alert_metadata=item.get('metadata', {}),
item_type=item_type
)
# Validation passed, continue with validated data
logger.debug("Alert schema validation passed",
service=self.config.SERVICE_NAME,
alert_type=item.get('type'))
except Exception as validation_error:
logger.error("Alert schema validation failed",
service=self.config.SERVICE_NAME,
alert_type=item.get('type'),
error=str(validation_error))
self._errors_count += 1
return False
# Generate proper deduplication key based on alert type and specific identifiers
unique_id = self._generate_unique_identifier(item)
item_key = f"{tenant_id}:{item_type}:{item['type']}:{unique_id}"
if await self.is_duplicate_item(item_key):
logger.debug("Duplicate item skipped",
service=self.config.SERVICE_NAME,
item_type=item_type,
alert_type=item['type'],
dedup_key=item_key)
return False
# Add metadata
item['id'] = str(uuid.uuid4())
item['tenant_id'] = str(tenant_id)
item['service'] = self.config.SERVICE_NAME
            item['timestamp'] = datetime.now(timezone.utc).isoformat()  # utcnow() is deprecated in Python 3.12+
item['item_type'] = item_type # 'alert' or 'recommendation'
# Determine routing key based on severity and type
routing_key = get_routing_key(item_type, item['severity'], self.config.SERVICE_NAME)
# Publish to RabbitMQ with timeout to prevent blocking
try:
success = await asyncio.wait_for(
self.rabbitmq_client.publish_event(
exchange_name=self.exchange,
routing_key=routing_key,
event_data=item
),
timeout=10.0 # 10 second timeout
)
except asyncio.TimeoutError:
logger.error("RabbitMQ publish timed out",
service=self.config.SERVICE_NAME,
item_type=item_type,
alert_type=item['type'])
return False
if success:
self._items_published += 1
logger.info("Item published successfully",
service=self.config.SERVICE_NAME,
item_type=item_type,
alert_type=item['type'],
severity=item['severity'],
routing_key=routing_key)
else:
self._errors_count += 1
logger.error("Failed to publish item",
service=self.config.SERVICE_NAME,
item_type=item_type,
alert_type=item['type'])
return success
except Exception as e:
self._errors_count += 1
logger.error("Error publishing item",
service=self.config.SERVICE_NAME,
error=str(e),
item_type=item_type)
return False
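    # Usage sketch: a detection check builds an item dict carrying the keys the
    # validation above expects ('type', 'title', 'message', 'severity', plus
    # optional 'actions' and 'metadata') and hands it to publish_item. Values
    # here are illustrative:
    #
    #     await self.publish_item(
    #         tenant_id,
    #         {
    #             "type": "low_stock_warning",
    #             "title": "Low stock: flour",
    #             "message": "Only 2 kg of flour remaining",
    #             "severity": "high",
    #             "actions": ["create_purchase_order"],
    #             "metadata": {"ingredient_id": "ing-123"},
    #         },
    #         item_type="alert",
    #     )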
def _generate_unique_identifier(self, item: Dict[str, Any]) -> str:
"""Generate unique identifier for deduplication based on alert type and content"""
alert_type = item.get('type', '')
metadata = item.get('metadata', {})
# Generate unique identifier based on alert type
# Inventory alerts
        if alert_type in ('overstock_warning', 'critical_stock_shortage', 'low_stock_warning'):
            return metadata.get('ingredient_id', '')
elif alert_type == 'expired_products':
# For expired products alerts, create hash of all expired item IDs
            expired_items = metadata.get('expired_items', [])
            if expired_items:
                # Use a distinct loop variable to avoid shadowing the `item` parameter
                expired_ids = sorted(str(expired.get('id', '')) for expired in expired_items)
                return hashlib.md5(':'.join(expired_ids).encode()).hexdigest()[:16]
            return ''
elif alert_type == 'urgent_expiry':
return f"{metadata.get('ingredient_id', '')}:{metadata.get('stock_id', '')}"
elif alert_type == 'temperature_breach':
return f"{metadata.get('sensor_id', '')}:{metadata.get('location', '')}"
elif alert_type == 'stock_depleted_by_order':
return f"{metadata.get('order_id', '')}:{metadata.get('ingredient_id', '')}"
elif alert_type == 'expired_batches_auto_processed':
# Use processing date and total batches as identifier
processing_date = metadata.get('processing_date', '')[:10] # Date only
total_batches = metadata.get('total_batches_processed', 0)
return f"{processing_date}:{total_batches}"
elif alert_type == 'inventory_optimization':
return f"opt:{metadata.get('ingredient_id', '')}:{metadata.get('recommendation_type', '')}"
elif alert_type == 'waste_reduction':
return f"waste:{metadata.get('ingredient_id', '')}"
# Procurement alerts
elif alert_type == 'procurement_pos_pending_approval':
# Use hash of PO IDs for grouped alerts
            pos = metadata.get('pos', [])
            if pos:
                po_ids = sorted(str(po.get('po_id', '')) for po in pos)
                return hashlib.md5(':'.join(po_ids).encode()).hexdigest()[:16]
            return ''
elif alert_type == 'procurement_approval_reminder':
return metadata.get('po_id', '')
elif alert_type == 'procurement_critical_po':
return metadata.get('po_id', '')
elif alert_type == 'procurement_po_approved':
return metadata.get('po_id', '')
elif alert_type == 'procurement_auto_approval_summary':
# Daily summary - use date as identifier
summary_date = metadata.get('summary_date', '')[:10] # Date only
return f"summary:{summary_date}"
# Production alerts
elif alert_type in ['severe_capacity_overload', 'capacity_overload', 'near_capacity']:
return f"capacity:{metadata.get('planned_date', '')}"
elif alert_type == 'production_delay':
return metadata.get('batch_id', '')
elif alert_type == 'quality_control_failure':
return metadata.get('quality_check_id', '')
elif alert_type in ['equipment_failure', 'maintenance_required', 'low_equipment_efficiency']:
return metadata.get('equipment_id', '')
elif alert_type == 'production_ingredient_shortage':
return metadata.get('ingredient_id', '')
# Forecasting alerts
elif alert_type in ['demand_surge_weekend', 'holiday_preparation', 'demand_spike_detected', 'unexpected_demand_spike']:
return f"{alert_type}:{metadata.get('product_name', '')}:{metadata.get('forecast_date', '')}"
elif alert_type == 'weather_impact_alert':
return f"weather:{metadata.get('forecast_date', '')}"
elif alert_type == 'severe_weather_impact':
return f"severe_weather:{metadata.get('weather_type', '')}:{metadata.get('duration_hours', '')}"
else:
# Fallback to generic metadata.id or empty string
return metadata.get('id', '')
async def is_duplicate_item(self, item_key: str, window_minutes: int = 15) -> bool:
"""Prevent duplicate items within time window"""
key = f"item_sent:{item_key}"
try:
result = await self.redis.set(
key, "1",
ex=window_minutes * 60,
nx=True
)
return result is None # None means duplicate
except Exception as e:
logger.error("Error checking duplicate", error=str(e))
return False # Allow publishing if check fails
# Helper methods
async def get_active_tenants(self) -> List[UUID]:
"""Get list of active tenant IDs"""
try:
from sqlalchemy import text
query = text("SELECT DISTINCT tenant_id FROM tenants WHERE status = 'active'")
async with self.db_manager.get_session() as session:
result = await session.execute(query)
return [row.tenant_id for row in result.fetchall()]
except Exception as e:
# If tenants table doesn't exist, skip tenant-based processing
if "does not exist" in str(e):
logger.debug("Tenants table not found, skipping tenant-based alert processing")
return []
else:
logger.error("Error fetching active tenants", error=str(e))
return []
async def get_tenant_config(self, tenant_id: UUID) -> Dict[str, Any]:
"""Get tenant-specific configuration"""
try:
from sqlalchemy import text
query = text("SELECT config FROM tenants WHERE tenant_id = :tenant_id")
async with self.db_manager.get_session() as session:
result = await session.execute(query, {"tenant_id": tenant_id})
row = result.fetchone()
return json.loads(row.config) if row and row.config else {}
except Exception as e:
logger.error("Error fetching tenant config", tenant_id=str(tenant_id), error=str(e))
return {}
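    # Scheduled checks typically fan out over the helpers above, e.g. a
    # hypothetical job on a subclass:
    #
    #     async def check_low_stock(self):
    #         self._checks_performed += 1
    #         for tenant_id in await self.get_active_tenants():
    #             config = await self.get_tenant_config(tenant_id)
    #             ...  # query stock levels, then await self.publish_item(...)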
# Health and metrics
def get_metrics(self) -> Dict[str, Any]:
"""Get service metrics"""
return {
"items_published": self._items_published,
"checks_performed": self._checks_performed,
"errors_count": self._errors_count,
"is_leader": self.is_leader,
"scheduler_running": self.scheduler.running,
"redis_connected": self.redis and not self.redis.closed,
"rabbitmq_connected": self.rabbitmq_client.connected if self.rabbitmq_client else False
}
async def health_check(self) -> Dict[str, Any]:
"""Comprehensive health check"""
try:
# Check Redis
redis_healthy = False
            if self.redis:
await self.redis.ping()
redis_healthy = True
# Check RabbitMQ
rabbitmq_healthy = self.rabbitmq_client.connected if self.rabbitmq_client else False
# Check database
db_healthy = False
try:
from sqlalchemy import text
async with self.db_manager.get_session() as session:
await session.execute(text("SELECT 1"))
db_healthy = True
            except Exception:  # bare except would also swallow CancelledError
                pass
status = "healthy" if all([redis_healthy, rabbitmq_healthy, db_healthy]) else "unhealthy"
return {
"status": status,
"service": self.config.SERVICE_NAME,
"components": {
"redis": "healthy" if redis_healthy else "unhealthy",
"rabbitmq": "healthy" if rabbitmq_healthy else "unhealthy",
"database": "healthy" if db_healthy else "unhealthy",
"scheduler": "running" if self.scheduler.running else "stopped"
},
"metrics": self.get_metrics()
}
except Exception as e:
return {
"status": "error",
"service": self.config.SERVICE_NAME,
"error": str(e)
}
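    # The payload above is shaped to back an HTTP health endpoint. A sketch,
    # assuming the service exposes a FastAPI app (not part of this module):
    #
    #     @app.get("/health")
    #     async def health():
    #         return await alert_service.health_check()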
class AlertServiceMixin:
"""Mixin providing common alert helper methods"""
def get_business_hours_severity(self, base_severity: str) -> str:
"""Adjust severity based on business hours"""
current_hour = datetime.now().hour
# Reduce non-critical severity outside business hours (7-20)
if not (7 <= current_hour <= 20):
if base_severity == 'medium':
return 'low'
            elif base_severity == 'high' and (current_hour < 6 or current_hour > 22):
                return 'medium'
return base_severity
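    # Usage sketch: adjust severity just before publishing. For example, a
    # 'medium' alert raised at 23:00 is downgraded to 'low':
    #
    #     item["severity"] = self.get_business_hours_severity(item["severity"])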
def should_send_recommendation(self, tenant_id: UUID, rec_type: str) -> bool:
"""Check if recommendation should be sent based on tenant preferences"""
        # Placeholder: enforce tenant-specific recommendation frequency limits here.
        # For now, always allow sending.
        return True
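# A minimal end-to-end sketch tying the pieces together. InventoryAlertService
# and its check are hypothetical; the base class supplies leader election, the
# scheduler, deduplication, and publishing:
#
#     class InventoryAlertService(BaseAlertService, AlertServiceMixin):
#         def setup_scheduled_checks(self):
#             self.scheduler.add_job(self.check_low_stock, CronTrigger(hour="*"))
#
#         async def check_low_stock(self):
#             self._checks_performed += 1
#             for tenant_id in await self.get_active_tenants():
#                 ...  # detect conditions, then await self.publish_item(tenant_id, item)
#
#     async def main(config):
#         service = InventoryAlertService(config)
#         await service.start()
#         try:
#             await asyncio.Event().wait()  # run until cancelled
#         finally:
#             await service.stop()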