# Source: bakery-ia/services/production/app/services/production_scheduler.py
# Snapshot: 2025-12-13 23:57:54 +01:00 (610 lines, 23 KiB, Python)
"""
Production Scheduler Service
Background task that periodically checks for production alert conditions
and triggers appropriate alerts.
"""
import asyncio
import hashlib
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from uuid import UUID

import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from app.repositories.equipment_repository import EquipmentRepository
from app.repositories.production_batch_repository import ProductionBatchRepository
from app.services.production_alert_service import ProductionAlertService
# Module-level structured logger shared by all scheduler methods.
logger = structlog.get_logger()
class ProductionScheduler:
    """Background service that periodically evaluates production alert
    conditions and emits alerts through the alert service.

    Three conditions are checked on each run:
      * in-progress batches running behind schedule,
      * equipment with overdue maintenance,
      * batches that should have started but have not.

    The periodic job runs on APScheduler. When several service instances
    share one database, a PostgreSQL session-level advisory lock is used as
    leader election so only one instance performs the checks per interval.
    A small in-memory cache suppresses duplicate alerts within a TTL window.
    """

    def __init__(self, alert_service: ProductionAlertService, database_manager: Any):
        """
        Args:
            alert_service: Service used to emit production alerts.
            database_manager: Object exposing ``get_session()`` as an async
                context manager yielding a database session. Typed ``Any``
                because the concrete manager class lives outside this module.
        """
        self.alert_service = alert_service
        self.database_manager = database_manager
        self.scheduler = AsyncIOScheduler()
        self.check_interval = 300  # seconds between runs (5 minutes)
        self.job_id = 'production_scheduler'
        # Cache of alert keys already emitted, to avoid duplicates
        # (cleared wholesale once the TTL elapses).
        self._emitted_alerts: set = set()
        self._alert_cache_ttl = 3600  # seconds (1 hour)
        self._last_cache_clear = datetime.utcnow()

    async def start(self):
        """Start the periodic alert-check job on APScheduler (idempotent)."""
        if self.scheduler.running:
            logger.warning("Production scheduler is already running")
            return
        trigger = IntervalTrigger(seconds=self.check_interval)
        self.scheduler.add_job(
            self._run_scheduler_task,
            trigger=trigger,
            id=self.job_id,
            name="Production Alert Checks",
            max_instances=1  # Prevent overlapping executions within this process
        )
        self.scheduler.start()
        logger.info("Production scheduler started", interval_seconds=self.check_interval)

    async def stop(self):
        """Stop the scheduler, waiting for any in-flight job to finish."""
        if self.scheduler.running:
            self.scheduler.shutdown(wait=True)
            logger.info("Production scheduler stopped")
        else:
            logger.info("Production scheduler already stopped")

    @staticmethod
    def _advisory_lock_id(lock_name: str) -> int:
        """Derive a stable 31-bit advisory-lock id from a lock name.

        Uses hashlib rather than the builtin ``hash()`` because Python string
        hashing is randomized per process (PYTHONHASHSEED), which would give
        every instance a different lock id and defeat leader election.
        """
        digest = hashlib.sha256(lock_name.encode("utf-8")).digest()
        return int.from_bytes(digest[:4], "big") % (2 ** 31)

    async def _run_scheduler_task(self):
        """Run the scheduled production alert checks with leader election.

        ``pg_try_advisory_lock`` never raises on contention; it returns
        ``false`` when another session already holds the lock, so the result
        must be inspected explicitly. Advisory locks are session-scoped, so
        acquisition, the checks, and the unlock all happen on the SAME
        database session.
        """
        lock_name = (
            f"production_scheduler:"
            f"{getattr(self.database_manager, 'database_url', 'default')}"
        )
        lock_id = self._advisory_lock_id(lock_name)
        try:
            async with self.database_manager.get_session() as session:
                result = await session.execute(
                    text("SELECT pg_try_advisory_lock(:lock_id)"),
                    {"lock_id": lock_id},
                )
                if not result.scalar():
                    # Another instance is the leader for this interval.
                    logger.debug(
                        "Skipping production scheduler execution (not leader)",
                        lock_name=lock_name,
                    )
                    return
                try:
                    start_time = datetime.now()
                    logger.info("Running scheduled production alert checks (as leader)")
                    alerts_generated = await self.check_all_conditions()
                    duration = (datetime.now() - start_time).total_seconds()
                    logger.info(
                        "Completed scheduled production alert checks",
                        alerts_generated=alerts_generated,
                        duration_seconds=round(duration, 2),
                    )
                finally:
                    # Release on the acquiring session; if the session died,
                    # PostgreSQL releases the lock automatically.
                    try:
                        await session.execute(
                            text("SELECT pg_advisory_unlock(:lock_id)"),
                            {"lock_id": lock_id},
                        )
                    except Exception as unlock_error:
                        logger.warning(
                            "Error releasing leader lock (may have been automatically released)",
                            error=str(unlock_error),
                        )
        except Exception as e:
            logger.error(
                "Error in production scheduler task",
                error=str(e),
                exc_info=True,
            )

    def _maybe_clear_alert_cache(self) -> None:
        """Clear the duplicate-alert cache once its TTL has elapsed."""
        elapsed = (datetime.utcnow() - self._last_cache_clear).total_seconds()
        if elapsed > self._alert_cache_ttl:
            self._emitted_alerts.clear()
            self._last_cache_clear = datetime.utcnow()
            logger.info("Cleared alert cache due to TTL expiration")

    def _already_emitted(self, alert_key: str) -> bool:
        """Return True when an alert with this key was emitted within the TTL."""
        return alert_key in self._emitted_alerts

    def _remember_alert(self, alert_key: str) -> None:
        """Record an alert key so later runs skip it until the cache clears."""
        self._emitted_alerts.add(alert_key)

    async def check_all_conditions(self) -> int:
        """
        Check all production alert conditions (all tenants) and trigger alerts.

        Returns:
            int: Total number of alerts generated
        """
        return await self._run_checks(tenant_id=None)

    async def check_all_conditions_for_tenant(self, tenant_id: UUID) -> int:
        """
        Check all production alert conditions for a specific tenant and trigger alerts.

        Args:
            tenant_id: Tenant ID to check conditions for

        Returns:
            int: Total number of alerts generated
        """
        return await self._run_checks(tenant_id=tenant_id)

    async def _run_checks(self, tenant_id: Optional[UUID]) -> int:
        """Run all three alert checks, optionally scoped to one tenant.

        Args:
            tenant_id: Restrict checks to this tenant when given; otherwise
                check every tenant.

        Returns:
            int: Total number of alerts generated (0 on error).
        """
        if not self.database_manager:
            logger.warning("Database manager not available for production checks")
            return 0
        total_alerts = 0
        try:
            async with self.database_manager.get_session() as session:
                batch_repo = ProductionBatchRepository(session)
                equipment_repo = EquipmentRepository(session)
                delay_alerts = await self._check_production_delays(batch_repo, tenant_id)
                total_alerts += delay_alerts
                maintenance_alerts = await self._check_equipment_maintenance(equipment_repo, tenant_id)
                total_alerts += maintenance_alerts
                start_delay_alerts = await self._check_batch_start_delays(batch_repo, tenant_id)
                total_alerts += start_delay_alerts
                summary = dict(
                    total_alerts=total_alerts,
                    production_delays=delay_alerts,
                    equipment_maintenance=maintenance_alerts,
                    batch_start_delays=start_delay_alerts,
                )
                if tenant_id is not None:
                    logger.info(
                        "Tenant-specific production alert checks completed",
                        tenant_id=str(tenant_id),
                        **summary,
                    )
                else:
                    logger.info("Production alert checks completed", **summary)
        except Exception as e:
            if tenant_id is not None:
                logger.error(
                    "Error during tenant-specific production alert checks",
                    tenant_id=str(tenant_id),
                    error=str(e),
                    exc_info=True,
                )
            else:
                logger.error(
                    "Error during production alert checks",
                    error=str(e),
                    exc_info=True,
                )
        return total_alerts

    async def _check_production_delays(
        self,
        batch_repo: ProductionBatchRepository,
        tenant_id: Optional[UUID] = None,
    ) -> int:
        """
        Check for production delays and trigger alerts.

        Args:
            batch_repo: Production batch repository
            tenant_id: Optional tenant scope; all tenants when None.

        Returns:
            int: Number of delay alerts generated
        """
        try:
            if tenant_id is not None:
                delayed_batches = await batch_repo.get_production_delays(tenant_id)
                logger.info(
                    "Found delayed production batches for tenant",
                    count=len(delayed_batches),
                    tenant_id=str(tenant_id),
                )
            else:
                delayed_batches = await batch_repo.get_production_delays()
                logger.info("Found delayed production batches", count=len(delayed_batches))
            self._maybe_clear_alert_cache()
            alerts_generated = 0
            for batch in delayed_batches:
                try:
                    batch_id = UUID(str(batch["id"]))
                    alert_key = f"delay:{batch_id}"
                    if self._already_emitted(alert_key):
                        logger.debug("Skipping duplicate delay alert", batch_id=str(batch_id))
                        continue
                    # In tenant-scoped mode the caller's tenant_id is
                    # authoritative; otherwise take it from the row.
                    owner = tenant_id if tenant_id is not None else UUID(str(batch["tenant_id"]))
                    # delay_minutes may be missing or NULL in the row.
                    delay_minutes = int(batch["delay_minutes"]) if batch.get("delay_minutes") else 0
                    affected_orders = int(batch.get("affected_orders", 0))
                    await self.alert_service.emit_production_delay(
                        tenant_id=owner,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        delay_minutes=delay_minutes,
                        affected_orders=affected_orders,
                    )
                    self._remember_alert(alert_key)
                    alerts_generated += 1
                except Exception as e:
                    fields = {"batch_id": batch.get("id", "unknown"), "error": str(e)}
                    if tenant_id is not None:
                        fields["tenant_id"] = str(tenant_id)
                    logger.error("Error emitting production delay alert", **fields)
                    continue
            return alerts_generated
        except Exception as e:
            if tenant_id is not None:
                logger.error(
                    "Error checking production delays for tenant",
                    tenant_id=str(tenant_id),
                    error=str(e),
                )
            else:
                logger.error("Error checking production delays", error=str(e))
            return 0

    async def _check_equipment_maintenance(
        self,
        equipment_repo: EquipmentRepository,
        tenant_id: Optional[UUID] = None,
    ) -> int:
        """
        Check for equipment needing maintenance and trigger alerts.

        Args:
            equipment_repo: Equipment repository
            tenant_id: Optional tenant scope; all tenants when None.

        Returns:
            int: Number of maintenance alerts generated
        """
        try:
            if tenant_id is not None:
                needing_maintenance = await equipment_repo.get_equipment_needing_maintenance(tenant_id)
                logger.info(
                    "Found equipment needing maintenance for tenant",
                    count=len(needing_maintenance),
                    tenant_id=str(tenant_id),
                )
            else:
                needing_maintenance = await equipment_repo.get_equipment_needing_maintenance()
                logger.info(
                    "Found equipment needing maintenance",
                    count=len(needing_maintenance),
                )
            self._maybe_clear_alert_cache()
            alerts_generated = 0
            for equipment in needing_maintenance:
                try:
                    equipment_id = UUID(str(equipment["id"]))
                    # Dedup maintenance alerts like the other checks, so the
                    # same overdue equipment does not alert every interval.
                    alert_key = f"maintenance:{equipment_id}"
                    if self._already_emitted(alert_key):
                        logger.debug(
                            "Skipping duplicate maintenance alert",
                            equipment_id=str(equipment_id),
                        )
                        continue
                    owner = tenant_id if tenant_id is not None else UUID(str(equipment["tenant_id"]))
                    days_overdue = int(equipment.get("days_overdue", 0))
                    await self.alert_service.emit_equipment_maintenance_due(
                        tenant_id=owner,
                        equipment_id=equipment_id,
                        equipment_name=equipment.get("name", "Unknown Equipment"),
                        equipment_type=equipment.get("type", "unknown"),
                        last_maintenance_date=equipment.get("last_maintenance_date"),
                        days_overdue=days_overdue,
                    )
                    self._remember_alert(alert_key)
                    alerts_generated += 1
                except Exception as e:
                    fields = {"equipment_id": equipment.get("id", "unknown"), "error": str(e)}
                    if tenant_id is not None:
                        fields["tenant_id"] = str(tenant_id)
                    logger.error("Error emitting equipment maintenance alert", **fields)
                    continue
            return alerts_generated
        except Exception as e:
            if tenant_id is not None:
                logger.error(
                    "Error checking equipment maintenance for tenant",
                    tenant_id=str(tenant_id),
                    error=str(e),
                )
            else:
                logger.error("Error checking equipment maintenance", error=str(e))
            return 0

    async def _check_batch_start_delays(
        self,
        batch_repo: ProductionBatchRepository,
        tenant_id: Optional[UUID] = None,
    ) -> int:
        """
        Check for batches that should have started but haven't.

        Args:
            batch_repo: Production batch repository
            tenant_id: Optional tenant scope; all tenants when None.

        Returns:
            int: Number of start delay alerts generated
        """
        try:
            if tenant_id is not None:
                delayed_start_batches = await batch_repo.get_batches_with_delayed_start(tenant_id)
                logger.info(
                    "Found batches with delayed start for tenant",
                    count=len(delayed_start_batches),
                    tenant_id=str(tenant_id),
                )
            else:
                delayed_start_batches = await batch_repo.get_batches_with_delayed_start()
                logger.info(
                    "Found batches with delayed start",
                    count=len(delayed_start_batches),
                )
            self._maybe_clear_alert_cache()
            alerts_generated = 0
            for batch in delayed_start_batches:
                try:
                    batch_id = UUID(str(batch["id"]))
                    alert_key = f"start_delay:{batch_id}"
                    if self._already_emitted(alert_key):
                        logger.debug(
                            "Skipping duplicate start delay alert",
                            batch_id=str(batch_id),
                        )
                        continue
                    owner = tenant_id if tenant_id is not None else UUID(str(batch["tenant_id"]))
                    scheduled_start = batch.get("scheduled_start_time")
                    await self.alert_service.emit_batch_start_delayed(
                        tenant_id=owner,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        scheduled_start=scheduled_start,
                        delay_reason="Batch has not started on time",
                    )
                    self._remember_alert(alert_key)
                    alerts_generated += 1
                except Exception as e:
                    fields = {"batch_id": batch.get("id", "unknown"), "error": str(e)}
                    if tenant_id is not None:
                        fields["tenant_id"] = str(tenant_id)
                    logger.error("Error emitting batch start delay alert", **fields)
                    continue
            return alerts_generated
        except Exception as e:
            if tenant_id is not None:
                logger.error(
                    "Error checking batch start delays for tenant",
                    tenant_id=str(tenant_id),
                    error=str(e),
                )
            else:
                logger.error("Error checking batch start delays", error=str(e))
            return 0

    async def _check_production_delays_for_tenant(
        self, batch_repo: ProductionBatchRepository, tenant_id: UUID
    ) -> int:
        """Backwards-compatible wrapper: tenant-scoped production-delay check."""
        return await self._check_production_delays(batch_repo, tenant_id)

    async def _check_equipment_maintenance_for_tenant(
        self, equipment_repo: EquipmentRepository, tenant_id: UUID
    ) -> int:
        """Backwards-compatible wrapper: tenant-scoped maintenance check."""
        return await self._check_equipment_maintenance(equipment_repo, tenant_id)

    async def _check_batch_start_delays_for_tenant(
        self, batch_repo: ProductionBatchRepository, tenant_id: UUID
    ) -> int:
        """Backwards-compatible wrapper: tenant-scoped start-delay check."""
        return await self._check_batch_start_delays(batch_repo, tenant_id)

    async def trigger_manual_check(self, tenant_id: Optional[UUID] = None) -> Dict[str, Any]:
        """
        Manually trigger production alert checks for a specific tenant or all tenants.

        Args:
            tenant_id: Optional tenant ID to check. If None, checks all tenants.

        Returns:
            Dict with alert generation results (``success``, ``tenant_id``,
            ``alerts_generated``, ``timestamp``, and ``message`` or ``error``).
        """
        logger.info(
            "Manually triggering production alert checks",
            tenant_id=str(tenant_id) if tenant_id else "all_tenants",
        )
        try:
            if tenant_id:
                alerts_generated = await self.check_all_conditions_for_tenant(tenant_id)
            else:
                alerts_generated = await self.check_all_conditions()
            return {
                "success": True,
                "tenant_id": str(tenant_id) if tenant_id else None,
                "alerts_generated": alerts_generated,
                "timestamp": datetime.now().isoformat(),
                "message": "Production alert checks completed successfully",
            }
        except Exception as e:
            logger.error(
                "Error during manual production alert check",
                error=str(e),
                exc_info=True,
            )
            return {
                "success": False,
                "tenant_id": str(tenant_id) if tenant_id else None,
                "alerts_generated": 0,
                "timestamp": datetime.now().isoformat(),
                "error": str(e),
            }