"""
|
|
Production Scheduler Service
|
|
Background task that periodically checks for production alert conditions
|
|
and triggers appropriate alerts.
|
|
|
|
Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
|
|
"""
|
|
|
|
import asyncio
|
|
from typing import Dict, Any, List, Optional
|
|
from uuid import UUID
|
|
from datetime import datetime, timedelta
|
|
import structlog
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import text
|
|
|
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
|
from apscheduler.triggers.interval import IntervalTrigger
|
|
|
|
from app.repositories.production_batch_repository import ProductionBatchRepository
|
|
from app.repositories.equipment_repository import EquipmentRepository
|
|
from app.services.production_alert_service import ProductionAlertService
|
|
|
|
logger = structlog.get_logger()
|
|
|
|
|
|
class ProductionScheduler:
    """Production scheduler service that checks for alert conditions.

    Uses Redis-based leader election to ensure only one pod runs the scheduler.
    """

    def __init__(self, alert_service: ProductionAlertService, database_manager: Any, redis_url: Optional[str] = None):
        self.alert_service = alert_service
        self.database_manager = database_manager
        self.redis_url = redis_url
        self.scheduler = AsyncIOScheduler()
        self.check_interval = 300  # 5 minutes
        self.job_id = 'production_scheduler'

        # Leader election
        self._leader_election = None
        self._redis_client = None
        self._scheduler_started = False

        # Cache of emitted alerts, used to avoid duplicates
        self._emitted_alerts: set = set()
        self._alert_cache_ttl = 3600  # 1 hour
        self._last_cache_clear = datetime.utcnow()

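    # Note on the dedup cache above: it is an in-process set keyed by strings
    # such as "delay:<batch_id>" and "start_delay:<batch_id>", cleared wholesale
    # once _alert_cache_ttl elapses. It resets on pod restart or leader
    # failover, so a duplicate alert after such an event is possible; a shared
    # Redis-backed cache would be needed for stronger guarantees (an option,
    # not implemented here).
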
    async def start(self):
        """Start the production scheduler with leader election"""
        try:
            # Initialize leader election if a Redis URL is provided
            if self.redis_url:
                await self._setup_leader_election()
            else:
                # No Redis: start the scheduler directly (standalone mode)
                logger.warning("No Redis URL provided, starting scheduler in standalone mode")
                await self._start_scheduler()
        except Exception as e:
            logger.error("Failed to set up leader election, starting in standalone mode",
                         error=str(e))
            await self._start_scheduler()

    async def _setup_leader_election(self):
        """Set up Redis-based leader election"""
        from shared.leader_election import LeaderElectionService
        import redis.asyncio as redis

        self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
        await self._redis_client.ping()

        self._leader_election = LeaderElectionService(
            self._redis_client,
            service_name="production-scheduler"
        )

        await self._leader_election.start(
            on_become_leader=self._on_become_leader,
            on_lose_leader=self._on_lose_leader
        )

        logger.info("Leader election initialized for production scheduler",
                    is_leader=self._leader_election.is_leader)

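    # For reference: Redis leader election of this kind is typically built on
    # an atomic `SET key value NX PX <ttl>` lock that the leader periodically
    # renews and releases on shutdown. Illustrative sketch only; the actual
    # implementation lives in shared.leader_election and may differ:
    #
    #     acquired = await redis_client.set(
    #         "leader:production-scheduler", pod_id, nx=True, px=30_000
    #     )
    #     # while leader: await redis_client.pexpire(key, 30_000) every ~10s
    #     # on shutdown: delete the key only if it still holds this pod_id
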
    async def _on_become_leader(self):
        """Called when this instance becomes the leader"""
        logger.info("Became leader for production scheduler - starting scheduler")
        await self._start_scheduler()

    async def _on_lose_leader(self):
        """Called when this instance loses leadership"""
        logger.warning("Lost leadership for production scheduler - stopping scheduler")
        await self._stop_scheduler()

    async def _start_scheduler(self):
        """Start the APScheduler"""
        if self._scheduler_started:
            logger.debug("Production scheduler already started")
            return

        if not self.scheduler.running:
            trigger = IntervalTrigger(seconds=self.check_interval)
            self.scheduler.add_job(
                self._run_scheduler_task,
                trigger=trigger,
                id=self.job_id,
                name="Production Alert Checks",
                max_instances=1
            )

            self.scheduler.start()
            self._scheduler_started = True
            logger.info("Production scheduler started", interval_seconds=self.check_interval)

    async def _stop_scheduler(self):
        """Stop the APScheduler"""
        if not self._scheduler_started:
            return

        if self.scheduler.running:
            self.scheduler.shutdown(wait=False)
            self._scheduler_started = False
            logger.info("Production scheduler stopped")

    async def stop(self):
        """Stop the production scheduler and leader election"""
        if self._leader_election:
            await self._leader_election.stop()

        await self._stop_scheduler()

        if self._redis_client:
            await self._redis_client.close()

    @property
    def is_leader(self) -> bool:
        """Check if this instance is the leader (always True in standalone mode)"""
        return self._leader_election.is_leader if self._leader_election else True

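    # A health or readiness endpoint can surface this flag, e.g. (hypothetical
    # handler, not part of this module):
    #
    #     @app.get("/health/scheduler")
    #     async def scheduler_health():
    #         return {"scheduler_leader": production_scheduler.is_leader}
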
    async def _run_scheduler_task(self):
        """Run scheduled production alert checks"""
        start_time = datetime.now()
        logger.info("Running scheduled production alert checks")

        try:
            # Run all alert checks
            alerts_generated = await self.check_all_conditions()

            duration = (datetime.now() - start_time).total_seconds()
            logger.info(
                "Completed scheduled production alert checks",
                alerts_generated=alerts_generated,
                duration_seconds=round(duration, 2)
            )

        except Exception as e:
            logger.error(
                "Error in production scheduler task",
                error=str(e),
                exc_info=True
            )

    async def check_all_conditions(self) -> int:
        """
        Check all production alert conditions and trigger alerts.

        Returns:
            int: Total number of alerts generated
        """
        if not self.database_manager:
            logger.warning("Database manager not available for production checks")
            return 0

        total_alerts = 0

        try:
            async with self.database_manager.get_session() as session:
                # Get repositories
                batch_repo = ProductionBatchRepository(session)
                equipment_repo = EquipmentRepository(session)

                # Check production delays
                delay_alerts = await self._check_production_delays(batch_repo)
                total_alerts += delay_alerts

                # Check equipment maintenance
                maintenance_alerts = await self._check_equipment_maintenance(equipment_repo)
                total_alerts += maintenance_alerts

                # Check batch start delays (batches that should have started but haven't)
                start_delay_alerts = await self._check_batch_start_delays(batch_repo)
                total_alerts += start_delay_alerts

                logger.info(
                    "Production alert checks completed",
                    total_alerts=total_alerts,
                    production_delays=delay_alerts,
                    equipment_maintenance=maintenance_alerts,
                    batch_start_delays=start_delay_alerts
                )

        except Exception as e:
            logger.error(
                "Error during production alert checks",
                error=str(e),
                exc_info=True
            )

        return total_alerts

    async def _check_production_delays(self, batch_repo: ProductionBatchRepository) -> int:
        """
        Check for production delays and trigger alerts.

        Args:
            batch_repo: Production batch repository

        Returns:
            int: Number of delay alerts generated
        """
        try:
            # Get delayed batches from repository
            delayed_batches = await batch_repo.get_production_delays()

            logger.info("Found delayed production batches", count=len(delayed_batches))

            # Clear the dedup cache if its TTL has expired
            if (datetime.utcnow() - self._last_cache_clear).total_seconds() > self._alert_cache_ttl:
                self._emitted_alerts.clear()
                self._last_cache_clear = datetime.utcnow()
                logger.info("Cleared alert cache due to TTL expiration")

            alerts_generated = 0

            for batch in delayed_batches:
                try:
                    batch_id = UUID(str(batch["id"]))

                    # Skip if we already emitted an alert for this batch
                    alert_key = f"delay:{batch_id}"
                    if alert_key in self._emitted_alerts:
                        logger.debug("Skipping duplicate delay alert", batch_id=str(batch_id))
                        continue

                    tenant_id = UUID(str(batch["tenant_id"]))
                    delay_minutes = int(batch["delay_minutes"]) if batch.get("delay_minutes") else 0
                    affected_orders = int(batch.get("affected_orders", 0))

                    # Emit production delay alert
                    await self.alert_service.emit_production_delay(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        delay_minutes=delay_minutes,
                        affected_orders=affected_orders
                    )

                    # Record the alert in the dedup cache
                    self._emitted_alerts.add(alert_key)
                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting production delay alert",
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking production delays", error=str(e))
            return 0

    async def _check_equipment_maintenance(self, equipment_repo: EquipmentRepository) -> int:
        """
        Check for equipment needing maintenance and trigger alerts.

        Args:
            equipment_repo: Equipment repository

        Returns:
            int: Number of maintenance alerts generated
        """
        try:
            # Get equipment that needs maintenance using the repository method
            equipment_needing_maintenance = await equipment_repo.get_equipment_needing_maintenance()

            logger.info(
                "Found equipment needing maintenance",
                count=len(equipment_needing_maintenance)
            )

            alerts_generated = 0

            for equipment in equipment_needing_maintenance:
                try:
                    equipment_id = UUID(equipment["id"])
                    tenant_id = UUID(equipment["tenant_id"])
                    days_overdue = int(equipment.get("days_overdue", 0))

                    # Emit equipment maintenance alert
                    await self.alert_service.emit_equipment_maintenance_due(
                        tenant_id=tenant_id,
                        equipment_id=equipment_id,
                        equipment_name=equipment.get("name", "Unknown Equipment"),
                        equipment_type=equipment.get("type", "unknown"),
                        last_maintenance_date=equipment.get("last_maintenance_date"),
                        days_overdue=days_overdue
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting equipment maintenance alert",
                        equipment_id=equipment.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking equipment maintenance", error=str(e))
            return 0

    async def _check_batch_start_delays(self, batch_repo: ProductionBatchRepository) -> int:
        """
        Check for batches that should have started but haven't.

        Args:
            batch_repo: Production batch repository

        Returns:
            int: Number of start delay alerts generated
        """
        try:
            # Get batches that should have started using the repository method
            delayed_start_batches = await batch_repo.get_batches_with_delayed_start()

            logger.info(
                "Found batches with delayed start",
                count=len(delayed_start_batches)
            )

            alerts_generated = 0

            for batch in delayed_start_batches:
                try:
                    batch_id = UUID(batch["id"])

                    # Skip if we already emitted an alert for this batch
                    alert_key = f"start_delay:{batch_id}"
                    if alert_key in self._emitted_alerts:
                        logger.debug("Skipping duplicate start delay alert", batch_id=str(batch_id))
                        continue

                    tenant_id = UUID(batch["tenant_id"])
                    scheduled_start = batch.get("scheduled_start_time")

                    # Emit batch start delayed alert
                    await self.alert_service.emit_batch_start_delayed(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        scheduled_start=scheduled_start,
                        delay_reason="Batch has not started on time"
                    )

                    # Record the alert in the dedup cache
                    self._emitted_alerts.add(alert_key)
                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting batch start delay alert",
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking batch start delays", error=str(e))
            return 0

    async def trigger_manual_check(self, tenant_id: Optional[UUID] = None) -> Dict[str, Any]:
        """
        Manually trigger production alert checks for a specific tenant or all tenants.

        Args:
            tenant_id: Optional tenant ID to check. If None, checks all tenants.

        Returns:
            Dict with alert generation results
        """
        logger.info(
            "Manually triggering production alert checks",
            tenant_id=str(tenant_id) if tenant_id else "all_tenants"
        )

        try:
            if tenant_id:
                # Run tenant-specific alert checks
                alerts_generated = await self.check_all_conditions_for_tenant(tenant_id)
            else:
                # Run all alert checks across all tenants
                alerts_generated = await self.check_all_conditions()

            return {
                "success": True,
                "tenant_id": str(tenant_id) if tenant_id else None,
                "alerts_generated": alerts_generated,
                "timestamp": datetime.now().isoformat(),
                "message": "Production alert checks completed successfully"
            }

        except Exception as e:
            logger.error(
                "Error during manual production alert check",
                error=str(e),
                exc_info=True
            )
            return {
                "success": False,
                "tenant_id": str(tenant_id) if tenant_id else None,
                "alerts_generated": 0,
                "timestamp": datetime.now().isoformat(),
                "error": str(e)
            }

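    # Hedged sketch of how an admin API might expose the manual trigger
    # (hypothetical route and dependency wiring, not part of this module):
    #
    #     @router.post("/admin/production-checks")
    #     async def run_production_checks(tenant_id: Optional[UUID] = None):
    #         return await production_scheduler.trigger_manual_check(tenant_id)
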
    async def check_all_conditions_for_tenant(self, tenant_id: UUID) -> int:
        """
        Check all production alert conditions for a specific tenant and trigger alerts.

        Args:
            tenant_id: Tenant ID to check conditions for

        Returns:
            int: Total number of alerts generated
        """
        if not self.database_manager:
            logger.warning("Database manager not available for production checks")
            return 0

        total_alerts = 0

        try:
            async with self.database_manager.get_session() as session:
                # Get repositories
                batch_repo = ProductionBatchRepository(session)
                equipment_repo = EquipmentRepository(session)

                # Check production delays for the specific tenant
                delay_alerts = await self._check_production_delays_for_tenant(batch_repo, tenant_id)
                total_alerts += delay_alerts

                # Check equipment maintenance for the specific tenant
                maintenance_alerts = await self._check_equipment_maintenance_for_tenant(equipment_repo, tenant_id)
                total_alerts += maintenance_alerts

                # Check batch start delays for the specific tenant
                start_delay_alerts = await self._check_batch_start_delays_for_tenant(batch_repo, tenant_id)
                total_alerts += start_delay_alerts

                logger.info(
                    "Tenant-specific production alert checks completed",
                    tenant_id=str(tenant_id),
                    total_alerts=total_alerts,
                    production_delays=delay_alerts,
                    equipment_maintenance=maintenance_alerts,
                    batch_start_delays=start_delay_alerts
                )

        except Exception as e:
            logger.error(
                "Error during tenant-specific production alert checks",
                tenant_id=str(tenant_id),
                error=str(e),
                exc_info=True
            )

        return total_alerts

    async def _check_production_delays_for_tenant(self, batch_repo: ProductionBatchRepository, tenant_id: UUID) -> int:
        """
        Check for production delays for a specific tenant and trigger alerts.

        Args:
            batch_repo: Production batch repository
            tenant_id: Tenant ID to check for

        Returns:
            int: Number of delay alerts generated
        """
        try:
            # Get delayed batches for the specific tenant using the repository method
            delayed_batches = await batch_repo.get_production_delays(tenant_id)

            logger.info("Found delayed production batches for tenant", count=len(delayed_batches), tenant_id=str(tenant_id))

            alerts_generated = 0

            for batch in delayed_batches:
                try:
                    batch_id = UUID(str(batch["id"]))
                    delay_minutes = int(batch["delay_minutes"]) if batch.get("delay_minutes") else 0
                    affected_orders = int(batch.get("affected_orders", 0))

                    # Emit production delay alert
                    await self.alert_service.emit_production_delay(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        delay_minutes=delay_minutes,
                        affected_orders=affected_orders
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting production delay alert",
                        tenant_id=str(tenant_id),
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking production delays for tenant", tenant_id=str(tenant_id), error=str(e))
            return 0

    async def _check_equipment_maintenance_for_tenant(self, equipment_repo: EquipmentRepository, tenant_id: UUID) -> int:
        """
        Check for equipment needing maintenance for a specific tenant and trigger alerts.

        Args:
            equipment_repo: Equipment repository
            tenant_id: Tenant ID to check for

        Returns:
            int: Number of maintenance alerts generated
        """
        try:
            # Get equipment that needs maintenance for the specific tenant using the repository method
            equipment_needing_maintenance = await equipment_repo.get_equipment_needing_maintenance(tenant_id)

            logger.info(
                "Found equipment needing maintenance for tenant",
                count=len(equipment_needing_maintenance),
                tenant_id=str(tenant_id)
            )

            alerts_generated = 0

            for equipment in equipment_needing_maintenance:
                try:
                    equipment_id = UUID(equipment["id"])
                    days_overdue = int(equipment.get("days_overdue", 0))

                    # Emit equipment maintenance alert
                    await self.alert_service.emit_equipment_maintenance_due(
                        tenant_id=tenant_id,
                        equipment_id=equipment_id,
                        equipment_name=equipment.get("name", "Unknown Equipment"),
                        equipment_type=equipment.get("type", "unknown"),
                        last_maintenance_date=equipment.get("last_maintenance_date"),
                        days_overdue=days_overdue
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting equipment maintenance alert",
                        tenant_id=str(tenant_id),
                        equipment_id=equipment.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking equipment maintenance for tenant", tenant_id=str(tenant_id), error=str(e))
            return 0

    async def _check_batch_start_delays_for_tenant(self, batch_repo: ProductionBatchRepository, tenant_id: UUID) -> int:
        """
        Check for batches that should have started but haven't for a specific tenant.

        Args:
            batch_repo: Production batch repository
            tenant_id: Tenant ID to check for

        Returns:
            int: Number of start delay alerts generated
        """
        try:
            # Get batches that should have started for the specific tenant using the repository method
            delayed_start_batches = await batch_repo.get_batches_with_delayed_start(tenant_id)

            logger.info(
                "Found batches with delayed start for tenant",
                count=len(delayed_start_batches),
                tenant_id=str(tenant_id)
            )

            alerts_generated = 0

            for batch in delayed_start_batches:
                try:
                    batch_id = UUID(batch["id"])
                    scheduled_start = batch.get("scheduled_start_time")

                    # Emit batch start delayed alert
                    await self.alert_service.emit_batch_start_delayed(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        scheduled_start=scheduled_start,
                        delay_reason="Batch has not started on time"
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting batch start delay alert",
                        tenant_id=str(tenant_id),
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking batch start delays for tenant", tenant_id=str(tenant_id), error=str(e))
            return 0
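

if __name__ == "__main__":
    # Minimal usage sketch (assumption: run inside the app environment so the
    # app.* imports above resolve). It exercises standalone mode with an
    # inline stub in place of the real ProductionAlertService; a deployment
    # would instead inject the real alert service, a database manager, and a
    # redis_url so leader election is active.
    import asyncio

    class _StubAlertService:
        async def emit_production_delay(self, **kwargs):
            pass

        async def emit_equipment_maintenance_due(self, **kwargs):
            pass

        async def emit_batch_start_delayed(self, **kwargs):
            pass

    async def _demo() -> None:
        scheduler = ProductionScheduler(
            alert_service=_StubAlertService(),  # stub, not the real service
            database_manager=None,  # without a DB manager the checks are no-ops
        )
        await scheduler.start()  # no redis_url, so this runs in standalone mode
        print("is_leader:", scheduler.is_leader)  # True in standalone mode
        print(await scheduler.trigger_manual_check())
        await scheduler.stop()

    asyncio.run(_demo())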