"""
|
|
Production Scheduler Service
|
|
Background task that periodically checks for production alert conditions
|
|
and triggers appropriate alerts.
|
|
|
|
Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
|
|
"""
|
|
|
|
import asyncio
|
|
from typing import Dict, Any, List, Optional
|
|
from uuid import UUID
|
|
from datetime import datetime, timedelta
|
|
import structlog
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
from sqlalchemy import text
|
|
|
|
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
|
from apscheduler.triggers.interval import IntervalTrigger
|
|
|
|
from app.repositories.production_batch_repository import ProductionBatchRepository
|
|
from app.repositories.equipment_repository import EquipmentRepository
|
|
from app.services.production_alert_service import ProductionAlertService
|
|
|
|
logger = structlog.get_logger()
|
|
|
|
|
|
class ProductionScheduler:
    """Production scheduler service that checks for alert conditions.

    Uses Redis-based leader election to ensure only one pod runs the scheduler.
    """

    def __init__(self, alert_service: ProductionAlertService, database_manager: Any, redis_url: Optional[str] = None):
        self.alert_service = alert_service
        self.database_manager = database_manager
        self.redis_url = redis_url
        self.scheduler = AsyncIOScheduler()
        self.check_interval = 300  # 5 minutes
        self.job_id = 'production_scheduler'

        # Leader election
        self._leader_election = None
        self._redis_client = None
        self._scheduler_started = False

        # Cache of emitted alerts, used to avoid duplicates
        self._emitted_alerts: set = set()
        self._alert_cache_ttl = 3600  # 1 hour
        self._last_cache_clear = datetime.utcnow()

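    # Note on the dedup cache above: it is an in-process set keyed by strings
    # such as "delay:<batch_id>" and "start_delay:<batch_id>", cleared wholesale
    # once _alert_cache_ttl elapses. It resets on pod restart or leader
    # failover, so a duplicate alert after such an event is possible; a shared
    # Redis-backed cache would be needed for stronger guarantees (an option,
    # not implemented here).
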
    async def start(self):
        """Start the production scheduler with leader election"""
        try:
            # Initialize leader election if a Redis URL is provided
            if self.redis_url:
                await self._setup_leader_election()
            else:
                # No Redis: start the scheduler directly (standalone mode)
                logger.warning("No Redis URL provided, starting scheduler in standalone mode")
                await self._start_scheduler()
        except Exception as e:
            logger.error("Failed to set up leader election, starting in standalone mode",
                         error=str(e))
            await self._start_scheduler()

    async def _setup_leader_election(self):
        """Set up Redis-based leader election"""
        from shared.leader_election import LeaderElectionService
        import redis.asyncio as redis

        self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
        await self._redis_client.ping()

        self._leader_election = LeaderElectionService(
            self._redis_client,
            service_name="production-scheduler"
        )

        await self._leader_election.start(
            on_become_leader=self._on_become_leader,
            on_lose_leader=self._on_lose_leader
        )

        logger.info("Leader election initialized for production scheduler",
                    is_leader=self._leader_election.is_leader)

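    # For reference: Redis leader election of this kind is typically built on
    # an atomic `SET key value NX PX <ttl>` lock that the leader periodically
    # renews and releases on shutdown. Illustrative sketch only; the actual
    # implementation lives in shared.leader_election and may differ:
    #
    #     acquired = await redis_client.set(
    #         "leader:production-scheduler", pod_id, nx=True, px=30_000
    #     )
    #     # while leader: await redis_client.pexpire(key, 30_000) every ~10s
    #     # on shutdown: delete the key only if it still holds this pod_id
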
    async def _on_become_leader(self):
        """Called when this instance becomes the leader"""
        logger.info("Became leader for production scheduler - starting scheduler")
        await self._start_scheduler()

    async def _on_lose_leader(self):
        """Called when this instance loses leadership"""
        logger.warning("Lost leadership for production scheduler - stopping scheduler")
        await self._stop_scheduler()

    async def _start_scheduler(self):
        """Start the APScheduler"""
        if self._scheduler_started:
            logger.debug("Production scheduler already started")
            return

        if not self.scheduler.running:
            trigger = IntervalTrigger(seconds=self.check_interval)
            self.scheduler.add_job(
                self._run_scheduler_task,
                trigger=trigger,
                id=self.job_id,
                name="Production Alert Checks",
                max_instances=1
            )

            self.scheduler.start()
            self._scheduler_started = True
            logger.info("Production scheduler started", interval_seconds=self.check_interval)

    async def _stop_scheduler(self):
        """Stop the APScheduler"""
        if not self._scheduler_started:
            return

        if self.scheduler.running:
            self.scheduler.shutdown(wait=False)
            self._scheduler_started = False
            logger.info("Production scheduler stopped")

    async def stop(self):
        """Stop the production scheduler and leader election"""
        if self._leader_election:
            await self._leader_election.stop()

        await self._stop_scheduler()

        if self._redis_client:
            await self._redis_client.close()

    @property
    def is_leader(self) -> bool:
        """Check if this instance is the leader (always True in standalone mode)"""
        return self._leader_election.is_leader if self._leader_election else True

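    # A health or readiness endpoint can surface this flag, e.g. (hypothetical
    # handler, not part of this module):
    #
    #     @app.get("/health/scheduler")
    #     async def scheduler_health():
    #         return {"scheduler_leader": production_scheduler.is_leader}
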
    async def _run_scheduler_task(self):
        """Run scheduled production alert checks"""
        start_time = datetime.now()
        logger.info("Running scheduled production alert checks")

        try:
            # Run all alert checks
            alerts_generated = await self.check_all_conditions()

            duration = (datetime.now() - start_time).total_seconds()
            logger.info(
                "Completed scheduled production alert checks",
                alerts_generated=alerts_generated,
                duration_seconds=round(duration, 2)
            )

        except Exception as e:
            logger.error(
                "Error in production scheduler task",
                error=str(e),
                exc_info=True
            )

    async def check_all_conditions(self) -> int:
        """
        Check all production alert conditions and trigger alerts.

        Returns:
            int: Total number of alerts generated
        """
        if not self.database_manager:
            logger.warning("Database manager not available for production checks")
            return 0

        total_alerts = 0

        try:
            async with self.database_manager.get_session() as session:
                # Get repositories
                batch_repo = ProductionBatchRepository(session)
                equipment_repo = EquipmentRepository(session)

                # Check production delays
                delay_alerts = await self._check_production_delays(batch_repo)
                total_alerts += delay_alerts

                # Check equipment maintenance
                maintenance_alerts = await self._check_equipment_maintenance(equipment_repo)
                total_alerts += maintenance_alerts

                # Check batch start delays (batches that should have started but haven't)
                start_delay_alerts = await self._check_batch_start_delays(batch_repo)
                total_alerts += start_delay_alerts

                logger.info(
                    "Production alert checks completed",
                    total_alerts=total_alerts,
                    production_delays=delay_alerts,
                    equipment_maintenance=maintenance_alerts,
                    batch_start_delays=start_delay_alerts
                )

        except Exception as e:
            logger.error(
                "Error during production alert checks",
                error=str(e),
                exc_info=True
            )

        return total_alerts

    async def _check_production_delays(self, batch_repo: ProductionBatchRepository) -> int:
        """
        Check for production delays and trigger alerts.

        Args:
            batch_repo: Production batch repository

        Returns:
            int: Number of delay alerts generated
        """
        try:
            # Get delayed batches from repository
            delayed_batches = await batch_repo.get_production_delays()

            logger.info("Found delayed production batches", count=len(delayed_batches))

            # Clear the dedup cache if its TTL has expired
            if (datetime.utcnow() - self._last_cache_clear).total_seconds() > self._alert_cache_ttl:
                self._emitted_alerts.clear()
                self._last_cache_clear = datetime.utcnow()
                logger.info("Cleared alert cache due to TTL expiration")

            alerts_generated = 0

            for batch in delayed_batches:
                try:
                    batch_id = UUID(str(batch["id"]))

                    # Skip if we already emitted an alert for this batch
                    alert_key = f"delay:{batch_id}"
                    if alert_key in self._emitted_alerts:
                        logger.debug("Skipping duplicate delay alert", batch_id=str(batch_id))
                        continue

                    tenant_id = UUID(str(batch["tenant_id"]))
                    delay_minutes = int(batch["delay_minutes"]) if batch.get("delay_minutes") else 0
                    affected_orders = int(batch.get("affected_orders", 0))

                    # Emit production delay alert
                    await self.alert_service.emit_production_delay(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        delay_minutes=delay_minutes,
                        affected_orders=affected_orders
                    )

                    # Record the alert in the dedup cache
                    self._emitted_alerts.add(alert_key)
                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting production delay alert",
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking production delays", error=str(e))
            return 0

    async def _check_equipment_maintenance(self, equipment_repo: EquipmentRepository) -> int:
        """
        Check for equipment needing maintenance and trigger alerts.

        Args:
            equipment_repo: Equipment repository

        Returns:
            int: Number of maintenance alerts generated
        """
        try:
            # Get equipment that needs maintenance using the repository method
            equipment_needing_maintenance = await equipment_repo.get_equipment_needing_maintenance()

            logger.info(
                "Found equipment needing maintenance",
                count=len(equipment_needing_maintenance)
            )

            alerts_generated = 0

            for equipment in equipment_needing_maintenance:
                try:
                    equipment_id = UUID(equipment["id"])
                    tenant_id = UUID(equipment["tenant_id"])
                    days_overdue = int(equipment.get("days_overdue", 0))

                    # Emit equipment maintenance alert
                    await self.alert_service.emit_equipment_maintenance_due(
                        tenant_id=tenant_id,
                        equipment_id=equipment_id,
                        equipment_name=equipment.get("name", "Unknown Equipment"),
                        equipment_type=equipment.get("type", "unknown"),
                        last_maintenance_date=equipment.get("last_maintenance_date"),
                        days_overdue=days_overdue
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting equipment maintenance alert",
                        equipment_id=equipment.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking equipment maintenance", error=str(e))
            return 0

    async def _check_batch_start_delays(self, batch_repo: ProductionBatchRepository) -> int:
        """
        Check for batches that should have started but haven't.

        Args:
            batch_repo: Production batch repository

        Returns:
            int: Number of start delay alerts generated
        """
        try:
            # Get batches that should have started using the repository method
            delayed_start_batches = await batch_repo.get_batches_with_delayed_start()

            logger.info(
                "Found batches with delayed start",
                count=len(delayed_start_batches)
            )

            alerts_generated = 0

            for batch in delayed_start_batches:
                try:
                    batch_id = UUID(batch["id"])

                    # Skip if we already emitted an alert for this batch
                    alert_key = f"start_delay:{batch_id}"
                    if alert_key in self._emitted_alerts:
                        logger.debug("Skipping duplicate start delay alert", batch_id=str(batch_id))
                        continue

                    tenant_id = UUID(batch["tenant_id"])
                    scheduled_start = batch.get("scheduled_start_time")

                    # Emit batch start delayed alert
                    await self.alert_service.emit_batch_start_delayed(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        scheduled_start=scheduled_start,
                        delay_reason="Batch has not started on time"
                    )

                    # Record the alert in the dedup cache
                    self._emitted_alerts.add(alert_key)
                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting batch start delay alert",
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking batch start delays", error=str(e))
            return 0

    async def trigger_manual_check(self, tenant_id: Optional[UUID] = None) -> Dict[str, Any]:
        """
        Manually trigger production alert checks for a specific tenant or all tenants.

        Args:
            tenant_id: Optional tenant ID to check. If None, checks all tenants.

        Returns:
            Dict with alert generation results
        """
        logger.info(
            "Manually triggering production alert checks",
            tenant_id=str(tenant_id) if tenant_id else "all_tenants"
        )

        try:
            if tenant_id:
                # Run tenant-specific alert checks
                alerts_generated = await self.check_all_conditions_for_tenant(tenant_id)
            else:
                # Run all alert checks across all tenants
                alerts_generated = await self.check_all_conditions()

            return {
                "success": True,
                "tenant_id": str(tenant_id) if tenant_id else None,
                "alerts_generated": alerts_generated,
                "timestamp": datetime.now().isoformat(),
                "message": "Production alert checks completed successfully"
            }

        except Exception as e:
            logger.error(
                "Error during manual production alert check",
                error=str(e),
                exc_info=True
            )
            return {
                "success": False,
                "tenant_id": str(tenant_id) if tenant_id else None,
                "alerts_generated": 0,
                "timestamp": datetime.now().isoformat(),
                "error": str(e)
            }

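    # Hedged sketch of how an admin API might expose the manual trigger
    # (hypothetical route and dependency wiring, not part of this module):
    #
    #     @router.post("/admin/production-checks")
    #     async def run_production_checks(tenant_id: Optional[UUID] = None):
    #         return await production_scheduler.trigger_manual_check(tenant_id)
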
    async def check_all_conditions_for_tenant(self, tenant_id: UUID) -> int:
        """
        Check all production alert conditions for a specific tenant and trigger alerts.

        Args:
            tenant_id: Tenant ID to check conditions for

        Returns:
            int: Total number of alerts generated
        """
        if not self.database_manager:
            logger.warning("Database manager not available for production checks")
            return 0

        total_alerts = 0

        try:
            async with self.database_manager.get_session() as session:
                # Get repositories
                batch_repo = ProductionBatchRepository(session)
                equipment_repo = EquipmentRepository(session)

                # Check production delays for the specific tenant
                delay_alerts = await self._check_production_delays_for_tenant(batch_repo, tenant_id)
                total_alerts += delay_alerts

                # Check equipment maintenance for the specific tenant
                maintenance_alerts = await self._check_equipment_maintenance_for_tenant(equipment_repo, tenant_id)
                total_alerts += maintenance_alerts

                # Check batch start delays for the specific tenant
                start_delay_alerts = await self._check_batch_start_delays_for_tenant(batch_repo, tenant_id)
                total_alerts += start_delay_alerts

                logger.info(
                    "Tenant-specific production alert checks completed",
                    tenant_id=str(tenant_id),
                    total_alerts=total_alerts,
                    production_delays=delay_alerts,
                    equipment_maintenance=maintenance_alerts,
                    batch_start_delays=start_delay_alerts
                )

        except Exception as e:
            logger.error(
                "Error during tenant-specific production alert checks",
                tenant_id=str(tenant_id),
                error=str(e),
                exc_info=True
            )

        return total_alerts

    async def _check_production_delays_for_tenant(self, batch_repo: ProductionBatchRepository, tenant_id: UUID) -> int:
        """
        Check for production delays for a specific tenant and trigger alerts.

        Args:
            batch_repo: Production batch repository
            tenant_id: Tenant ID to check for

        Returns:
            int: Number of delay alerts generated
        """
        try:
            # Get delayed batches for the specific tenant using the repository method
            delayed_batches = await batch_repo.get_production_delays(tenant_id)

            logger.info("Found delayed production batches for tenant", count=len(delayed_batches), tenant_id=str(tenant_id))

            alerts_generated = 0

            for batch in delayed_batches:
                try:
                    batch_id = UUID(str(batch["id"]))
                    delay_minutes = int(batch["delay_minutes"]) if batch.get("delay_minutes") else 0
                    affected_orders = int(batch.get("affected_orders", 0))

                    # Emit production delay alert
                    await self.alert_service.emit_production_delay(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        delay_minutes=delay_minutes,
                        affected_orders=affected_orders
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting production delay alert",
                        tenant_id=str(tenant_id),
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking production delays for tenant", tenant_id=str(tenant_id), error=str(e))
            return 0

    async def _check_equipment_maintenance_for_tenant(self, equipment_repo: EquipmentRepository, tenant_id: UUID) -> int:
        """
        Check for equipment needing maintenance for a specific tenant and trigger alerts.

        Args:
            equipment_repo: Equipment repository
            tenant_id: Tenant ID to check for

        Returns:
            int: Number of maintenance alerts generated
        """
        try:
            # Get equipment that needs maintenance for the specific tenant using the repository method
            equipment_needing_maintenance = await equipment_repo.get_equipment_needing_maintenance(tenant_id)

            logger.info(
                "Found equipment needing maintenance for tenant",
                count=len(equipment_needing_maintenance),
                tenant_id=str(tenant_id)
            )

            alerts_generated = 0

            for equipment in equipment_needing_maintenance:
                try:
                    equipment_id = UUID(equipment["id"])
                    days_overdue = int(equipment.get("days_overdue", 0))

                    # Emit equipment maintenance alert
                    await self.alert_service.emit_equipment_maintenance_due(
                        tenant_id=tenant_id,
                        equipment_id=equipment_id,
                        equipment_name=equipment.get("name", "Unknown Equipment"),
                        equipment_type=equipment.get("type", "unknown"),
                        last_maintenance_date=equipment.get("last_maintenance_date"),
                        days_overdue=days_overdue
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting equipment maintenance alert",
                        tenant_id=str(tenant_id),
                        equipment_id=equipment.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking equipment maintenance for tenant", tenant_id=str(tenant_id), error=str(e))
            return 0

    async def _check_batch_start_delays_for_tenant(self, batch_repo: ProductionBatchRepository, tenant_id: UUID) -> int:
        """
        Check for batches that should have started but haven't for a specific tenant.

        Args:
            batch_repo: Production batch repository
            tenant_id: Tenant ID to check for

        Returns:
            int: Number of start delay alerts generated
        """
        try:
            # Get batches that should have started for the specific tenant using the repository method
            delayed_start_batches = await batch_repo.get_batches_with_delayed_start(tenant_id)

            logger.info(
                "Found batches with delayed start for tenant",
                count=len(delayed_start_batches),
                tenant_id=str(tenant_id)
            )

            alerts_generated = 0

            for batch in delayed_start_batches:
                try:
                    batch_id = UUID(batch["id"])
                    scheduled_start = batch.get("scheduled_start_time")

                    # Emit batch start delayed alert
                    await self.alert_service.emit_batch_start_delayed(
                        tenant_id=tenant_id,
                        batch_id=batch_id,
                        product_name=batch.get("product_name", "Unknown Product"),
                        batch_number=batch.get("batch_number", "Unknown Batch"),
                        scheduled_start=scheduled_start,
                        delay_reason="Batch has not started on time"
                    )

                    alerts_generated += 1

                except Exception as e:
                    logger.error(
                        "Error emitting batch start delay alert",
                        tenant_id=str(tenant_id),
                        batch_id=batch.get("id", "unknown"),
                        error=str(e)
                    )
                    continue

            return alerts_generated

        except Exception as e:
            logger.error("Error checking batch start delays for tenant", tenant_id=str(tenant_id), error=str(e))
            return 0
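

if __name__ == "__main__":
    # Minimal usage sketch (assumption: run inside the app environment so the
    # app.* imports above resolve). It exercises standalone mode with an
    # inline stub in place of the real ProductionAlertService; a deployment
    # would instead inject the real alert service, a database manager, and a
    # redis_url so leader election is active.
    import asyncio

    class _StubAlertService:
        async def emit_production_delay(self, **kwargs):
            pass

        async def emit_equipment_maintenance_due(self, **kwargs):
            pass

        async def emit_batch_start_delayed(self, **kwargs):
            pass

    async def _demo() -> None:
        scheduler = ProductionScheduler(
            alert_service=_StubAlertService(),  # stub, not the real service
            database_manager=None,  # without a DB manager the checks are no-ops
        )
        await scheduler.start()  # no redis_url, so this runs in standalone mode
        print("is_leader:", scheduler.is_leader)  # True in standalone mode
        print(await scheduler.trigger_manual_check())
        await scheduler.stop()

    asyncio.run(_demo())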