Add CI/CD and fix multiple-pod scheduling issues

This commit is contained in:
Urtzi Alfaro
2026-01-18 09:02:27 +01:00
parent 3c4b5c2a06
commit 21d35ea92b
27 changed files with 3779 additions and 73 deletions

View File

@@ -18,6 +18,28 @@ class OrchestratorService(StandardFastAPIService):
expected_migration_version = "001_initial_schema"
def __init__(self):
    """Initialise the orchestrator service and declare its collaborators.

    Messaging, leader election, and the scheduler are only *declared*
    here; they are created later during startup.
    """
    # Tables the standard health check expects to find in the database.
    expected_tables = [
        'orchestration_runs'
    ]

    # Collaborators created lazily during startup / leader election.
    self.rabbitmq_client = None
    self.event_publisher = None
    self.leader_election = None
    self.scheduler_service = None

    super().__init__(
        service_name="orchestrator-service",
        app_name=settings.APP_NAME,
        description=settings.DESCRIPTION,
        version=settings.VERSION,
        api_prefix="",  # RouteBuilder already prefixes routes with /api/v1
        database_manager=database_manager,
        expected_tables=expected_tables,
        enable_messaging=True,  # RabbitMQ is needed for event publishing
    )
async def verify_migrations(self):
"""Verify database schema matches the latest migrations"""
try:
@@ -32,26 +54,6 @@ class OrchestratorService(StandardFastAPIService):
self.logger.error(f"Migration verification failed: {e}")
raise
def __init__(self):
    """Initialise the orchestrator service and its messaging placeholders."""
    # Tables the standard health check expects to find in the database.
    expected_tables = [
        'orchestration_runs'
    ]

    # Messaging collaborators are wired up later, during startup.
    self.rabbitmq_client = None
    self.event_publisher = None

    super().__init__(
        service_name="orchestrator-service",
        app_name=settings.APP_NAME,
        description=settings.DESCRIPTION,
        version=settings.VERSION,
        api_prefix="",  # RouteBuilder already prefixes routes with /api/v1
        database_manager=database_manager,
        expected_tables=expected_tables,
        enable_messaging=True,  # RabbitMQ is needed for event publishing
    )
async def _setup_messaging(self):
"""Setup messaging for orchestrator service"""
from shared.messaging import UnifiedEventPublisher, RabbitMQClient
@@ -84,22 +86,91 @@ class OrchestratorService(StandardFastAPIService):
self.logger.info("Orchestrator Service starting up...")
# Initialize orchestrator scheduler service with EventPublisher
from app.services.orchestrator_service import OrchestratorSchedulerService
scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
await scheduler_service.start()
app.state.scheduler_service = scheduler_service
self.logger.info("Orchestrator scheduler service started")
# Initialize leader election for horizontal scaling
# Only the leader pod will run the scheduler
await self._setup_leader_election(app)
# REMOVED: Delivery tracking service - moved to procurement service (domain ownership)
async def _setup_leader_election(self, app: FastAPI):
    """
    Set up Redis-based leader election so that only one pod runs the scheduler.

    CRITICAL FOR HORIZONTAL SCALING:
    Without leader election, each pod would run the same scheduled jobs,
    causing duplicate forecasts, production schedules, and database contention.

    Falls back to starting the scheduler unconditionally (standalone mode)
    if leader election cannot be initialised, so single-pod deployments
    still work without Redis.
    """
    from shared.leader_election import LeaderElectionService
    import redis.asyncio as redis

    redis_client = None
    try:
        # Create Redis connection for leader election.
        redis_url = f"redis://:{settings.REDIS_PASSWORD}@{settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}"
        if settings.REDIS_TLS_ENABLED.lower() == "true":
            # Switch scheme to TLS (rediss://) when enabled.
            redis_url = redis_url.replace("redis://", "rediss://")
        redis_client = redis.from_url(redis_url, decode_responses=False)
        await redis_client.ping()

        # Use shared leader election service.
        self.leader_election = LeaderElectionService(
            redis_client,
            service_name="orchestrator"
        )

        # Callbacks for leader state changes.
        async def on_become_leader():
            # Guard against a duplicate scheduler if leadership is
            # re-acquired without an intervening on_lose_leader callback —
            # two live schedulers would re-introduce the duplicate-job bug
            # this mechanism exists to prevent.
            if self.scheduler_service is not None:
                self.logger.warning("Scheduler already running; skipping duplicate start")
                return
            self.logger.info("This pod became the leader - starting scheduler")
            from app.services.orchestrator_service import OrchestratorSchedulerService
            self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
            await self.scheduler_service.start()
            app.state.scheduler_service = self.scheduler_service
            self.logger.info("Orchestrator scheduler service started (leader only)")

        async def on_lose_leader():
            self.logger.warning("This pod lost leadership - stopping scheduler")
            if self.scheduler_service:
                await self.scheduler_service.stop()
                self.scheduler_service = None
            if hasattr(app.state, 'scheduler_service'):
                app.state.scheduler_service = None
            self.logger.info("Orchestrator scheduler service stopped (no longer leader)")

        # Start leader election.
        await self.leader_election.start(
            on_become_leader=on_become_leader,
            on_lose_leader=on_lose_leader
        )

        # Store leader election in app state for health checks.
        app.state.leader_election = self.leader_election
        self.logger.info("Leader election initialized",
                         is_leader=self.leader_election.is_leader,
                         instance_id=self.leader_election.instance_id)
    except Exception as e:
        self.logger.error("Failed to setup leader election, falling back to standalone mode",
                          error=str(e))
        # Close the Redis connection opened above so it is not leaked
        # when a later initialisation step failed.
        if redis_client is not None:
            try:
                await redis_client.close()
            except Exception:
                pass  # best-effort cleanup; we are already in a failure path
        # Fallback: start scheduler anyway (for single-pod deployments).
        from app.services.orchestrator_service import OrchestratorSchedulerService
        self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
        await self.scheduler_service.start()
        app.state.scheduler_service = self.scheduler_service
        self.logger.warning("Scheduler started in standalone mode (no leader election)")
async def on_shutdown(self, app: FastAPI):
    """
    Custom shutdown logic for orchestrator service.

    Stops leader election first (releasing leadership so another pod can
    take over promptly), then stops the scheduler exactly once.

    Note: ``app.state.scheduler_service`` is set to ``None`` by
    ``on_lose_leader``, so it must be None-checked — the previous code
    only checked ``hasattr`` and crashed with ``AttributeError`` when
    shutting down after a lost leadership. It also stopped the same
    scheduler object twice (once via ``app.state``, once via ``self``).
    """
    self.logger.info("Orchestrator Service shutting down...")

    # Stop leader election first so the leadership lock is released.
    if self.leader_election:
        await self.leader_election.stop()
        self.logger.info("Leader election stopped")

    # Stop the scheduler exactly once; both references normally point to
    # the same object, and either may be None.
    scheduler = self.scheduler_service or getattr(app.state, 'scheduler_service', None)
    if scheduler:
        await scheduler.stop()
        self.logger.info("Orchestrator scheduler service stopped")
    self.scheduler_service = None
    if hasattr(app.state, 'scheduler_service'):
        app.state.scheduler_service = None