Add CI/CD and fix multiple-pod scheduling issues

This commit is contained in:
Urtzi Alfaro
2026-01-18 09:02:27 +01:00
parent 3c4b5c2a06
commit 21d35ea92b
27 changed files with 3779 additions and 73 deletions

View File

@@ -18,6 +18,28 @@ class OrchestratorService(StandardFastAPIService):
expected_migration_version = "001_initial_schema"
def __init__(self):
    """Initialise the orchestrator service and declare its collaborators.

    Messaging, leader election, and the scheduler are only *declared*
    here; they are created later during startup.
    """
    # Tables the standard health check expects to find in the database.
    expected_tables = [
        'orchestration_runs'
    ]

    # Collaborators created lazily during startup / leader election.
    self.rabbitmq_client = None
    self.event_publisher = None
    self.leader_election = None
    self.scheduler_service = None

    super().__init__(
        service_name="orchestrator-service",
        app_name=settings.APP_NAME,
        description=settings.DESCRIPTION,
        version=settings.VERSION,
        api_prefix="",  # RouteBuilder already prefixes routes with /api/v1
        database_manager=database_manager,
        expected_tables=expected_tables,
        enable_messaging=True,  # RabbitMQ is needed for event publishing
    )
async def verify_migrations(self):
"""Verify database schema matches the latest migrations"""
try:
@@ -32,26 +54,6 @@ class OrchestratorService(StandardFastAPIService):
self.logger.error(f"Migration verification failed: {e}")
raise
def __init__(self):
    """Initialise the orchestrator service and its messaging placeholders."""
    # Tables the standard health check expects to find in the database.
    expected_tables = [
        'orchestration_runs'
    ]

    # Messaging collaborators are wired up later, during startup.
    self.rabbitmq_client = None
    self.event_publisher = None

    super().__init__(
        service_name="orchestrator-service",
        app_name=settings.APP_NAME,
        description=settings.DESCRIPTION,
        version=settings.VERSION,
        api_prefix="",  # RouteBuilder already prefixes routes with /api/v1
        database_manager=database_manager,
        expected_tables=expected_tables,
        enable_messaging=True,  # RabbitMQ is needed for event publishing
    )
async def _setup_messaging(self):
"""Setup messaging for orchestrator service"""
from shared.messaging import UnifiedEventPublisher, RabbitMQClient
@@ -84,22 +86,91 @@ class OrchestratorService(StandardFastAPIService):
self.logger.info("Orchestrator Service starting up...")
# Initialize orchestrator scheduler service with EventPublisher
from app.services.orchestrator_service import OrchestratorSchedulerService
scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
await scheduler_service.start()
app.state.scheduler_service = scheduler_service
self.logger.info("Orchestrator scheduler service started")
# Initialize leader election for horizontal scaling
# Only the leader pod will run the scheduler
await self._setup_leader_election(app)
# REMOVED: Delivery tracking service - moved to procurement service (domain ownership)
async def _setup_leader_election(self, app: FastAPI):
    """
    Set up Redis-based leader election so that only one pod runs the scheduler.

    CRITICAL FOR HORIZONTAL SCALING:
    Without leader election, each pod would run the same scheduled jobs,
    causing duplicate forecasts, production schedules, and database contention.

    Falls back to starting the scheduler unconditionally (standalone mode)
    if leader election cannot be initialised, so single-pod deployments
    still work without Redis.
    """
    from shared.leader_election import LeaderElectionService
    import redis.asyncio as redis

    redis_client = None
    try:
        # Create Redis connection for leader election.
        redis_url = f"redis://:{settings.REDIS_PASSWORD}@{settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}"
        if settings.REDIS_TLS_ENABLED.lower() == "true":
            # Switch scheme to TLS (rediss://) when enabled.
            redis_url = redis_url.replace("redis://", "rediss://")
        redis_client = redis.from_url(redis_url, decode_responses=False)
        await redis_client.ping()

        # Use shared leader election service.
        self.leader_election = LeaderElectionService(
            redis_client,
            service_name="orchestrator"
        )

        # Callbacks for leader state changes.
        async def on_become_leader():
            # Guard against a duplicate scheduler if leadership is
            # re-acquired without an intervening on_lose_leader callback —
            # two live schedulers would re-introduce the duplicate-job bug
            # this mechanism exists to prevent.
            if self.scheduler_service is not None:
                self.logger.warning("Scheduler already running; skipping duplicate start")
                return
            self.logger.info("This pod became the leader - starting scheduler")
            from app.services.orchestrator_service import OrchestratorSchedulerService
            self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
            await self.scheduler_service.start()
            app.state.scheduler_service = self.scheduler_service
            self.logger.info("Orchestrator scheduler service started (leader only)")

        async def on_lose_leader():
            self.logger.warning("This pod lost leadership - stopping scheduler")
            if self.scheduler_service:
                await self.scheduler_service.stop()
                self.scheduler_service = None
            if hasattr(app.state, 'scheduler_service'):
                app.state.scheduler_service = None
            self.logger.info("Orchestrator scheduler service stopped (no longer leader)")

        # Start leader election.
        await self.leader_election.start(
            on_become_leader=on_become_leader,
            on_lose_leader=on_lose_leader
        )

        # Store leader election in app state for health checks.
        app.state.leader_election = self.leader_election
        self.logger.info("Leader election initialized",
                         is_leader=self.leader_election.is_leader,
                         instance_id=self.leader_election.instance_id)
    except Exception as e:
        self.logger.error("Failed to setup leader election, falling back to standalone mode",
                          error=str(e))
        # Close the Redis connection opened above so it is not leaked
        # when a later initialisation step failed.
        if redis_client is not None:
            try:
                await redis_client.close()
            except Exception:
                pass  # best-effort cleanup; we are already in a failure path
        # Fallback: start scheduler anyway (for single-pod deployments).
        from app.services.orchestrator_service import OrchestratorSchedulerService
        self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
        await self.scheduler_service.start()
        app.state.scheduler_service = self.scheduler_service
        self.logger.warning("Scheduler started in standalone mode (no leader election)")
async def on_shutdown(self, app: FastAPI):
    """
    Custom shutdown logic for orchestrator service.

    Stops leader election first (releasing leadership so another pod can
    take over promptly), then stops the scheduler exactly once.

    Note: ``app.state.scheduler_service`` is set to ``None`` by
    ``on_lose_leader``, so it must be None-checked — the previous code
    only checked ``hasattr`` and crashed with ``AttributeError`` when
    shutting down after a lost leadership. It also stopped the same
    scheduler object twice (once via ``app.state``, once via ``self``).
    """
    self.logger.info("Orchestrator Service shutting down...")

    # Stop leader election first so the leadership lock is released.
    if self.leader_election:
        await self.leader_election.stop()
        self.logger.info("Leader election stopped")

    # Stop the scheduler exactly once; both references normally point to
    # the same object, and either may be None.
    scheduler = self.scheduler_service or getattr(app.state, 'scheduler_service', None)
    if scheduler:
        await scheduler.stop()
        self.logger.info("Orchestrator scheduler service stopped")
    self.scheduler_service = None
    if hasattr(app.state, 'scheduler_service'):
        app.state.scheduler_service = None