Add CI/CD and fix multiple-pod (horizontal scaling) issues
This commit is contained in:
@@ -18,6 +18,28 @@ class OrchestratorService(StandardFastAPIService):
|
||||
|
||||
expected_migration_version = "001_initial_schema"
|
||||
|
||||
def __init__(self):
|
||||
# Define expected database tables for health checks
|
||||
orchestrator_expected_tables = [
|
||||
'orchestration_runs'
|
||||
]
|
||||
|
||||
self.rabbitmq_client = None
|
||||
self.event_publisher = None
|
||||
self.leader_election = None
|
||||
self.scheduler_service = None
|
||||
|
||||
super().__init__(
|
||||
service_name="orchestrator-service",
|
||||
app_name=settings.APP_NAME,
|
||||
description=settings.DESCRIPTION,
|
||||
version=settings.VERSION,
|
||||
api_prefix="", # Empty because RouteBuilder already includes /api/v1
|
||||
database_manager=database_manager,
|
||||
expected_tables=orchestrator_expected_tables,
|
||||
enable_messaging=True # Enable RabbitMQ for event publishing
|
||||
)
|
||||
|
||||
async def verify_migrations(self):
|
||||
"""Verify database schema matches the latest migrations"""
|
||||
try:
|
||||
@@ -32,26 +54,6 @@ class OrchestratorService(StandardFastAPIService):
|
||||
self.logger.error(f"Migration verification failed: {e}")
|
||||
raise
|
||||
|
||||
def __init__(self):
|
||||
# Define expected database tables for health checks
|
||||
orchestrator_expected_tables = [
|
||||
'orchestration_runs'
|
||||
]
|
||||
|
||||
self.rabbitmq_client = None
|
||||
self.event_publisher = None
|
||||
|
||||
super().__init__(
|
||||
service_name="orchestrator-service",
|
||||
app_name=settings.APP_NAME,
|
||||
description=settings.DESCRIPTION,
|
||||
version=settings.VERSION,
|
||||
api_prefix="", # Empty because RouteBuilder already includes /api/v1
|
||||
database_manager=database_manager,
|
||||
expected_tables=orchestrator_expected_tables,
|
||||
enable_messaging=True # Enable RabbitMQ for event publishing
|
||||
)
|
||||
|
||||
async def _setup_messaging(self):
|
||||
"""Setup messaging for orchestrator service"""
|
||||
from shared.messaging import UnifiedEventPublisher, RabbitMQClient
|
||||
@@ -84,22 +86,91 @@ class OrchestratorService(StandardFastAPIService):
|
||||
|
||||
self.logger.info("Orchestrator Service starting up...")
|
||||
|
||||
# Initialize orchestrator scheduler service with EventPublisher
|
||||
from app.services.orchestrator_service import OrchestratorSchedulerService
|
||||
scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
|
||||
await scheduler_service.start()
|
||||
app.state.scheduler_service = scheduler_service
|
||||
self.logger.info("Orchestrator scheduler service started")
|
||||
# Initialize leader election for horizontal scaling
|
||||
# Only the leader pod will run the scheduler
|
||||
await self._setup_leader_election(app)
|
||||
|
||||
# REMOVED: Delivery tracking service - moved to procurement service (domain ownership)
|
||||
|
||||
async def _setup_leader_election(self, app: FastAPI):
|
||||
"""
|
||||
Setup leader election for scheduler.
|
||||
|
||||
CRITICAL FOR HORIZONTAL SCALING:
|
||||
Without leader election, each pod would run the same scheduled jobs,
|
||||
causing duplicate forecasts, production schedules, and database contention.
|
||||
"""
|
||||
from shared.leader_election import LeaderElectionService
|
||||
import redis.asyncio as redis
|
||||
|
||||
try:
|
||||
# Create Redis connection for leader election
|
||||
redis_url = f"redis://:{settings.REDIS_PASSWORD}@{settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}"
|
||||
if settings.REDIS_TLS_ENABLED.lower() == "true":
|
||||
redis_url = redis_url.replace("redis://", "rediss://")
|
||||
|
||||
redis_client = redis.from_url(redis_url, decode_responses=False)
|
||||
await redis_client.ping()
|
||||
|
||||
# Use shared leader election service
|
||||
self.leader_election = LeaderElectionService(
|
||||
redis_client,
|
||||
service_name="orchestrator"
|
||||
)
|
||||
|
||||
# Define callbacks for leader state changes
|
||||
async def on_become_leader():
|
||||
self.logger.info("This pod became the leader - starting scheduler")
|
||||
from app.services.orchestrator_service import OrchestratorSchedulerService
|
||||
self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
|
||||
await self.scheduler_service.start()
|
||||
app.state.scheduler_service = self.scheduler_service
|
||||
self.logger.info("Orchestrator scheduler service started (leader only)")
|
||||
|
||||
async def on_lose_leader():
|
||||
self.logger.warning("This pod lost leadership - stopping scheduler")
|
||||
if self.scheduler_service:
|
||||
await self.scheduler_service.stop()
|
||||
self.scheduler_service = None
|
||||
if hasattr(app.state, 'scheduler_service'):
|
||||
app.state.scheduler_service = None
|
||||
self.logger.info("Orchestrator scheduler service stopped (no longer leader)")
|
||||
|
||||
# Start leader election
|
||||
await self.leader_election.start(
|
||||
on_become_leader=on_become_leader,
|
||||
on_lose_leader=on_lose_leader
|
||||
)
|
||||
|
||||
# Store leader election in app state for health checks
|
||||
app.state.leader_election = self.leader_election
|
||||
|
||||
self.logger.info("Leader election initialized",
|
||||
is_leader=self.leader_election.is_leader,
|
||||
instance_id=self.leader_election.instance_id)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error("Failed to setup leader election, falling back to standalone mode",
|
||||
error=str(e))
|
||||
# Fallback: start scheduler anyway (for single-pod deployments)
|
||||
from app.services.orchestrator_service import OrchestratorSchedulerService
|
||||
self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
|
||||
await self.scheduler_service.start()
|
||||
app.state.scheduler_service = self.scheduler_service
|
||||
self.logger.warning("Scheduler started in standalone mode (no leader election)")
|
||||
|
||||
async def on_shutdown(self, app: FastAPI):
|
||||
"""Custom shutdown logic for orchestrator service"""
|
||||
self.logger.info("Orchestrator Service shutting down...")
|
||||
|
||||
# Stop scheduler service
|
||||
if hasattr(app.state, 'scheduler_service'):
|
||||
await app.state.scheduler_service.stop()
|
||||
# Stop leader election (this will also stop scheduler if we're the leader)
|
||||
if self.leader_election:
|
||||
await self.leader_election.stop()
|
||||
self.logger.info("Leader election stopped")
|
||||
|
||||
# Stop scheduler service if still running
|
||||
if self.scheduler_service:
|
||||
await self.scheduler_service.stop()
|
||||
self.logger.info("Orchestrator scheduler service stopped")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user