Add ci/cd and fix multiple pods issues

This commit is contained in:
Urtzi Alfaro
2026-01-18 09:02:27 +01:00
parent 3c4b5c2a06
commit 21d35ea92b
27 changed files with 3779 additions and 73 deletions

View File

@@ -0,0 +1,33 @@
"""
Shared Leader Election for Bakery-IA platform
Provides Redis-based leader election for services that need to run
singleton scheduled tasks (APScheduler, background jobs, etc.)
Usage:
from shared.leader_election import LeaderElectionService, SchedulerLeaderMixin
# Option 1: Direct usage
leader_election = LeaderElectionService(redis_client, "my-service")
await leader_election.start(
on_become_leader=start_scheduler,
on_lose_leader=stop_scheduler
)
# Option 2: Mixin for services with APScheduler
class MySchedulerService(SchedulerLeaderMixin):
async def _create_scheduler_jobs(self):
self.scheduler.add_job(...)
"""
from shared.leader_election.service import (
LeaderElectionService,
LeaderElectionConfig,
)
from shared.leader_election.mixin import SchedulerLeaderMixin
# Public API of the package: the names imported above, re-exported for `from shared.leader_election import ...`.
__all__ = [
"LeaderElectionService",
"LeaderElectionConfig",
"SchedulerLeaderMixin",
]

View File

@@ -0,0 +1,209 @@
"""
Scheduler Leader Mixin
Provides a mixin class for services that use APScheduler and need
leader election for horizontal scaling.
Usage:
class MySchedulerService(SchedulerLeaderMixin):
def __init__(self, redis_url: str, service_name: str):
super().__init__(redis_url, service_name)
# Your initialization here
async def _create_scheduler_jobs(self):
'''Override to define your scheduled jobs'''
self.scheduler.add_job(
self.my_job,
trigger=CronTrigger(hour=0),
id='my_job'
)
async def my_job(self):
# Your job logic here
pass
"""
import asyncio
from typing import Optional
from abc import abstractmethod
import structlog
logger = structlog.get_logger()
class SchedulerLeaderMixin:
    """
    Mixin for services that use APScheduler with leader election.

    The mixin owns three resources: an async Redis client, a
    LeaderElectionService built on that client, and an AsyncIOScheduler.
    The scheduler is started when this instance becomes leader and shut
    down when leadership is lost, so only the leader pod runs jobs.

    If leader election cannot be set up at all, the mixin falls back to
    "standalone" mode and starts the scheduler unconditionally.
    """

    def __init__(self, redis_url: str, service_name: str, **kwargs):
        """
        Initialize the scheduler with leader election.

        Args:
            redis_url: Redis connection URL for leader election
            service_name: Unique service name for leader election lock
            **kwargs: Additional arguments passed to parent class
        """
        super().__init__(**kwargs)
        self._redis_url = redis_url
        self._service_name = service_name
        self._leader_election = None
        self._redis_client = None
        self.scheduler = None
        self._scheduler_started = False

    async def start_with_leader_election(self):
        """
        Start the service with leader election.

        Only the leader will start the scheduler. If anything in the setup
        fails (Redis unreachable, election start error), the partially
        created election state is torn down and the scheduler is started in
        standalone mode instead.

        NOTE: the standalone fallback is only safe for single-pod
        deployments; if several replicas fall back at the same time each of
        them will run the scheduled jobs.
        """
        from apscheduler.schedulers.asyncio import AsyncIOScheduler
        from shared.leader_election.service import LeaderElectionService
        import redis.asyncio as redis

        try:
            # Create Redis connection and verify it before relying on it
            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
            await self._redis_client.ping()

            # Create scheduler (but don't start it yet)
            self.scheduler = AsyncIOScheduler()

            # Create leader election
            self._leader_election = LeaderElectionService(
                self._redis_client,
                self._service_name
            )

            # Start leader election with callbacks
            await self._leader_election.start(
                on_become_leader=self._on_become_leader,
                on_lose_leader=self._on_lose_leader
            )

            logger.info("Scheduler service started with leader election",
                        service=self._service_name,
                        is_leader=self._leader_election.is_leader,
                        instance_id=self._leader_election.instance_id)
        except Exception as e:
            logger.error("Failed to start with leader election, falling back to standalone",
                         service=self._service_name,
                         error=str(e))
            # Drop whatever was half-initialised so that status reporting
            # reflects standalone mode and no Redis connection is leaked.
            await self._discard_leader_election()
            # Fallback: start scheduler anyway (for single-pod deployments)
            await self._start_scheduler_standalone()

    async def _discard_leader_election(self):
        """Best-effort teardown of election state before the standalone fallback."""
        election, self._leader_election = self._leader_election, None
        if election is not None:
            try:
                # Releases the lock if held; the lose-leader callback stops
                # a scheduler that may already have been started.
                await election.stop()
            except Exception as e:
                logger.warning("Failed to stop leader election during fallback",
                               service=self._service_name,
                               error=str(e))
        try:
            await self._close_redis()
        except Exception as e:
            logger.warning("Failed to close Redis connection during fallback",
                           service=self._service_name,
                           error=str(e))

    async def _close_redis(self):
        """Close the Redis client (if any) and forget it, so closing twice is harmless."""
        client, self._redis_client = self._redis_client, None
        if client is None:
            return
        # Newer redis-py releases deprecate close() in favour of aclose();
        # use aclose() when the client offers it, close() otherwise.
        closer = getattr(client, "aclose", None) or client.close
        await closer()

    async def _on_become_leader(self):
        """Called when this instance becomes the leader"""
        logger.info("Became leader, starting scheduler",
                    service=self._service_name)
        await self._start_scheduler()

    async def _on_lose_leader(self):
        """Called when this instance loses leadership"""
        logger.warning("Lost leadership, stopping scheduler",
                       service=self._service_name)
        await self._stop_scheduler()

    async def _start_scheduler(self):
        """
        Register the subclass' jobs and start the scheduler.

        Errors are logged, not raised, so a failing job definition does not
        break the leader-election callback chain.
        """
        if self._scheduler_started:
            logger.warning("Scheduler already started",
                           service=self._service_name)
            return
        try:
            # Let subclass define jobs
            await self._create_scheduler_jobs()
            # Start scheduler
            if not self.scheduler.running:
                self.scheduler.start()
            self._scheduler_started = True
            logger.info("Scheduler started",
                        service=self._service_name,
                        job_count=len(self.scheduler.get_jobs()))
        except Exception as e:
            logger.error("Failed to start scheduler",
                         service=self._service_name,
                         error=str(e))

    async def _stop_scheduler(self):
        """Shut the scheduler down without waiting for running jobs; no-op if not started."""
        if not self._scheduler_started:
            return
        try:
            if self.scheduler and self.scheduler.running:
                self.scheduler.shutdown(wait=False)
            self._scheduler_started = False
            logger.info("Scheduler stopped",
                        service=self._service_name)
        except Exception as e:
            logger.error("Failed to stop scheduler",
                         service=self._service_name,
                         error=str(e))

    async def _start_scheduler_standalone(self):
        """Start scheduler without leader election (fallback mode)"""
        from apscheduler.schedulers.asyncio import AsyncIOScheduler

        logger.warning("Starting scheduler in standalone mode (no leader election)",
                       service=self._service_name)
        self.scheduler = AsyncIOScheduler()
        await self._create_scheduler_jobs()
        if not self.scheduler.running:
            self.scheduler.start()
        self._scheduler_started = True

    @abstractmethod
    async def _create_scheduler_jobs(self):
        """
        Override to define scheduled jobs.

        This hook runs every time the scheduler is started, i.e. each time
        this instance acquires leadership (and once in standalone mode), so
        job registration should be safe to repeat.

        Example:
            self.scheduler.add_job(
                self.my_task,
                trigger=CronTrigger(hour=0, minute=30),
                id='my_task',
                max_instances=1,
                replace_existing=True
            )
        """
        pass

    async def stop(self):
        """
        Stop the scheduler and leader election.

        The scheduler is stopped and the Redis connection is closed even if
        stopping the leader election raises.
        """
        try:
            # Stop leader election (releases the lock if we hold it)
            if self._leader_election:
                await self._leader_election.stop()
        finally:
            try:
                # Stop scheduler (no-op if the lose-leader callback already did)
                await self._stop_scheduler()
            finally:
                # Close Redis
                await self._close_redis()
        logger.info("Scheduler service stopped",
                    service=self._service_name)

    @property
    def is_leader(self) -> bool:
        """
        Check if this instance is the leader.

        With leader election active this mirrors the election state. In
        standalone mode (no election object) the instance counts as leader
        while its scheduler is running, consistent with get_leader_status().
        """
        if self._leader_election:
            return self._leader_election.is_leader
        return self._scheduler_started

    def get_leader_status(self) -> dict:
        """Get leader election status, or a fixed standalone marker when no election exists."""
        if self._leader_election:
            return self._leader_election.get_status()
        return {"is_leader": True, "mode": "standalone"}

View File

@@ -0,0 +1,352 @@
"""
Leader Election Service
Implements Redis-based leader election to ensure only ONE pod runs
singleton tasks like APScheduler jobs.
This is CRITICAL for horizontal scaling - without leader election,
each pod would run the same scheduled jobs, causing:
- Duplicate operations (forecasts, alerts, syncs)
- Database contention
- Inconsistent state
- Duplicate notifications
Implementation:
- Uses Redis SET NX (set if not exists) for atomic leadership acquisition
- Leader maintains leadership with periodic heartbeats
- If leader fails to heartbeat, another pod can take over
- Non-leader pods check periodically if they should become leader
"""
import asyncio
import os
import socket
from dataclasses import dataclass
from typing import Optional, Callable, Awaitable
import structlog
logger = structlog.get_logger()
@dataclass
class LeaderElectionConfig:
    """
    Configuration for leader election.

    All intervals are in seconds and must be positive. The heartbeat
    interval must be shorter than the lock TTL, otherwise the lock expires
    before the leader gets a chance to refresh it.

    Raises:
        ValueError: If an interval is not positive or the heartbeat
            interval is not smaller than the lock TTL.
    """
    # Redis key prefix for the lock
    lock_key_prefix: str = "leader"
    # Lock expires after this many seconds without refresh
    lock_ttl_seconds: int = 30
    # Refresh lock every N seconds (should be < lock_ttl_seconds / 2)
    heartbeat_interval_seconds: int = 10
    # Non-leaders check for leadership every N seconds
    election_check_interval_seconds: int = 15

    def __post_init__(self):
        """Reject settings under which the lock could never be held reliably."""
        for field_name in (
            "lock_ttl_seconds",
            "heartbeat_interval_seconds",
            "election_check_interval_seconds",
        ):
            value = getattr(self, field_name)
            if value <= 0:
                raise ValueError(f"{field_name} must be positive, got {value!r}")
        if self.heartbeat_interval_seconds >= self.lock_ttl_seconds:
            raise ValueError(
                "heartbeat_interval_seconds "
                f"({self.heartbeat_interval_seconds}) must be smaller than "
                f"lock_ttl_seconds ({self.lock_ttl_seconds})"
            )
class LeaderElectionService:
    """
    Redis-based leader election service.

    Ensures only one pod runs scheduled tasks at a time across all replicas.

    The lock is a single Redis key whose value is this instance's id. It is
    acquired with SET NX EX, kept alive by a heartbeat task, and polled for
    by an election task while another instance holds it. Exactly one of the
    two background tasks is meant to be active at a time; each hands over
    to the other when the leadership state flips.
    """

    # Lua scripts run atomically inside Redis, so the ownership check and
    # the mutation cannot be interleaved with another instance's SET NX.
    # KEYS[1] = lock key, ARGV[1] = instance id, ARGV[2] = TTL in seconds.
    _EXTEND_LOCK_SCRIPT = (
        "if redis.call('get', KEYS[1]) == ARGV[1] then "
        "return redis.call('expire', KEYS[1], ARGV[2]) "
        "else return 0 end"
    )
    # KEYS[1] = lock key, ARGV[1] = instance id.
    _RELEASE_LOCK_SCRIPT = (
        "if redis.call('get', KEYS[1]) == ARGV[1] then "
        "return redis.call('del', KEYS[1]) "
        "else return 0 end"
    )

    def __init__(
        self,
        redis_client,
        service_name: str,
        config: "Optional[LeaderElectionConfig]" = None
    ):
        """
        Initialize leader election service.

        Args:
            redis_client: Async Redis client instance
            service_name: Unique name for this service (used in Redis key)
            config: Optional configuration override
        """
        self.redis = redis_client
        self.service_name = service_name
        self.config = config or LeaderElectionConfig()
        self.lock_key = f"{self.config.lock_key_prefix}:{service_name}:lock"
        self.instance_id = self._generate_instance_id()
        self.is_leader = False
        self._heartbeat_task: Optional[asyncio.Task] = None
        self._election_task: Optional[asyncio.Task] = None
        self._running = False
        self._on_become_leader_callback: Optional[Callable[[], Awaitable[None]]] = None
        self._on_lose_leader_callback: Optional[Callable[[], Awaitable[None]]] = None

    def _generate_instance_id(self) -> str:
        """Build this instance's id from HOSTNAME (or the socket hostname), POD_IP and the PID."""
        hostname = os.environ.get('HOSTNAME', socket.gethostname())
        pod_ip = os.environ.get('POD_IP', 'unknown')
        return f"{hostname}:{pod_ip}:{os.getpid()}"

    async def start(
        self,
        on_become_leader: Optional[Callable[[], Awaitable[None]]] = None,
        on_lose_leader: Optional[Callable[[], Awaitable[None]]] = None
    ):
        """
        Start leader election process.

        Makes one immediate acquisition attempt, then spawns the heartbeat
        task (if leader) or the election task (if not). Calling start()
        while already running is ignored, so background tasks are never
        duplicated.

        Args:
            on_become_leader: Async callback when this instance becomes leader
            on_lose_leader: Async callback when this instance loses leadership
        """
        if self._running:
            logger.warning("Leader election already started",
                           service=self.service_name,
                           instance_id=self.instance_id)
            return

        self._on_become_leader_callback = on_become_leader
        self._on_lose_leader_callback = on_lose_leader
        self._running = True
        logger.info("Starting leader election",
                    service=self.service_name,
                    instance_id=self.instance_id,
                    lock_key=self.lock_key)

        # Try to become leader immediately
        await self._try_become_leader()

        # Start background tasks
        if self.is_leader:
            self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
        else:
            self._election_task = asyncio.create_task(self._election_loop())

    async def stop(self):
        """Stop leader election and release leadership if held"""
        self._running = False

        # Cancel background tasks. Heartbeat first: it may have just spawned
        # an election task, which the second cancel then picks up.
        await self._cancel_task(self._heartbeat_task)
        self._heartbeat_task = None
        await self._cancel_task(self._election_task)
        self._election_task = None

        # Remember the state before releasing; releasing resets is_leader.
        was_leader = self.is_leader

        # Release leadership
        if was_leader:
            await self._release_leadership()

        logger.info("Leader election stopped",
                    service=self.service_name,
                    instance_id=self.instance_id,
                    was_leader=was_leader)

    @staticmethod
    async def _cancel_task(task):
        """Cancel a background task (if any) and wait until it has finished."""
        if not task:
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def _read_lock_owner(self) -> Optional[str]:
        """Return the instance id stored in the lock key, or None if the key is absent/empty."""
        raw_owner = await self.redis.get(self.lock_key)
        if not raw_owner:
            return None
        # Clients created with decode_responses=False hand back bytes.
        return raw_owner.decode() if isinstance(raw_owner, bytes) else raw_owner

    async def _extend_lock(self) -> bool:
        """Atomically reset the lock TTL if, and only if, this instance owns the lock."""
        extended = await self.redis.eval(
            self._EXTEND_LOCK_SCRIPT,
            1,
            self.lock_key,
            self.instance_id,
            self.config.lock_ttl_seconds
        )
        return bool(extended)

    async def _notify_become_leader(self):
        """Run the on_become_leader callback, logging (not raising) its errors."""
        if self._on_become_leader_callback:
            try:
                await self._on_become_leader_callback()
            except Exception as e:
                logger.error("Error in on_become_leader callback",
                             service=self.service_name,
                             error=str(e))

    async def _notify_lose_leader(self):
        """Run the on_lose_leader callback, logging (not raising) its errors."""
        if self._on_lose_leader_callback:
            try:
                await self._on_lose_leader_callback()
            except Exception as e:
                logger.error("Error in on_lose_leader callback",
                             service=self.service_name,
                             error=str(e))

    async def _try_become_leader(self) -> bool:
        """
        Attempt to become the leader.

        Returns:
            True if this instance is now the leader
        """
        try:
            # Try to set the lock with NX (only if not exists) and EX (expiry)
            acquired = await self.redis.set(
                self.lock_key,
                self.instance_id,
                nx=True,  # Only set if not exists
                ex=self.config.lock_ttl_seconds
            )
            if acquired:
                self.is_leader = True
                logger.info("Became leader",
                            service=self.service_name,
                            instance_id=self.instance_id)
                await self._notify_become_leader()
                return True

            # Check if we're already the leader (reconnection scenario)
            current_leader_str = await self._read_lock_owner()
            if current_leader_str is None:
                return False

            if current_leader_str != self.instance_id:
                logger.debug("Another instance is leader",
                             service=self.service_name,
                             current_leader=current_leader_str,
                             this_instance=self.instance_id)
                return False

            # The lock carries our id. Extend it atomically so it cannot
            # lapse right after we claim it; if that fails the lock changed
            # hands in the meantime and we are not the leader.
            if not await self._extend_lock():
                return False

            was_leader = self.is_leader
            self.is_leader = True
            logger.info("Confirmed as existing leader",
                        service=self.service_name,
                        instance_id=self.instance_id)
            # Locally this is a non-leader -> leader transition (e.g. after a
            # transient Redis error made us step down), so the callback must
            # fire, otherwise leader-only work is never restarted.
            if not was_leader:
                await self._notify_become_leader()
            return True
        except Exception as e:
            logger.error("Failed to acquire leadership",
                         service=self.service_name,
                         instance_id=self.instance_id,
                         error=str(e))
            return False

    async def _release_leadership(self):
        """
        Release leadership lock.

        The key is deleted only if it still holds this instance's id. Local
        state is reset and the lose-leader callback fires even when Redis
        is unreachable, so leader-only work is always stopped.
        """
        try:
            released = await self.redis.eval(
                self._RELEASE_LOCK_SCRIPT,
                1,
                self.lock_key,
                self.instance_id
            )
            if released:
                logger.info("Released leadership",
                            service=self.service_name,
                            instance_id=self.instance_id)
        except Exception as e:
            logger.error("Failed to release leadership",
                         service=self.service_name,
                         instance_id=self.instance_id,
                         error=str(e))

        was_leader = self.is_leader
        self.is_leader = False
        # Call callback only if we were the leader
        if was_leader:
            await self._notify_lose_leader()

    async def _refresh_leadership(self) -> bool:
        """
        Refresh leadership lock TTL.

        Returns:
            True if leadership was maintained
        """
        try:
            if await self._extend_lock():
                return True

            # Refresh was rejected; look at the key only to explain why.
            current_leader_str = await self._read_lock_owner()
            if current_leader_str is not None and current_leader_str != self.instance_id:
                logger.warning("Lost leadership (lock held by another instance)",
                               service=self.service_name,
                               instance_id=self.instance_id,
                               current_leader=current_leader_str)
            else:
                logger.warning("Lost leadership (lock expired)",
                               service=self.service_name,
                               instance_id=self.instance_id)
            return False
        except Exception as e:
            logger.error("Failed to refresh leadership",
                         service=self.service_name,
                         instance_id=self.instance_id,
                         error=str(e))
            return False

    async def _heartbeat_loop(self):
        """Background loop to maintain leadership"""
        while self._running and self.is_leader:
            try:
                await asyncio.sleep(self.config.heartbeat_interval_seconds)
                if not self._running:
                    break
                maintained = await self._refresh_leadership()
                if not maintained:
                    self.is_leader = False
                    await self._notify_lose_leader()
                    # Switch to election loop
                    self._election_task = asyncio.create_task(self._election_loop())
                    break
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("Error in heartbeat loop",
                             service=self.service_name,
                             instance_id=self.instance_id,
                             error=str(e))

    async def _election_loop(self):
        """Background loop to attempt leadership acquisition"""
        while self._running and not self.is_leader:
            try:
                await asyncio.sleep(self.config.election_check_interval_seconds)
                if not self._running:
                    break
                acquired = await self._try_become_leader()
                if acquired:
                    # Switch to heartbeat loop
                    self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
                    break
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("Error in election loop",
                             service=self.service_name,
                             instance_id=self.instance_id,
                             error=str(e))

    def get_status(self) -> dict:
        """Get current leader election status"""
        return {
            "service": self.service_name,
            "instance_id": self.instance_id,
            "is_leader": self.is_leader,
            "running": self._running,
            "lock_key": self.lock_key,
            "config": {
                "lock_ttl_seconds": self.config.lock_ttl_seconds,
                "heartbeat_interval_seconds": self.config.heartbeat_interval_seconds,
                "election_check_interval_seconds": self.config.election_check_interval_seconds
            }
        }

    async def get_current_leader(self) -> Optional[str]:
        """Get the current leader instance ID (if any); None on absence or Redis error."""
        try:
            return await self._read_lock_owner()
        except Exception as e:
            logger.error("Failed to get current leader",
                         service=self.service_name,
                         error=str(e))
            return None