Add CI/CD and fix issues with running multiple pods
33
shared/leader_election/__init__.py
Normal file
@@ -0,0 +1,33 @@
"""
Shared Leader Election for Bakery-IA platform

Provides Redis-based leader election for services that need to run
singleton scheduled tasks (APScheduler, background jobs, etc.)

Usage:
    from shared.leader_election import LeaderElectionService, SchedulerLeaderMixin

    # Option 1: Direct usage
    leader_election = LeaderElectionService(redis_client, "my-service")
    await leader_election.start(
        on_become_leader=start_scheduler,
        on_lose_leader=stop_scheduler
    )

    # Option 2: Mixin for services with APScheduler
    class MySchedulerService(SchedulerLeaderMixin):
        async def _create_scheduler_jobs(self):
            self.scheduler.add_job(...)
"""

from shared.leader_election.service import (
    LeaderElectionService,
    LeaderElectionConfig,
)
from shared.leader_election.mixin import SchedulerLeaderMixin

__all__ = [
    "LeaderElectionService",
    "LeaderElectionConfig",
    "SchedulerLeaderMixin",
]
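A minimal end-to-end sketch of the direct-usage path (Option 1 above), assuming a redis.asyncio client; the Redis URL, service name and the start_scheduler / stop_scheduler coroutines are illustrative placeholders, not part of this commit:

    import asyncio
    import redis.asyncio as redis

    from shared.leader_election import LeaderElectionService

    async def start_scheduler():
        print("became leader: start scheduled jobs here")

    async def stop_scheduler():
        print("lost leadership: stop scheduled jobs here")

    async def main():
        redis_client = redis.from_url("redis://localhost:6379/0")
        leader_election = LeaderElectionService(redis_client, "my-service")
        await leader_election.start(
            on_become_leader=start_scheduler,
            on_lose_leader=stop_scheduler,
        )
        try:
            await asyncio.sleep(3600)  # placeholder; a real service runs its own loop here
        finally:
            await leader_election.stop()
            await redis_client.close()

    asyncio.run(main())

In a real service the sleep is replaced by the application's own run loop; stop() releases the Redis lock so another pod can take over promptly.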
209
shared/leader_election/mixin.py
Normal file
@@ -0,0 +1,209 @@
"""
Scheduler Leader Mixin

Provides a mixin class for services that use APScheduler and need
leader election for horizontal scaling.

Usage:
    class MySchedulerService(SchedulerLeaderMixin):
        def __init__(self, redis_url: str, service_name: str):
            super().__init__(redis_url, service_name)
            # Your initialization here

        async def _create_scheduler_jobs(self):
            '''Override to define your scheduled jobs'''
            self.scheduler.add_job(
                self.my_job,
                trigger=CronTrigger(hour=0),
                id='my_job'
            )

        async def my_job(self):
            # Your job logic here
            pass
"""

import asyncio
from typing import Optional
from abc import abstractmethod
import structlog

logger = structlog.get_logger()


class SchedulerLeaderMixin:
    """
    Mixin for services that use APScheduler with leader election.

    Provides automatic leader election and scheduler management.
    Only the leader pod will run scheduled jobs.
    """

    def __init__(self, redis_url: str, service_name: str, **kwargs):
        """
        Initialize the scheduler with leader election.

        Args:
            redis_url: Redis connection URL for leader election
            service_name: Unique service name for leader election lock
            **kwargs: Additional arguments passed to parent class
        """
        super().__init__(**kwargs)

        self._redis_url = redis_url
        self._service_name = service_name
        self._leader_election = None
        self._redis_client = None
        self.scheduler = None
        self._scheduler_started = False

    async def start_with_leader_election(self):
        """
        Start the service with leader election.

        Only the leader will start the scheduler.
        """
        from apscheduler.schedulers.asyncio import AsyncIOScheduler
        from shared.leader_election.service import LeaderElectionService
        import redis.asyncio as redis

        try:
            # Create Redis connection
            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
            await self._redis_client.ping()

            # Create scheduler (but don't start it yet)
            self.scheduler = AsyncIOScheduler()

            # Create leader election
            self._leader_election = LeaderElectionService(
                self._redis_client,
                self._service_name
            )

            # Start leader election with callbacks
            await self._leader_election.start(
                on_become_leader=self._on_become_leader,
                on_lose_leader=self._on_lose_leader
            )

            logger.info("Scheduler service started with leader election",
                        service=self._service_name,
                        is_leader=self._leader_election.is_leader,
                        instance_id=self._leader_election.instance_id)

        except Exception as e:
            logger.error("Failed to start with leader election, falling back to standalone",
                         service=self._service_name,
                         error=str(e))
            # Fallback: start scheduler anyway (for single-pod deployments)
            await self._start_scheduler_standalone()

    async def _on_become_leader(self):
        """Called when this instance becomes the leader"""
        logger.info("Became leader, starting scheduler",
                    service=self._service_name)
        await self._start_scheduler()

    async def _on_lose_leader(self):
        """Called when this instance loses leadership"""
        logger.warning("Lost leadership, stopping scheduler",
                       service=self._service_name)
        await self._stop_scheduler()

    async def _start_scheduler(self):
        """Start the scheduler with defined jobs"""
        if self._scheduler_started:
            logger.warning("Scheduler already started",
                           service=self._service_name)
            return

        try:
            # Let subclass define jobs
            await self._create_scheduler_jobs()

            # Start scheduler
            if not self.scheduler.running:
                self.scheduler.start()
                self._scheduler_started = True
                logger.info("Scheduler started",
                            service=self._service_name,
                            job_count=len(self.scheduler.get_jobs()))

        except Exception as e:
            logger.error("Failed to start scheduler",
                         service=self._service_name,
                         error=str(e))

    async def _stop_scheduler(self):
        """Stop the scheduler"""
        if not self._scheduler_started:
            return

        try:
            if self.scheduler and self.scheduler.running:
                self.scheduler.shutdown(wait=False)
                self._scheduler_started = False
                logger.info("Scheduler stopped",
                            service=self._service_name)

        except Exception as e:
            logger.error("Failed to stop scheduler",
                         service=self._service_name,
                         error=str(e))

    async def _start_scheduler_standalone(self):
        """Start scheduler without leader election (fallback mode)"""
        from apscheduler.schedulers.asyncio import AsyncIOScheduler

        logger.warning("Starting scheduler in standalone mode (no leader election)",
                       service=self._service_name)

        self.scheduler = AsyncIOScheduler()
        await self._create_scheduler_jobs()

        if not self.scheduler.running:
            self.scheduler.start()
            self._scheduler_started = True

    @abstractmethod
    async def _create_scheduler_jobs(self):
        """
        Override to define scheduled jobs.

        Example:
            self.scheduler.add_job(
                self.my_task,
                trigger=CronTrigger(hour=0, minute=30),
                id='my_task',
                max_instances=1
            )
        """
        pass

    async def stop(self):
        """Stop the scheduler and leader election"""
        # Stop leader election
        if self._leader_election:
            await self._leader_election.stop()

        # Stop scheduler
        await self._stop_scheduler()

        # Close Redis
        if self._redis_client:
            await self._redis_client.close()

        logger.info("Scheduler service stopped",
                    service=self._service_name)

    @property
    def is_leader(self) -> bool:
        """Check if this instance is the leader"""
        return self._leader_election.is_leader if self._leader_election else False

    def get_leader_status(self) -> dict:
        """Get leader election status"""
        if self._leader_election:
            return self._leader_election.get_status()
        return {"is_leader": True, "mode": "standalone"}
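A sketch of the mixin path (Option 2), assuming APScheduler's CronTrigger and an asyncio entrypoint; the class name, job, Redis URL and service name are illustrative, not part of this commit:

    import asyncio
    from apscheduler.triggers.cron import CronTrigger

    from shared.leader_election import SchedulerLeaderMixin

    class ForecastSchedulerService(SchedulerLeaderMixin):
        async def _create_scheduler_jobs(self):
            # Only the elected leader registers and runs this job
            self.scheduler.add_job(
                self.nightly_forecast,
                trigger=CronTrigger(hour=2, minute=0),
                id="nightly_forecast",
                max_instances=1,
            )

        async def nightly_forecast(self):
            print("running nightly forecast on the leader pod")

    async def main():
        service = ForecastSchedulerService(
            redis_url="redis://localhost:6379/0",
            service_name="forecast-scheduler",
        )
        await service.start_with_leader_election()
        try:
            await asyncio.sleep(3600)  # placeholder; real services block on their own server loop
        finally:
            await service.stop()

    asyncio.run(main())

Only the pod that currently holds the Redis lock runs nightly_forecast; the other replicas keep polling for leadership and take over if the leader stops heartbeating.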
352
shared/leader_election/service.py
Normal file
@@ -0,0 +1,352 @@
"""
Leader Election Service

Implements Redis-based leader election to ensure only ONE pod runs
singleton tasks like APScheduler jobs.

This is CRITICAL for horizontal scaling - without leader election,
each pod would run the same scheduled jobs, causing:
- Duplicate operations (forecasts, alerts, syncs)
- Database contention
- Inconsistent state
- Duplicate notifications

Implementation:
- Uses Redis SET NX (set if not exists) for atomic leadership acquisition
- Leader maintains leadership with periodic heartbeats
- If leader fails to heartbeat, another pod can take over
- Non-leader pods check periodically if they should become leader
"""

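# Illustrative sketch of the locking primitive used below (key and instance id
# values are examples only, not part of this module's API):
#
#     acquired = await redis_client.set(
#         "leader:forecast-scheduler:lock",   # lock_key
#         "pod-abc:10.0.0.7:1",               # this pod's instance_id
#         nx=True,                            # succeeds only if no leader exists
#         ex=30,                              # TTL: lock expires if the leader dies
#     )
#     if acquired:
#         await redis_client.expire("leader:forecast-scheduler:lock", 30)  # heartbeat refresh
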
import asyncio
import os
import socket
from dataclasses import dataclass
from typing import Optional, Callable, Awaitable
import structlog

logger = structlog.get_logger()


@dataclass
class LeaderElectionConfig:
    """Configuration for leader election"""
    # Redis key prefix for the lock
    lock_key_prefix: str = "leader"
    # Lock expires after this many seconds without refresh
    lock_ttl_seconds: int = 30
    # Refresh lock every N seconds (should be < lock_ttl_seconds / 2)
    heartbeat_interval_seconds: int = 10
    # Non-leaders check for leadership every N seconds
    election_check_interval_seconds: int = 15

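# Example of overriding the defaults (values are illustrative); keep
# heartbeat_interval_seconds below lock_ttl_seconds / 2 so the leader refreshes
# the lock well before it can expire:
#
#     config = LeaderElectionConfig(
#         lock_ttl_seconds=15,
#         heartbeat_interval_seconds=5,
#         election_check_interval_seconds=7,
#     )
#     election = LeaderElectionService(redis_client, "my-service", config=config)
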
class LeaderElectionService:
    """
    Redis-based leader election service.

    Ensures only one pod runs scheduled tasks at a time across all replicas.
    """

    def __init__(
        self,
        redis_client,
        service_name: str,
        config: Optional[LeaderElectionConfig] = None
    ):
        """
        Initialize leader election service.

        Args:
            redis_client: Async Redis client instance
            service_name: Unique name for this service (used in Redis key)
            config: Optional configuration override
        """
        self.redis = redis_client
        self.service_name = service_name
        self.config = config or LeaderElectionConfig()
        self.lock_key = f"{self.config.lock_key_prefix}:{service_name}:lock"
        self.instance_id = self._generate_instance_id()
        self.is_leader = False
        self._heartbeat_task: Optional[asyncio.Task] = None
        self._election_task: Optional[asyncio.Task] = None
        self._running = False
        self._on_become_leader_callback: Optional[Callable[[], Awaitable[None]]] = None
        self._on_lose_leader_callback: Optional[Callable[[], Awaitable[None]]] = None

    def _generate_instance_id(self) -> str:
        """Generate unique instance identifier for this pod"""
        hostname = os.environ.get('HOSTNAME', socket.gethostname())
        pod_ip = os.environ.get('POD_IP', 'unknown')
        return f"{hostname}:{pod_ip}:{os.getpid()}"

    async def start(
        self,
        on_become_leader: Optional[Callable[[], Awaitable[None]]] = None,
        on_lose_leader: Optional[Callable[[], Awaitable[None]]] = None
    ):
        """
        Start leader election process.

        Args:
            on_become_leader: Async callback when this instance becomes leader
            on_lose_leader: Async callback when this instance loses leadership
        """
        self._on_become_leader_callback = on_become_leader
        self._on_lose_leader_callback = on_lose_leader
        self._running = True

        logger.info("Starting leader election",
                    service=self.service_name,
                    instance_id=self.instance_id,
                    lock_key=self.lock_key)

        # Try to become leader immediately
        await self._try_become_leader()

        # Start background tasks
        if self.is_leader:
            self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
        else:
            self._election_task = asyncio.create_task(self._election_loop())
    async def stop(self):
        """Stop leader election and release leadership if held"""
        self._running = False

        # Cancel background tasks
        if self._heartbeat_task:
            self._heartbeat_task.cancel()
            try:
                await self._heartbeat_task
            except asyncio.CancelledError:
                pass
            self._heartbeat_task = None

        if self._election_task:
            self._election_task.cancel()
            try:
                await self._election_task
            except asyncio.CancelledError:
                pass
            self._election_task = None

        # Release leadership (capture the flag first: _release_leadership resets it)
        was_leader = self.is_leader
        if was_leader:
            await self._release_leadership()

        logger.info("Leader election stopped",
                    service=self.service_name,
                    instance_id=self.instance_id,
                    was_leader=was_leader)
    async def _try_become_leader(self) -> bool:
        """
        Attempt to become the leader.

        Returns:
            True if this instance is now the leader
        """
        try:
            # Try to set the lock with NX (only if not exists) and EX (expiry)
            acquired = await self.redis.set(
                self.lock_key,
                self.instance_id,
                nx=True,  # Only set if not exists
                ex=self.config.lock_ttl_seconds
            )

            if acquired:
                self.is_leader = True
                logger.info("Became leader",
                            service=self.service_name,
                            instance_id=self.instance_id)

                # Call callback
                if self._on_become_leader_callback:
                    try:
                        await self._on_become_leader_callback()
                    except Exception as e:
                        logger.error("Error in on_become_leader callback",
                                     service=self.service_name,
                                     error=str(e))

                return True

            # Check if we're already the leader (reconnection scenario)
            current_leader = await self.redis.get(self.lock_key)
            if current_leader:
                current_leader_str = current_leader.decode() if isinstance(current_leader, bytes) else current_leader
                if current_leader_str == self.instance_id:
                    self.is_leader = True
                    logger.info("Confirmed as existing leader",
                                service=self.service_name,
                                instance_id=self.instance_id)
                    return True
                else:
                    logger.debug("Another instance is leader",
                                 service=self.service_name,
                                 current_leader=current_leader_str,
                                 this_instance=self.instance_id)

            return False

        except Exception as e:
            logger.error("Failed to acquire leadership",
                         service=self.service_name,
                         instance_id=self.instance_id,
                         error=str(e))
            return False

    async def _release_leadership(self):
        """Release leadership lock"""
        try:
            # Only delete if we're the current leader
            current_leader = await self.redis.get(self.lock_key)
            if current_leader:
                current_leader_str = current_leader.decode() if isinstance(current_leader, bytes) else current_leader
                if current_leader_str == self.instance_id:
                    await self.redis.delete(self.lock_key)
                    logger.info("Released leadership",
                                service=self.service_name,
                                instance_id=self.instance_id)

            was_leader = self.is_leader
            self.is_leader = False

            # Call callback only if we were the leader
            if was_leader and self._on_lose_leader_callback:
                try:
                    await self._on_lose_leader_callback()
                except Exception as e:
                    logger.error("Error in on_lose_leader callback",
                                 service=self.service_name,
                                 error=str(e))

        except Exception as e:
            logger.error("Failed to release leadership",
                         service=self.service_name,
                         instance_id=self.instance_id,
                         error=str(e))

    async def _refresh_leadership(self) -> bool:
        """
        Refresh leadership lock TTL.

        Returns:
            True if leadership was maintained
        """
        try:
            # Verify we're still the leader
            current_leader = await self.redis.get(self.lock_key)
            if not current_leader:
                logger.warning("Lost leadership (lock expired)",
                               service=self.service_name,
                               instance_id=self.instance_id)
                return False

            current_leader_str = current_leader.decode() if isinstance(current_leader, bytes) else current_leader
            if current_leader_str != self.instance_id:
                logger.warning("Lost leadership (lock held by another instance)",
                               service=self.service_name,
                               instance_id=self.instance_id,
                               current_leader=current_leader_str)
                return False

            # Refresh the TTL
            await self.redis.expire(self.lock_key, self.config.lock_ttl_seconds)
            return True

        except Exception as e:
            logger.error("Failed to refresh leadership",
                         service=self.service_name,
                         instance_id=self.instance_id,
                         error=str(e))
            return False

    async def _heartbeat_loop(self):
        """Background loop to maintain leadership"""
        while self._running and self.is_leader:
            try:
                await asyncio.sleep(self.config.heartbeat_interval_seconds)

                if not self._running:
                    break

                maintained = await self._refresh_leadership()

                if not maintained:
                    self.is_leader = False

                    # Call callback
                    if self._on_lose_leader_callback:
                        try:
                            await self._on_lose_leader_callback()
                        except Exception as e:
                            logger.error("Error in on_lose_leader callback",
                                         service=self.service_name,
                                         error=str(e))

                    # Switch to election loop
                    self._election_task = asyncio.create_task(self._election_loop())
                    break

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("Error in heartbeat loop",
                             service=self.service_name,
                             instance_id=self.instance_id,
                             error=str(e))

    async def _election_loop(self):
        """Background loop to attempt leadership acquisition"""
        while self._running and not self.is_leader:
            try:
                await asyncio.sleep(self.config.election_check_interval_seconds)

                if not self._running:
                    break

                acquired = await self._try_become_leader()

                if acquired:
                    # Switch to heartbeat loop
                    self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
                    break

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("Error in election loop",
                             service=self.service_name,
                             instance_id=self.instance_id,
                             error=str(e))

    def get_status(self) -> dict:
        """Get current leader election status"""
        return {
            "service": self.service_name,
            "instance_id": self.instance_id,
            "is_leader": self.is_leader,
            "running": self._running,
            "lock_key": self.lock_key,
            "config": {
                "lock_ttl_seconds": self.config.lock_ttl_seconds,
                "heartbeat_interval_seconds": self.config.heartbeat_interval_seconds,
                "election_check_interval_seconds": self.config.election_check_interval_seconds
            }
        }

    async def get_current_leader(self) -> Optional[str]:
        """Get the current leader instance ID (if any)"""
        try:
            current_leader = await self.redis.get(self.lock_key)
            if current_leader:
                return current_leader.decode() if isinstance(current_leader, bytes) else current_leader
            return None
        except Exception as e:
            logger.error("Failed to get current leader",
                         service=self.service_name,
                         error=str(e))
            return None
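As a usage note, get_status() and get_current_leader() are convenient to expose from a health endpoint so operators can see which pod currently holds the lock. A sketch assuming FastAPI; the framework, route and instance wiring are assumptions, not part of this commit:

    from fastapi import FastAPI
    import redis.asyncio as redis

    from shared.leader_election import LeaderElectionService

    app = FastAPI()
    redis_client = redis.from_url("redis://localhost:6379/0")
    leader_election = LeaderElectionService(redis_client, "my-service")

    @app.on_event("startup")
    async def startup():
        await leader_election.start()

    @app.on_event("shutdown")
    async def shutdown():
        await leader_election.stop()
        await redis_client.close()

    @app.get("/health/leader")
    async def leader_status():
        return {
            "leader_election": leader_election.get_status(),
            "current_leader": await leader_election.get_current_leader(),
        }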