Add new infra architecture

This commit is contained in:
Urtzi Alfaro
2026-01-19 11:55:17 +01:00
parent 21d35ea92b
commit 35f164f0cd
311 changed files with 13241 additions and 3700 deletions

View File

@@ -2,6 +2,8 @@
Production Scheduler Service
Background task that periodically checks for production alert conditions
and triggers appropriate alerts.
Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
"""
import asyncio
@@ -21,103 +23,144 @@ from app.services.production_alert_service import ProductionAlertService
logger = structlog.get_logger()
class ProductionScheduler:
"""Production scheduler service that checks for alert conditions"""
def __init__(self, alert_service: ProductionAlertService, database_manager: Any):
class ProductionScheduler:
"""Production scheduler service that checks for alert conditions.
Uses Redis-based leader election to ensure only one pod runs the scheduler.
"""
def __init__(self, alert_service: ProductionAlertService, database_manager: Any, redis_url: str = None):
    """Store collaborators and initialise scheduler, election and cache state.

    Args:
        alert_service: Alert service kept on the instance.
        database_manager: Database manager kept on the instance.
        redis_url: Redis connection URL. ``start()`` only sets up leader
            election when this is truthy; otherwise the scheduler runs in
            standalone mode.
    """
    # Collaborators handed in by the caller.
    self.alert_service = alert_service
    self.database_manager = database_manager
    self.redis_url = redis_url

    # Leader-election state; populated by _setup_leader_election().
    self._leader_election = None
    self._redis_client = None
    self._scheduler_started = False

    # Periodic job configuration.
    self.job_id = 'production_scheduler'
    self.check_interval = 300  # seconds (5 minutes)
    self.scheduler = AsyncIOScheduler()

    # Cache of already-emitted alerts, used to avoid duplicates.
    self._emitted_alerts: set = set()
    self._alert_cache_ttl = 3600  # seconds (1 hour)
    self._last_cache_clear = datetime.utcnow()
async def start(self):
    """Start the production scheduler, using leader election when possible.

    Without a Redis URL the scheduler is started directly (standalone mode).
    With a Redis URL, leader election is set up and the scheduler is started
    by the become-leader callback. If that setup fails, the error is logged
    and the scheduler is started in standalone mode instead.

    Raises:
        Exception: Whatever ``_start_scheduler()`` raises. A standalone start
            failure is no longer reported as a leader-election failure and
            retried.
    """
    if not self.redis_url:
        # No Redis, start scheduler directly (standalone mode)
        logger.warning("No Redis URL provided, starting scheduler in standalone mode")
        await self._start_scheduler()
        return

    # Only the election setup is guarded: the fallback below is specifically
    # the reaction to a leader-election failure.
    try:
        await self._setup_leader_election()
    except Exception as e:
        logger.error("Failed to setup leader election, starting in standalone mode",
                     error=str(e),
                     exc_info=True)
        # NOTE(review): every pod whose election setup fails will run its own
        # scheduler - confirm this fallback is acceptable when scaled out.
        await self._start_scheduler()
async def _setup_leader_election(self):
    """Connect to Redis and start leader election for this scheduler.

    Creates an async Redis client from ``self.redis_url``, checks the
    connection with ``ping()``, builds a ``LeaderElectionService`` named
    ``"production-scheduler"`` and starts it with this instance's
    become-leader / lose-leader callbacks.

    Raises:
        Exception: Whatever the Redis client or the election service raised.
            Before re-raising, the partially initialised election and client
            are torn down and both attributes are reset to ``None``, so a
            caller falling back to standalone mode is not left with a stale
            election object that ``is_leader`` would consult.
    """
    # Function-local imports: only needed when leader election is configured.
    from shared.leader_election import LeaderElectionService
    import redis.asyncio as redis

    self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
    try:
        await self._redis_client.ping()

        self._leader_election = LeaderElectionService(
            self._redis_client,
            service_name="production-scheduler"
        )

        await self._leader_election.start(
            on_become_leader=self._on_become_leader,
            on_lose_leader=self._on_lose_leader
        )
    except Exception:
        # Roll back to the "no election configured" state before propagating.
        election, client = self._leader_election, self._redis_client
        self._leader_election = None
        self._redis_client = None

        if election is not None:
            try:
                await election.stop()
            except Exception as stop_error:
                logger.warning("Error stopping leader election after failed setup",
                               error=str(stop_error))
        try:
            await client.close()
        except Exception as close_error:
            logger.warning("Error closing Redis client after failed setup",
                           error=str(close_error))
        raise

    logger.info("Leader election initialized for production scheduler",
                is_leader=self._leader_election.is_leader)
async def _on_become_leader(self):
    """Leadership-acquired callback: log the transition and start the scheduler."""
    logger.info("Became leader for production scheduler - starting scheduler")
    await self._start_scheduler()
async def _on_lose_leader(self):
    """Leadership-lost callback: log the transition and stop the scheduler."""
    logger.warning("Lost leadership for production scheduler - stopping scheduler")
    await self._stop_scheduler()
async def _start_scheduler(self):
    """Register the periodic alert-check job and start the APScheduler.

    Does nothing (beyond a debug log) when this instance already started the
    scheduler. Safe to call again after ``_stop_scheduler()`` or after a
    previous attempt that failed part-way.
    """
    if self._scheduler_started:
        logger.debug("Production scheduler already started")
        return

    if not self.scheduler.running:
        trigger = IntervalTrigger(seconds=self.check_interval)
        self.scheduler.add_job(
            self._run_scheduler_task,
            trigger=trigger,
            id=self.job_id,
            name="Production Alert Checks",
            max_instances=1,  # Prevent overlapping executions
            # A retry after a failed scheduler.start() (or a restart on
            # re-election) must not trip over a job already registered under
            # the same id.
            replace_existing=True
        )
        self.scheduler.start()

    # Set the flag whenever the scheduler is running at this point so that
    # _stop_scheduler() is able to shut it down later.
    self._scheduler_started = True
    logger.info("Production scheduler started", interval_seconds=self.check_interval)
async def _stop_scheduler(self):
    """Shut down the APScheduler if this instance started it.

    The shutdown does not wait for a running job (``wait=False``). The
    started flag is always cleared, even when the scheduler turned out not
    to be running, so that a later ``_start_scheduler()`` call is not
    skipped as "already started".
    """
    if not self._scheduler_started:
        return

    if self.scheduler.running:
        self.scheduler.shutdown(wait=False)

    self._scheduler_started = False
    logger.info("Production scheduler stopped")
async def stop(self):
    """Stop leader election, the scheduler and the Redis client, in that order.

    Each teardown step runs even if an earlier one raises, so a failing
    leader-election shutdown can no longer leave the scheduler running or
    the Redis client open. An exception raised by a step still propagates
    to the caller after the remaining steps have run.
    """
    try:
        if self._leader_election:
            await self._leader_election.stop()
    finally:
        try:
            await self._stop_scheduler()
        finally:
            if self._redis_client:
                await self._redis_client.close()
@property
def is_leader(self) -> bool:
    """Whether this instance should act as leader.

    Returns ``True`` when no leader election is configured; otherwise
    returns the election service's own ``is_leader`` value.
    """
    election = self._leader_election
    if not election:
        return True
    return election.is_leader
async def _run_scheduler_task(self):
    """Run one round of production alert checks and log the outcome.

    Calls ``check_all_conditions()`` once and logs the number of alerts it
    reports together with the elapsed time. Any exception is logged with
    its traceback and not re-raised.
    """
    # Function-local import; a monotonic clock keeps the measured duration
    # unaffected by wall-clock adjustments.
    import time

    started_at = time.monotonic()
    logger.info("Running scheduled production alert checks")

    try:
        # Run all alert checks
        alerts_generated = await self.check_all_conditions()

        duration = time.monotonic() - started_at
        logger.info(
            "Completed scheduled production alert checks",
            alerts_generated=alerts_generated,
            duration_seconds=round(duration, 2)
        )

    except Exception as e:
        logger.error(
            "Error in production scheduler task",
            error=str(e),
            exc_info=True
        )
async def check_all_conditions(self) -> int:
"""