Add new infra architecture
This commit is contained in:
@@ -2,6 +2,8 @@
|
||||
Production Scheduler Service
|
||||
Background task that periodically checks for production alert conditions
|
||||
and triggers appropriate alerts.
|
||||
|
||||
Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -21,103 +23,144 @@ from app.services.production_alert_service import ProductionAlertService
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
class ProductionScheduler:
|
||||
"""Production scheduler service that checks for alert conditions"""
|
||||
|
||||
def __init__(self, alert_service: ProductionAlertService, database_manager: Any):
|
||||
class ProductionScheduler:
|
||||
"""Production scheduler service that checks for alert conditions.
|
||||
|
||||
Uses Redis-based leader election to ensure only one pod runs the scheduler.
|
||||
"""
|
||||
|
||||
def __init__(self, alert_service: ProductionAlertService, database_manager: Any, redis_url: str = None):
    """Store collaborators and initialise scheduler, election and cache state.

    Args:
        alert_service: Alert service kept on the instance as
            ``alert_service``.
        database_manager: Database manager kept on the instance as
            ``database_manager``.
        redis_url: Redis connection URL used for leader election. When it
            is left as ``None`` the scheduler is started in standalone
            mode (no leader election).
    """
    # Collaborators and connection settings supplied by the caller.
    self.alert_service = alert_service
    self.database_manager = database_manager
    self.redis_url = redis_url

    # APScheduler instance plus the settings of its single periodic job.
    self.scheduler = AsyncIOScheduler()
    self.check_interval = 300  # seconds between runs (5 minutes)
    self.job_id = 'production_scheduler'

    # Leader-election state; filled in when leader election is set up.
    self._leader_election = None
    self._redis_client = None
    self._scheduler_started = False

    # Cache of already-emitted alerts, used to avoid duplicates.
    self._emitted_alerts: set = set()
    self._alert_cache_ttl = 3600  # 1 hour
    self._last_cache_clear = datetime.utcnow()
|
||||
|
||||
async def start(self):
    """Start the production scheduler, with leader election when possible.

    Behaviour:
        * If the APScheduler is already running, log a warning and return.
        * Without a Redis URL, start the scheduler directly (standalone).
        * With a Redis URL, set up leader election; the scheduler is then
          started by the ``_on_become_leader`` callback.
        * If leader election cannot be set up, log the error and fall back
          to standalone mode.

    The periodic job is registered by ``_start_scheduler`` only; this
    method no longer adds it a second time.
    """
    if self.scheduler.running:
        logger.warning("Production scheduler is already running")
        return

    if not self.redis_url:
        # No Redis, start scheduler directly (standalone mode)
        logger.warning("No Redis URL provided, starting scheduler in standalone mode")
        await self._start_scheduler()
        return

    # Only the leader-election setup is guarded: a failure to start the
    # scheduler itself must not be reported as an election failure and
    # blindly retried.
    try:
        await self._setup_leader_election()
    except Exception as e:
        # Deliberate best-effort fallback: run the checks on this instance
        # rather than not running them at all.
        logger.error("Failed to setup leader election, starting in standalone mode",
                     error=str(e))
        await self._start_scheduler()
|
||||
async def _setup_leader_election(self):
    """Connect to Redis and start leader election for this scheduler.

    Creates the Redis client from ``self.redis_url``, checks connectivity
    with a ping, then starts a ``LeaderElectionService`` registered under
    the service name ``production-scheduler`` with the become/lose leader
    callbacks of this instance.

    The APScheduler is intentionally NOT started here: starting it before
    leadership is known would run the checks on every pod, and would make
    ``_start_scheduler`` skip registering the job because the scheduler
    is already running. Starting is left to ``_on_become_leader``.

    Raises:
        Exception: Errors from the Redis client or the leader election
            service are propagated to the caller.
    """
    # Function-scope imports: only needed when leader election is used.
    from shared.leader_election import LeaderElectionService
    import redis.asyncio as redis

    # Stored on the instance before the ping so that ``stop`` can close
    # the client even if the connectivity check fails.
    self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
    await self._redis_client.ping()

    self._leader_election = LeaderElectionService(
        self._redis_client,
        service_name="production-scheduler"
    )

    await self._leader_election.start(
        on_become_leader=self._on_become_leader,
        on_lose_leader=self._on_lose_leader
    )

    logger.info("Leader election initialized for production scheduler",
                is_leader=self._leader_election.is_leader)
|
||||
|
||||
async def _on_become_leader(self):
    """Leader-election callback: this instance has just become the leader.

    Logs the transition and starts the scheduler on this instance.
    """
    logger.info(
        "Became leader for production scheduler - starting scheduler"
    )
    await self._start_scheduler()
|
||||
|
||||
async def _on_lose_leader(self):
    """Leader-election callback: this instance is no longer the leader.

    Logs the transition at warning level and stops the scheduler on this
    instance.
    """
    logger.warning(
        "Lost leadership for production scheduler - stopping scheduler"
    )
    await self._stop_scheduler()
|
||||
|
||||
async def _start_scheduler(self):
    """Register the periodic alert-check job and start the APScheduler.

    Does nothing (beyond a debug log) when this instance has already
    started the scheduler. Otherwise, if the APScheduler is not running,
    the interval job is registered, the scheduler is started and the
    ``_scheduler_started`` flag is set.
    """
    if self._scheduler_started:
        logger.debug("Production scheduler already started")
        return

    if not self.scheduler.running:
        trigger = IntervalTrigger(seconds=self.check_interval)
        self.scheduler.add_job(
            self._run_scheduler_task,
            trigger=trigger,
            id=self.job_id,
            name="Production Alert Checks",
            max_instances=1,  # Prevent overlapping executions
            # The job store may still hold the job registered before an
            # earlier shutdown (leadership lost and later regained).
            # Replacing it avoids a conflicting-id error on restart.
            replace_existing=True,
        )

        self.scheduler.start()
        self._scheduler_started = True
        logger.info("Production scheduler started", interval_seconds=self.check_interval)
|
||||
|
||||
async def _stop_scheduler(self):
    """Shut down the APScheduler if this instance started it.

    Nothing happens unless the ``_scheduler_started`` flag is set and the
    scheduler is actually running. The shutdown does not wait for running
    jobs (``wait=False``); afterwards the flag is cleared.
    """
    should_stop = self._scheduler_started and self.scheduler.running
    if not should_stop:
        return

    self.scheduler.shutdown(wait=False)
    self._scheduler_started = False
    logger.info("Production scheduler stopped")
|
||||
|
||||
async def stop(self):
    """Stop the production scheduler and leader election.

    Order of operations:
        1. Stop the leader election service, if one was created.
        2. Stop the APScheduler through ``_stop_scheduler`` so that the
           ``_scheduler_started`` flag stays consistent.
        3. Close the Redis client, if one was created.

    Each later step still runs when an earlier one raises; the exception
    is propagated to the caller after the cleanup.
    """
    try:
        if self._leader_election:
            await self._leader_election.stop()
    finally:
        try:
            # Always go through the helper: shutting the scheduler down
            # directly here would leave ``_scheduler_started`` set.
            await self._stop_scheduler()
        finally:
            if self._redis_client:
                await self._redis_client.close()
|
||||
|
||||
@property
def is_leader(self) -> bool:
    """Whether this instance currently acts as the leader.

    Returns the ``is_leader`` value of the leader election service when
    one has been set up; without one, the instance counts as leader and
    ``True`` is returned.
    """
    election = self._leader_election
    if election:
        return election.is_leader
    return True
|
||||
|
||||
async def _run_scheduler_task(self):
    """Run one round of scheduled production alert checks.

    This is the callable registered as the APScheduler interval job. It
    runs ``check_all_conditions`` once and logs the number of alerts
    generated together with the elapsed time.

    No database advisory lock is taken here. The former lock path derived
    its id from the per-process randomized ``hash()``, ignored the result
    of ``pg_try_advisory_lock``, released the lock on a different session
    and ran the checks twice.

    Exceptions raised by the checks are logged and swallowed rather than
    propagated to the scheduler.
    """
    start_time = datetime.now()
    logger.info("Running scheduled production alert checks")

    try:
        # Run all alert checks
        alerts_generated = await self.check_all_conditions()
    except Exception as e:
        logger.error(
            "Error in production scheduler task",
            error=str(e),
            exc_info=True
        )
    else:
        duration = (datetime.now() - start_time).total_seconds()
        logger.info(
            "Completed scheduled production alert checks",
            alerts_generated=alerts_generated,
            duration_seconds=round(duration, 2)
        )
|
||||
|
||||
async def check_all_conditions(self) -> int:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user