Add new infra architecture
@@ -1,11 +1,11 @@
 # AI Insights Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Alert Processor Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Auth Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 # Create non-root user for security
 RUN groupadd -r appgroup && useradd -r -g appgroup appuser
@@ -1,11 +1,11 @@
 # Demo Session Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Distribution Service Dockerfile
 # Stage 1: Copy shared libraries
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Stage 2: Main service
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -50,9 +50,9 @@ class DistributionService(StandardFastAPIService):

     def __init__(self):
         # Define expected database tables for health checks
+        # Must match tables created in migrations/versions/001_initial_schema.py
         distribution_expected_tables = [
-            'delivery_routes', 'shipments', 'route_assignments', 'delivery_points',
-            'vehicle_assignments', 'delivery_schedule', 'shipment_tracking', 'audit_logs'
+            'delivery_routes', 'shipments', 'delivery_schedules'
         ]

         # Define custom metrics for distribution service
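The shortened table list above is consumed by the service's health checks ("Define expected database tables for health checks"). A minimal sketch of such a check, assuming a SQLAlchemy AsyncSession against PostgreSQL (function name and query are illustrative, not part of this commit):

```python
# Illustrative sketch only: verify the expected tables exist in the public schema.
from typing import List

from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

EXPECTED_TABLES: List[str] = ['delivery_routes', 'shipments', 'delivery_schedules']

async def missing_tables(session: AsyncSession) -> List[str]:
    """Return expected tables that are not present in the database."""
    result = await session.execute(
        text("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'")
    )
    existing = {row[0] for row in result}
    return [t for t in EXPECTED_TABLES if t not in existing]
```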
services/external/Dockerfile (vendored)
@@ -1,11 +1,11 @@
 # External Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Forecasting Service Dockerfile with MinIO Support
 # Multi-stage build for optimized production image
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Inventory Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -120,8 +120,12 @@ class InventoryService(StandardFastAPIService):
             await alert_service.start()
             self.logger.info("Inventory alert service started")

-            # Initialize inventory scheduler with alert service and database manager
-            inventory_scheduler = InventoryScheduler(alert_service, self.database_manager)
+            # Initialize inventory scheduler with alert service, database manager, and Redis URL for leader election
+            inventory_scheduler = InventoryScheduler(
+                alert_service,
+                self.database_manager,
+                redis_url=settings.REDIS_URL  # Pass Redis URL for leader election in multi-replica deployments
+            )
             await inventory_scheduler.start()
             self.logger.info("Inventory scheduler started")
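The scheduler wiring above assumes the service settings expose a REDIS_URL value. A minimal sketch of such a field (pydantic-settings style; names and defaults are assumptions, not part of this commit):

```python
# Illustrative sketch only: settings.REDIS_URL as referenced above.
from typing import Optional

from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # e.g. redis://redis:6379/0; when unset, the schedulers fall back to standalone mode
    REDIS_URL: Optional[str] = None

settings = Settings()
```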
@@ -2,6 +2,9 @@
 Inventory Scheduler Service
 Background task that periodically checks for inventory alert conditions
 and triggers appropriate alerts.
+
+Uses Redis-based leader election to ensure only one pod runs scheduled tasks
+when running with multiple replicas.
 """

 import asyncio
@@ -22,22 +25,129 @@ from app.services.inventory_alert_service import InventoryAlertService

 logger = structlog.get_logger()

-class InventoryScheduler:
-    """Inventory scheduler service that checks for alert conditions"""
-
-    def __init__(self, alert_service: InventoryAlertService, database_manager: Any):
+class InventoryScheduler:
+    """
+    Inventory scheduler service that checks for alert conditions.
+
+    Uses Redis-based leader election to ensure only one pod runs
+    scheduled jobs in a multi-replica deployment.
+    """
+
+    def __init__(self, alert_service: InventoryAlertService, database_manager: Any, redis_url: str = None):
         self.alert_service = alert_service
         self.database_manager = database_manager
-        self.scheduler = AsyncIOScheduler()
+        self.scheduler = None
         self.check_interval = 300  # 5 minutes
         self.job_id = 'inventory_scheduler'
+
+        # Leader election
+        self._redis_url = redis_url
+        self._leader_election = None
+        self._redis_client = None
+        self._scheduler_started = False

     async def start(self):
-        """Start the inventory scheduler with APScheduler"""
-        if self.scheduler.running:
-            logger.warning("Inventory scheduler is already running")
+        """Start the inventory scheduler with leader election"""
+        if self._redis_url:
+            await self._start_with_leader_election()
+        else:
+            # Fallback to standalone mode (for local development or single-pod deployments)
+            logger.warning("Redis URL not provided, starting inventory scheduler in standalone mode")
+            await self._start_standalone()
+
+    async def _start_with_leader_election(self):
+        """Start with Redis-based leader election for horizontal scaling"""
+        import redis.asyncio as redis
+        from shared.leader_election import LeaderElectionService
+
+        try:
+            # Create Redis connection
+            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
+            await self._redis_client.ping()
+
+            # Create scheduler (but don't start it yet)
+            self.scheduler = AsyncIOScheduler()
+
+            # Create leader election
+            self._leader_election = LeaderElectionService(
+                self._redis_client,
+                service_name="inventory-scheduler"
+            )
+
+            # Start leader election with callbacks
+            await self._leader_election.start(
+                on_become_leader=self._on_become_leader,
+                on_lose_leader=self._on_lose_leader
+            )
+
+            logger.info("Inventory scheduler started with leader election",
+                        is_leader=self._leader_election.is_leader,
+                        instance_id=self._leader_election.instance_id)
+
+        except Exception as e:
+            logger.error("Failed to start with leader election, falling back to standalone",
+                         error=str(e))
+            await self._start_standalone()
+
+    async def _on_become_leader(self):
+        """Called when this instance becomes the leader"""
+        logger.info("Inventory scheduler became leader, starting scheduled jobs")
+        await self._start_scheduler()
+
+    async def _on_lose_leader(self):
+        """Called when this instance loses leadership"""
+        logger.warning("Inventory scheduler lost leadership, stopping scheduled jobs")
+        await self._stop_scheduler()
+
+    async def _start_scheduler(self):
+        """Start the APScheduler with inventory check jobs"""
+        if self._scheduler_started:
+            logger.warning("Inventory scheduler already started")
+            return
+
+        try:
+            # Add the periodic job
+            trigger = IntervalTrigger(seconds=self.check_interval)
+            self.scheduler.add_job(
+                self._run_scheduler_task,
+                trigger=trigger,
+                id=self.job_id,
+                name="Inventory Alert Checks",
+                max_instances=1  # Prevent overlapping executions
+            )
+
+            # Start scheduler
+            if not self.scheduler.running:
+                self.scheduler.start()
+                self._scheduler_started = True
+                logger.info("Inventory scheduler jobs started",
+                            interval_seconds=self.check_interval,
+                            job_count=len(self.scheduler.get_jobs()))
+
+        except Exception as e:
+            logger.error("Failed to start inventory scheduler", error=str(e))
+
+    async def _stop_scheduler(self):
+        """Stop the APScheduler"""
+        if not self._scheduler_started:
+            return
+
+        try:
+            if self.scheduler and self.scheduler.running:
+                self.scheduler.shutdown(wait=False)
+                self._scheduler_started = False
+                logger.info("Inventory scheduler jobs stopped")
+
+        except Exception as e:
+            logger.error("Failed to stop inventory scheduler", error=str(e))
+
+    async def _start_standalone(self):
+        """Start scheduler without leader election (fallback mode)"""
+        logger.warning("Starting inventory scheduler in standalone mode (no leader election)")
+
+        self.scheduler = AsyncIOScheduler()

         # Add the periodic job
         trigger = IntervalTrigger(seconds=self.check_interval)
         self.scheduler.add_job(
@@ -45,75 +155,63 @@ class InventoryScheduler:
             trigger=trigger,
             id=self.job_id,
             name="Inventory Alert Checks",
-            max_instances=1  # Prevent overlapping executions
+            max_instances=1
         )

-        # Start the scheduler
-        self.scheduler.start()
-        logger.info("Inventory scheduler started", interval_seconds=self.check_interval)
+        if not self.scheduler.running:
+            self.scheduler.start()
+            self._scheduler_started = True
+            logger.info("Inventory scheduler started (standalone mode)",
+                        interval_seconds=self.check_interval)

     async def stop(self):
-        """Stop the inventory scheduler"""
-        if self.scheduler.running:
-            self.scheduler.shutdown(wait=True)
-            logger.info("Inventory scheduler stopped")
-        else:
-            logger.info("Inventory scheduler already stopped")
+        """Stop the inventory scheduler and leader election"""
+        # Stop leader election
+        if self._leader_election:
+            await self._leader_election.stop()
+
+        # Stop scheduler
+        await self._stop_scheduler()
+
+        # Close Redis
+        if self._redis_client:
+            await self._redis_client.close()
+
+        logger.info("Inventory scheduler stopped")
+
+    @property
+    def is_leader(self) -> bool:
+        """Check if this instance is the leader"""
+        return self._leader_election.is_leader if self._leader_election else True
+
+    def get_leader_status(self) -> dict:
+        """Get leader election status"""
+        if self._leader_election:
+            return self._leader_election.get_status()
+        return {"is_leader": True, "mode": "standalone"}

     async def _run_scheduler_task(self):
-        """Run scheduled inventory alert checks with leader election"""
-        # Try to acquire leader lock for this scheduler
-        lock_name = f"inventory_scheduler:{self.database_manager.database_url if hasattr(self.database_manager, 'database_url') else 'default'}"
-        lock_id = abs(hash(lock_name)) % (2**31)  # Generate a unique integer ID for the lock
-        acquired = False
+        """Run scheduled inventory alert checks"""
+        start_time = datetime.now()
+        logger.info("Running scheduled inventory alert checks")

         try:
-            # Try to acquire PostgreSQL advisory lock for leader election
-            async with self.database_manager.get_session() as session:
-                result = await session.execute(text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id})
-                acquired = True  # If no exception, lock was acquired
-
-            start_time = datetime.now()
-            logger.info("Running scheduled inventory alert checks (as leader)")
-
-            # Run all alert checks
-            alerts_generated = await self.check_all_conditions()
-
-            duration = (datetime.now() - start_time).total_seconds()
-            logger.info(
-                "Completed scheduled inventory alert checks",
-                alerts_generated=alerts_generated,
-                duration_seconds=round(duration, 2)
-            )
+            # Run all alert checks
+            alerts_generated = await self.check_all_conditions()
+
+            duration = (datetime.now() - start_time).total_seconds()
+            logger.info(
+                "Completed scheduled inventory alert checks",
+                alerts_generated=alerts_generated,
+                duration_seconds=round(duration, 2)
+            )

         except Exception as e:
-            # If it's a lock acquisition error, log and skip execution (another instance is running)
-            error_str = str(e).lower()
-            if "lock" in error_str or "timeout" in error_str or "could not acquire" in error_str:
-                logger.debug(
-                    "Skipping inventory scheduler execution (not leader)",
-                    lock_name=lock_name
-                )
-                return  # Not an error, just not the leader
-            else:
-                logger.error(
-                    "Error in inventory scheduler task",
-                    error=str(e),
-                    exc_info=True
-                )
-
-        finally:
-            if acquired:
-                # Release the lock
-                try:
-                    async with self.database_manager.get_session() as session:
-                        await session.execute(text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id})
-                        await session.commit()
-                except Exception as unlock_error:
-                    logger.warning(
-                        "Error releasing leader lock (may have been automatically released)",
-                        error=str(unlock_error)
-                    )
+            logger.error(
+                "Error in inventory scheduler task",
+                error=str(e),
+                exc_info=True
+            )

     async def check_all_conditions(self) -> int:
         """
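For contrast with the removed advisory-lock variant of _run_scheduler_task: pg_try_advisory_lock returns a boolean rather than raising, so the old `acquired = True  # If no exception` never actually checked whether the lock was obtained. A minimal sketch of how that pattern is normally checked (illustrative only; this commit replaces it with Redis-based leader election instead):

```python
# Illustrative sketch only: checking the boolean returned by pg_try_advisory_lock.
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

async def run_if_leader(session: AsyncSession, lock_id: int) -> bool:
    """Return False without running if another instance holds the advisory lock."""
    result = await session.execute(
        text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id}
    )
    if not bool(result.scalar()):
        return False  # another instance is the leader for this run
    try:
        # ... run the scheduled checks here ...
        return True
    finally:
        await session.execute(
            text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id}
        )
```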
@@ -1,11 +1,11 @@
 # Notification Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Orchestrator Service Dockerfile
 # Stage 1: Copy shared libraries
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Stage 2: Main service
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Orders Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Pos Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -20,28 +20,12 @@ from shared.service_base import StandardFastAPIService
 class POSService(StandardFastAPIService):
     """POS Integration Service with standardized setup"""

-    expected_migration_version = "00001"
-
-    async def on_startup(self, app):
-        """Custom startup logic including migration verification"""
-        await self.verify_migrations()
-        await super().on_startup(app)
-
-    async def verify_migrations(self):
-        """Verify database schema matches the latest migrations."""
-        try:
-            async with self.database_manager.get_session() as session:
-                result = await session.execute(text("SELECT version_num FROM alembic_version"))
-                version = result.scalar()
-                if version != self.expected_migration_version:
-                    self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
-                    raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
-                self.logger.info(f"Migration verification successful: {version}")
-        except Exception as e:
-            self.logger.error(f"Migration verification failed: {e}")
-            raise
+    expected_migration_version = "e9976ec9fe9e"

     def __init__(self):
+        # Initialize scheduler reference
+        self.pos_scheduler = None
+
         # Define expected database tables for health checks
         pos_expected_tables = [
             'pos_configurations', 'pos_transactions', 'pos_transaction_items',
@@ -87,15 +71,42 @@ class POSService(StandardFastAPIService):
             custom_metrics=pos_custom_metrics
         )

+    async def verify_migrations(self):
+        """Verify database schema matches the latest migrations."""
+        try:
+            async with self.database_manager.get_session() as session:
+                result = await session.execute(text("SELECT version_num FROM alembic_version"))
+                version = result.scalar()
+                if version != self.expected_migration_version:
+                    self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
+                    raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
+                self.logger.info(f"Migration verification successful: {version}")
+        except Exception as e:
+            self.logger.error(f"Migration verification failed: {e}")
+            raise
+
     async def on_startup(self, app: FastAPI):
         """Custom startup logic for POS service"""
-        # Start background scheduler for POS-to-Sales sync
+        # Verify migrations first
+        await self.verify_migrations()
+
+        # Call parent startup
+        await super().on_startup(app)
+
+        # Start background scheduler for POS-to-Sales sync with leader election
         try:
-            from app.scheduler import start_scheduler
-            start_scheduler()
-            self.logger.info("Background scheduler started successfully")
+            from app.scheduler import POSScheduler
+            self.pos_scheduler = POSScheduler(
+                redis_url=settings.REDIS_URL,  # Pass Redis URL for leader election
+                sync_interval_minutes=settings.SYNC_INTERVAL_SECONDS // 60 if settings.SYNC_INTERVAL_SECONDS >= 60 else 5
+            )
+            await self.pos_scheduler.start()
+            self.logger.info("POS scheduler started successfully with leader election")
+
+            # Store scheduler in app state for status checks
+            app.state.pos_scheduler = self.pos_scheduler
         except Exception as e:
-            self.logger.error(f"Failed to start background scheduler: {e}", exc_info=True)
+            self.logger.error(f"Failed to start POS scheduler: {e}", exc_info=True)
             # Don't fail startup if scheduler fails

         # Custom startup completed
@@ -103,13 +114,13 @@ class POSService(StandardFastAPIService):

     async def on_shutdown(self, app: FastAPI):
         """Custom shutdown logic for POS service"""
-        # Shutdown background scheduler
+        # Shutdown POS scheduler
         try:
-            from app.scheduler import shutdown_scheduler
-            shutdown_scheduler()
-            self.logger.info("Background scheduler stopped successfully")
+            if self.pos_scheduler:
+                await self.pos_scheduler.stop()
+                self.logger.info("POS scheduler stopped successfully")
         except Exception as e:
-            self.logger.error(f"Failed to stop background scheduler: {e}", exc_info=True)
+            self.logger.error(f"Failed to stop POS scheduler: {e}", exc_info=True)

         # Database cleanup is handled by the base class
         pass
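The startup hook above stores the scheduler in app.state.pos_scheduler "for status checks". An endpoint along these lines could expose it (route path and response shape are assumptions, not part of this commit):

```python
# Illustrative sketch only: read the scheduler stored on app.state by on_startup.
from fastapi import APIRouter, Request

router = APIRouter()

@router.get("/scheduler/status")
async def scheduler_status(request: Request) -> dict:
    scheduler = getattr(request.app.state, "pos_scheduler", None)
    if scheduler is None:
        return {"running": False, "is_leader": False, "jobs": []}
    return scheduler.get_scheduler_status()
```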
@@ -5,17 +5,19 @@ Sets up periodic background jobs for:
 - Syncing POS transactions to sales service
 - Other maintenance tasks as needed

-To enable scheduling, add to main.py startup:
+Uses Redis-based leader election to ensure only one pod runs scheduled tasks
+when running with multiple replicas.
+
+Usage in main.py:
 ```python
-from app.scheduler import start_scheduler, shutdown_scheduler
+from app.scheduler import POSScheduler

-@app.on_event("startup")
-async def startup_event():
-    start_scheduler()
+# On startup
+scheduler = POSScheduler(redis_url=settings.REDIS_URL)
+await scheduler.start()

-@app.on_event("shutdown")
-async def shutdown_event():
-    shutdown_scheduler()
+# On shutdown
+await scheduler.stop()
 ```
 """
@@ -23,65 +25,307 @@ import structlog
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 from datetime import datetime
+from typing import Optional

 logger = structlog.get_logger()

-# Global scheduler instance
-scheduler = None
+
+class POSScheduler:
+    """
+    POS Scheduler service that manages background sync jobs.
+
+    Uses Redis-based leader election to ensure only one pod runs
+    scheduled jobs in a multi-replica deployment.
+    """
+
+    def __init__(self, redis_url: str = None, sync_interval_minutes: int = 5):
+        """
+        Initialize POS scheduler.
+
+        Args:
+            redis_url: Redis connection URL for leader election
+            sync_interval_minutes: Interval for POS-to-sales sync job
+        """
+        self.scheduler = None
+        self.sync_interval_minutes = sync_interval_minutes
+
+        # Leader election
+        self._redis_url = redis_url
+        self._leader_election = None
+        self._redis_client = None
+        self._scheduler_started = False
+
+    async def start(self):
+        """Start the POS scheduler with leader election"""
+        if self._redis_url:
+            await self._start_with_leader_election()
+        else:
+            # Fallback to standalone mode (for local development or single-pod deployments)
+            logger.warning("Redis URL not provided, starting POS scheduler in standalone mode")
+            await self._start_standalone()
+
+    async def _start_with_leader_election(self):
+        """Start with Redis-based leader election for horizontal scaling"""
+        import redis.asyncio as redis
+        from shared.leader_election import LeaderElectionService
+
+        try:
+            # Create Redis connection
+            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
+            await self._redis_client.ping()
+
+            # Create scheduler (but don't start it yet)
+            self.scheduler = AsyncIOScheduler()
+
+            # Create leader election
+            self._leader_election = LeaderElectionService(
+                self._redis_client,
+                service_name="pos-scheduler"
+            )
+
+            # Start leader election with callbacks
+            await self._leader_election.start(
+                on_become_leader=self._on_become_leader,
+                on_lose_leader=self._on_lose_leader
+            )
+
+            logger.info("POS scheduler started with leader election",
+                        is_leader=self._leader_election.is_leader,
+                        instance_id=self._leader_election.instance_id)
+
+        except Exception as e:
+            logger.error("Failed to start with leader election, falling back to standalone",
+                         error=str(e))
+            await self._start_standalone()
+
+    async def _on_become_leader(self):
+        """Called when this instance becomes the leader"""
+        logger.info("POS scheduler became leader, starting scheduled jobs")
+        await self._start_scheduler()
+
+    async def _on_lose_leader(self):
+        """Called when this instance loses leadership"""
+        logger.warning("POS scheduler lost leadership, stopping scheduled jobs")
+        await self._stop_scheduler()
+
+    async def _start_scheduler(self):
+        """Start the APScheduler with POS jobs"""
+        if self._scheduler_started:
+            logger.warning("POS scheduler already started")
+            return
+
+        try:
+            # Import sync job
+            from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
+
+            # Job 1: Sync POS transactions to sales service
+            self.scheduler.add_job(
+                run_pos_to_sales_sync,
+                trigger=IntervalTrigger(minutes=self.sync_interval_minutes),
+                id='pos_to_sales_sync',
+                name='Sync POS Transactions to Sales',
+                replace_existing=True,
+                max_instances=1,  # Prevent concurrent runs
+                coalesce=True,  # Combine multiple missed runs into one
+                misfire_grace_time=60  # Allow 60 seconds grace for missed runs
+            )
+
+            # Start scheduler
+            if not self.scheduler.running:
+                self.scheduler.start()
+                self._scheduler_started = True
+                logger.info("POS scheduler jobs started",
+                            sync_interval_minutes=self.sync_interval_minutes,
+                            job_count=len(self.scheduler.get_jobs()),
+                            next_run=self.scheduler.get_jobs()[0].next_run_time if self.scheduler.get_jobs() else None)
+
+        except Exception as e:
+            logger.error("Failed to start POS scheduler", error=str(e))
+
+    async def _stop_scheduler(self):
+        """Stop the APScheduler"""
+        if not self._scheduler_started:
+            return
+
+        try:
+            if self.scheduler and self.scheduler.running:
+                self.scheduler.shutdown(wait=False)
+                self._scheduler_started = False
+                logger.info("POS scheduler jobs stopped")
+
+        except Exception as e:
+            logger.error("Failed to stop POS scheduler", error=str(e))
+
+    async def _start_standalone(self):
+        """Start scheduler without leader election (fallback mode)"""
+        logger.warning("Starting POS scheduler in standalone mode (no leader election)")
+
+        self.scheduler = AsyncIOScheduler()
+
+        try:
+            # Import sync job
+            from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
+
+            self.scheduler.add_job(
+                run_pos_to_sales_sync,
+                trigger=IntervalTrigger(minutes=self.sync_interval_minutes),
+                id='pos_to_sales_sync',
+                name='Sync POS Transactions to Sales',
+                replace_existing=True,
+                max_instances=1,
+                coalesce=True,
+                misfire_grace_time=60
+            )
+
+            if not self.scheduler.running:
+                self.scheduler.start()
+                self._scheduler_started = True
+                logger.info("POS scheduler started (standalone mode)",
+                            sync_interval_minutes=self.sync_interval_minutes,
+                            next_run=self.scheduler.get_jobs()[0].next_run_time if self.scheduler.get_jobs() else None)
+
+        except Exception as e:
+            logger.error("Failed to start POS scheduler in standalone mode", error=str(e))
+
+    async def stop(self):
+        """Stop the POS scheduler and leader election"""
+        # Stop leader election
+        if self._leader_election:
+            await self._leader_election.stop()
+
+        # Stop scheduler
+        await self._stop_scheduler()
+
+        # Close Redis
+        if self._redis_client:
+            await self._redis_client.close()
+
+        logger.info("POS scheduler stopped")
+
+    @property
+    def is_leader(self) -> bool:
+        """Check if this instance is the leader"""
+        return self._leader_election.is_leader if self._leader_election else True
+
+    def get_leader_status(self) -> dict:
+        """Get leader election status"""
+        if self._leader_election:
+            return self._leader_election.get_status()
+        return {"is_leader": True, "mode": "standalone"}
+
+    def get_scheduler_status(self) -> dict:
+        """
+        Get current scheduler status
+
+        Returns:
+            Dict with scheduler info and job statuses
+        """
+        if self.scheduler is None or not self._scheduler_started:
+            return {
+                "running": False,
+                "is_leader": self.is_leader,
+                "jobs": []
+            }
+
+        jobs = []
+        for job in self.scheduler.get_jobs():
+            jobs.append({
+                "id": job.id,
+                "name": job.name,
+                "next_run": job.next_run_time.isoformat() if job.next_run_time else None,
+                "trigger": str(job.trigger)
+            })
+
+        return {
+            "running": True,
+            "is_leader": self.is_leader,
+            "jobs": jobs,
+            "state": self.scheduler.state
+        }
+
+    def trigger_job_now(self, job_id: str) -> bool:
+        """
+        Manually trigger a scheduled job immediately
+
+        Args:
+            job_id: Job identifier (e.g., 'pos_to_sales_sync')
+
+        Returns:
+            True if job was triggered, False otherwise
+        """
+        if self.scheduler is None or not self._scheduler_started:
+            logger.error("Cannot trigger job, scheduler not running")
+            return False
+
+        if not self.is_leader:
+            logger.warning("Cannot trigger job, this instance is not the leader")
+            return False
+
+        try:
+            job = self.scheduler.get_job(job_id)
+            if job:
+                self.scheduler.modify_job(job_id, next_run_time=datetime.now())
+                logger.info("Job triggered manually", job_id=job_id)
+                return True
+            else:
+                logger.warning("Job not found", job_id=job_id)
+                return False
+
+        except Exception as e:
+            logger.error("Failed to trigger job", job_id=job_id, error=str(e))
+            return False
+
+
+# ================================================================
+# Legacy compatibility functions (deprecated - use POSScheduler class)
+# ================================================================
+
+# Global scheduler instance for backward compatibility
+_scheduler_instance: Optional[POSScheduler] = None


 def start_scheduler():
     """
-    Initialize and start the background scheduler
+    DEPRECATED: Use POSScheduler class directly for better leader election support.

-    Jobs configured:
-    - POS to Sales Sync: Every 5 minutes
+    Initialize and start the background scheduler (legacy function).
     """
-    global scheduler
+    global _scheduler_instance

-    if scheduler is not None:
+    if _scheduler_instance is not None:
         logger.warning("Scheduler already running")
         return

+    logger.warning("Using deprecated start_scheduler function. "
+                   "Consider migrating to POSScheduler class for leader election support.")
+
     try:
-        scheduler = AsyncIOScheduler()
-
-        # Job 1: Sync POS transactions to sales service
-        from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
-
-        scheduler.add_job(
-            run_pos_to_sales_sync,
-            trigger=IntervalTrigger(minutes=5),
-            id='pos_to_sales_sync',
-            name='Sync POS Transactions to Sales',
-            replace_existing=True,
-            max_instances=1,  # Prevent concurrent runs
-            coalesce=True,  # Combine multiple missed runs into one
-            misfire_grace_time=60  # Allow 60 seconds grace for missed runs
-        )
-
-        scheduler.start()
-        logger.info("Background scheduler started",
-                    jobs=len(scheduler.get_jobs()),
-                    next_run=scheduler.get_jobs()[0].next_run_time if scheduler.get_jobs() else None)
+        _scheduler_instance = POSScheduler()
+        # Note: This is synchronous fallback, no leader election
+        import asyncio
+        asyncio.create_task(_scheduler_instance._start_standalone())

     except Exception as e:
         logger.error("Failed to start scheduler", error=str(e), exc_info=True)
-        scheduler = None
+        _scheduler_instance = None


 def shutdown_scheduler():
-    """Gracefully shutdown the scheduler"""
-    global scheduler
+    """
+    DEPRECATED: Use POSScheduler class directly.
+
+    Gracefully shutdown the scheduler (legacy function).
+    """
+    global _scheduler_instance

-    if scheduler is None:
+    if _scheduler_instance is None:
         logger.warning("Scheduler not running")
         return

     try:
-        scheduler.shutdown(wait=True)
-        logger.info("Background scheduler stopped")
-        scheduler = None
+        import asyncio
+        asyncio.create_task(_scheduler_instance.stop())
+        _scheduler_instance = None

     except Exception as e:
         logger.error("Failed to shutdown scheduler", error=str(e), exc_info=True)
@@ -89,57 +333,25 @@ def shutdown_scheduler():

 def get_scheduler_status():
     """
-    Get current scheduler status
+    DEPRECATED: Use POSScheduler class directly.

-    Returns:
-        Dict with scheduler info and job statuses
+    Get current scheduler status (legacy function).
     """
-    if scheduler is None:
+    if _scheduler_instance is None:
         return {
             "running": False,
             "jobs": []
         }

-    jobs = []
-    for job in scheduler.get_jobs():
-        jobs.append({
-            "id": job.id,
-            "name": job.name,
-            "next_run": job.next_run_time.isoformat() if job.next_run_time else None,
-            "trigger": str(job.trigger)
-        })
-
-    return {
-        "running": True,
-        "jobs": jobs,
-        "state": scheduler.state
-    }
+    return _scheduler_instance.get_scheduler_status()


 def trigger_job_now(job_id: str):
     """
-    Manually trigger a scheduled job immediately
+    DEPRECATED: Use POSScheduler class directly.

-    Args:
-        job_id: Job identifier (e.g., 'pos_to_sales_sync')
-
-    Returns:
-        True if job was triggered, False otherwise
+    Manually trigger a scheduled job immediately (legacy function).
     """
-    if scheduler is None:
+    if _scheduler_instance is None:
         logger.error("Cannot trigger job, scheduler not running")
         return False

-    try:
-        job = scheduler.get_job(job_id)
-        if job:
-            scheduler.modify_job(job_id, next_run_time=datetime.now())
-            logger.info("Job triggered manually", job_id=job_id)
-            return True
-        else:
-            logger.warning("Job not found", job_id=job_id)
-            return False
-
-    except Exception as e:
-        logger.error("Failed to trigger job", job_id=job_id, error=str(e))
-        return False
+    return _scheduler_instance.trigger_job_now(job_id)
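The shared.leader_election module used throughout these schedulers is not part of this diff. A minimal sketch of the interface the code above relies on (constructor taking a Redis client and service_name, start() with become/lose callbacks, is_leader, instance_id, get_status(), stop()), implemented here with a simple Redis SET NX + TTL renewal loop; all internals are assumptions:

```python
# Illustrative sketch only: a Redis-based leader election service with the interface
# used by the schedulers above. Timings and key naming are assumptions.
import asyncio
import uuid

import redis.asyncio as redis

class LeaderElectionService:
    def __init__(self, redis_client: redis.Redis, service_name: str,
                 ttl_seconds: int = 30, renew_interval: int = 10):
        self._redis = redis_client
        self._key = f"leader:{service_name}"
        self.instance_id = uuid.uuid4().hex
        self._ttl = ttl_seconds
        self._renew_interval = renew_interval
        self.is_leader = False
        self._task = None
        self._on_become_leader = None
        self._on_lose_leader = None

    async def start(self, on_become_leader=None, on_lose_leader=None):
        self._on_become_leader = on_become_leader
        self._on_lose_leader = on_lose_leader
        await self._tick()  # try to acquire leadership immediately
        self._task = asyncio.create_task(self._loop())

    async def _loop(self):
        while True:
            await asyncio.sleep(self._renew_interval)
            await self._tick()

    async def _tick(self):
        # SET key value NX EX ttl succeeds only when no one holds the key;
        # the current leader refreshes the TTL instead of re-acquiring.
        acquired = await self._redis.set(self._key, self.instance_id, nx=True, ex=self._ttl)
        if acquired and not self.is_leader:
            self.is_leader = True
            if self._on_become_leader:
                await self._on_become_leader()
        elif not acquired and self.is_leader:
            holder = await self._redis.get(self._key)
            if holder == self.instance_id.encode():
                await self._redis.expire(self._key, self._ttl)
            else:
                self.is_leader = False
                if self._on_lose_leader:
                    await self._on_lose_leader()

    async def stop(self):
        if self._task:
            self._task.cancel()
        if self.is_leader:
            await self._redis.delete(self._key)
            self.is_leader = False

    def get_status(self) -> dict:
        return {"is_leader": self.is_leader, "instance_id": self.instance_id}
```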
@@ -1,11 +1,11 @@
 # Procurement Service Dockerfile
 # Stage 1: Copy shared libraries
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Stage 2: Main service
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -156,21 +156,14 @@ class DeliveryTrackingService:

     async def _check_all_tenants(self):
         """
-        Check deliveries for all active tenants (with leader election).
+        Check deliveries for all active tenants.

-        Only one pod executes this - others skip if not leader.
+        This method is only called by the leader pod (via APScheduler).
+        Leader election is handled at the scheduler level, not here.
         """
-        # Try to acquire leader lock
-        if not await self._try_acquire_leader_lock():
-            logger.debug(
-                "Skipping delivery check - not leader",
-                instance_id=self.instance_id
-            )
-            return
+        logger.info("Starting delivery checks", instance_id=self.instance_id)

         try:
-            logger.info("Starting delivery checks (as leader)", instance_id=self.instance_id)
-
             # Get all active tenants from database
             tenants = await self._get_active_tenants()
@@ -194,24 +187,8 @@ class DeliveryTrackingService:
                 total_alerts=total_alerts
             )

-        finally:
-            await self._release_leader_lock()
-
-    async def _try_acquire_leader_lock(self) -> bool:
-        """
-        Try to acquire leader lock for delivery tracking.
-
-        Uses Redis to ensure only one pod runs checks.
-        Returns True if acquired, False if another pod is leader.
-        """
-        # This simplified version doesn't implement leader election
-        # In a real implementation, you'd use Redis or database locks
-        logger.info("Delivery tracking check running", instance_id=self.instance_id)
-        return True
-
-    async def _release_leader_lock(self):
-        """Release leader lock"""
-        logger.debug("Delivery tracking check completed", instance_id=self.instance_id)
+        except Exception as e:
+            logger.error("Delivery checks failed", error=str(e), exc_info=True)

     async def _get_active_tenants(self) -> List[UUID]:
         """
@@ -1,11 +1,11 @@
 # Production Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -2,6 +2,8 @@
 Production Scheduler Service
 Background task that periodically checks for production alert conditions
 and triggers appropriate alerts.
+
+Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
 """

 import asyncio
@@ -21,103 +23,144 @@ from app.services.production_alert_service import ProductionAlertService

 logger = structlog.get_logger()

-class ProductionScheduler:
-    """Production scheduler service that checks for alert conditions"""
-
-    def __init__(self, alert_service: ProductionAlertService, database_manager: Any):
+class ProductionScheduler:
+    """Production scheduler service that checks for alert conditions.
+
+    Uses Redis-based leader election to ensure only one pod runs the scheduler.
+    """
+
+    def __init__(self, alert_service: ProductionAlertService, database_manager: Any, redis_url: str = None):
         self.alert_service = alert_service
         self.database_manager = database_manager
+        self.redis_url = redis_url
         self.scheduler = AsyncIOScheduler()
         self.check_interval = 300  # 5 minutes
         self.job_id = 'production_scheduler'
+
+        # Leader election
+        self._leader_election = None
+        self._redis_client = None
+        self._scheduler_started = False
+
+        # Cache of emitted alerts to avoid duplicates
+        self._emitted_alerts: set = set()
+        self._alert_cache_ttl = 3600  # 1 hour
+        self._last_cache_clear = datetime.utcnow()

     async def start(self):
-        """Start the production scheduler with APScheduler"""
-        if self.scheduler.running:
-            logger.warning("Production scheduler is already running")
-            return
-
-        # Add the periodic job
-        trigger = IntervalTrigger(seconds=self.check_interval)
-        self.scheduler.add_job(
-            self._run_scheduler_task,
-            trigger=trigger,
-            id=self.job_id,
-            name="Production Alert Checks",
-            max_instances=1  # Prevent overlapping executions
-        )
-
-        # Start the scheduler
-        self.scheduler.start()
-        logger.info("Production scheduler started", interval_seconds=self.check_interval)
+        """Start the production scheduler with leader election"""
+        try:
+            # Initialize leader election if Redis URL is provided
+            if self.redis_url:
+                await self._setup_leader_election()
+            else:
+                # No Redis, start scheduler directly (standalone mode)
+                logger.warning("No Redis URL provided, starting scheduler in standalone mode")
+                await self._start_scheduler()
+        except Exception as e:
+            logger.error("Failed to setup leader election, starting in standalone mode",
+                         error=str(e))
+            await self._start_scheduler()
+
+    async def _setup_leader_election(self):
+        """Setup Redis-based leader election"""
+        from shared.leader_election import LeaderElectionService
+        import redis.asyncio as redis
+
+        self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
+        await self._redis_client.ping()
+
+        self._leader_election = LeaderElectionService(
+            self._redis_client,
+            service_name="production-scheduler"
+        )
+
+        await self._leader_election.start(
+            on_become_leader=self._on_become_leader,
+            on_lose_leader=self._on_lose_leader
+        )
+
+        logger.info("Leader election initialized for production scheduler",
+                    is_leader=self._leader_election.is_leader)
+
+    async def _on_become_leader(self):
+        """Called when this instance becomes the leader"""
+        logger.info("Became leader for production scheduler - starting scheduler")
+        await self._start_scheduler()
+
+    async def _on_lose_leader(self):
+        """Called when this instance loses leadership"""
+        logger.warning("Lost leadership for production scheduler - stopping scheduler")
+        await self._stop_scheduler()
+
+    async def _start_scheduler(self):
+        """Start the APScheduler"""
+        if self._scheduler_started:
+            logger.debug("Production scheduler already started")
+            return
+
+        if not self.scheduler.running:
+            trigger = IntervalTrigger(seconds=self.check_interval)
+            self.scheduler.add_job(
+                self._run_scheduler_task,
+                trigger=trigger,
+                id=self.job_id,
+                name="Production Alert Checks",
+                max_instances=1
+            )
+
+            self.scheduler.start()
+            self._scheduler_started = True
+            logger.info("Production scheduler started", interval_seconds=self.check_interval)
+
+    async def _stop_scheduler(self):
+        """Stop the APScheduler"""
+        if not self._scheduler_started:
+            return
+
+        if self.scheduler.running:
+            self.scheduler.shutdown(wait=False)
+            self._scheduler_started = False
+            logger.info("Production scheduler stopped")

     async def stop(self):
-        """Stop the production scheduler"""
-        if self.scheduler.running:
-            self.scheduler.shutdown(wait=True)
-            logger.info("Production scheduler stopped")
-        else:
-            logger.info("Production scheduler already stopped")
+        """Stop the production scheduler and leader election"""
+        if self._leader_election:
+            await self._leader_election.stop()
+
+        await self._stop_scheduler()
+
+        if self._redis_client:
+            await self._redis_client.close()
+
+    @property
+    def is_leader(self) -> bool:
+        """Check if this instance is the leader"""
+        return self._leader_election.is_leader if self._leader_election else True

     async def _run_scheduler_task(self):
-        """Run scheduled production alert checks with leader election"""
-        # Try to acquire leader lock for this scheduler
-        lock_name = f"production_scheduler:{self.database_manager.database_url if hasattr(self.database_manager, 'database_url') else 'default'}"
-        lock_id = abs(hash(lock_name)) % (2**31)  # Generate a unique integer ID for the lock
-        acquired = False
+        """Run scheduled production alert checks"""
+        start_time = datetime.now()
+        logger.info("Running scheduled production alert checks")

         try:
-            # Try to acquire PostgreSQL advisory lock for leader election
-            async with self.database_manager.get_session() as session:
-                result = await session.execute(text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id})
-                acquired = True  # If no exception, lock was acquired
-
-            start_time = datetime.now()
-            logger.info("Running scheduled production alert checks (as leader)")
-
-            # Run all alert checks
-            alerts_generated = await self.check_all_conditions()
-
-            duration = (datetime.now() - start_time).total_seconds()
-            logger.info(
-                "Completed scheduled production alert checks",
-                alerts_generated=alerts_generated,
-                duration_seconds=round(duration, 2)
-            )
+            # Run all alert checks
+            alerts_generated = await self.check_all_conditions()
+
+            duration = (datetime.now() - start_time).total_seconds()
+            logger.info(
+                "Completed scheduled production alert checks",
+                alerts_generated=alerts_generated,
+                duration_seconds=round(duration, 2)
+            )

         except Exception as e:
-            # If it's a lock acquisition error, log and skip execution (another instance is running)
-            error_str = str(e).lower()
-            if "lock" in error_str or "timeout" in error_str or "could not acquire" in error_str:
-                logger.debug(
-                    "Skipping production scheduler execution (not leader)",
-                    lock_name=lock_name
-                )
-                return  # Not an error, just not the leader
-            else:
-                logger.error(
-                    "Error in production scheduler task",
-                    error=str(e),
-                    exc_info=True
-                )
-
-        finally:
-            if acquired:
-                # Release the lock
-                try:
-                    async with self.database_manager.get_session() as session:
-                        await session.execute(text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id})
-                        await session.commit()
-                except Exception as unlock_error:
-                    logger.warning(
-                        "Error releasing leader lock (may have been automatically released)",
-                        error=str(unlock_error)
-                    )
+            logger.error(
+                "Error in production scheduler task",
+                error=str(e),
+                exc_info=True
+            )

     async def check_all_conditions(self) -> int:
         """
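The __init__ above also adds an _emitted_alerts set with _alert_cache_ttl and _last_cache_clear, which points at a time-based de-duplication cache; the cache-clearing logic itself is not shown in this hunk. A minimal sketch of the idea (names and behaviour are assumptions, not part of the diff):

```python
# Illustrative sketch only: a TTL-based de-duplication cache like the fields above suggest.
from datetime import datetime

class EmittedAlertCache:
    def __init__(self, ttl_seconds: int = 3600):
        self._emitted: set = set()
        self._ttl = ttl_seconds
        self._last_clear = datetime.utcnow()

    def seen(self, alert_key: str) -> bool:
        """Return True if this alert was already emitted within the TTL window."""
        if (datetime.utcnow() - self._last_clear).total_seconds() >= self._ttl:
            self._emitted.clear()
            self._last_clear = datetime.utcnow()
        if alert_key in self._emitted:
            return True
        self._emitted.add(alert_key)
        return False
```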
@@ -1,11 +1,11 @@
 # Recipes Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Sales Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Suppliers Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Tenant Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app
@@ -1,11 +1,11 @@
 # Training Service Dockerfile with MinIO Support
 # Multi-stage build for optimized production image
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app