Add new infra architecture

This commit is contained in:
Urtzi Alfaro
2026-01-19 11:55:17 +01:00
parent 21d35ea92b
commit 35f164f0cd
311 changed files with 13241 additions and 3700 deletions

View File

@@ -1,11 +1,11 @@
# AI Insights Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Alert Processor Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Auth Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
# Create non-root user for security
RUN groupadd -r appgroup && useradd -r -g appgroup appuser

View File

@@ -1,11 +1,11 @@
# Demo Session Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Distribution Service Dockerfile
# Stage 1: Copy shared libraries
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Stage 2: Main service
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -50,9 +50,9 @@ class DistributionService(StandardFastAPIService):
def __init__(self):
# Define expected database tables for health checks
# Must match tables created in migrations/versions/001_initial_schema.py
distribution_expected_tables = [
'delivery_routes', 'shipments', 'route_assignments', 'delivery_points',
'vehicle_assignments', 'delivery_schedule', 'shipment_tracking', 'audit_logs'
'delivery_routes', 'shipments', 'delivery_schedules'
]
# Define custom metrics for distribution service

View File

@@ -1,11 +1,11 @@
# External Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Forecasting Service Dockerfile with MinIO Support
# Multi-stage build for optimized production image
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Inventory Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -120,8 +120,12 @@ class InventoryService(StandardFastAPIService):
await alert_service.start()
self.logger.info("Inventory alert service started")
# Initialize inventory scheduler with alert service and database manager
inventory_scheduler = InventoryScheduler(alert_service, self.database_manager)
# Initialize inventory scheduler with alert service, database manager, and Redis URL for leader election
inventory_scheduler = InventoryScheduler(
alert_service,
self.database_manager,
redis_url=settings.REDIS_URL # Pass Redis URL for leader election in multi-replica deployments
)
await inventory_scheduler.start()
self.logger.info("Inventory scheduler started")

View File

@@ -2,6 +2,9 @@
Inventory Scheduler Service
Background task that periodically checks for inventory alert conditions
and triggers appropriate alerts.
Uses Redis-based leader election to ensure only one pod runs scheduled tasks
when running with multiple replicas.
"""
import asyncio
@@ -22,22 +25,129 @@ from app.services.inventory_alert_service import InventoryAlertService
logger = structlog.get_logger()
class InventoryScheduler:
"""Inventory scheduler service that checks for alert conditions"""
def __init__(self, alert_service: InventoryAlertService, database_manager: Any):
class InventoryScheduler:
"""
Inventory scheduler service that checks for alert conditions.
Uses Redis-based leader election to ensure only one pod runs
scheduled jobs in a multi-replica deployment.
"""
def __init__(self, alert_service: InventoryAlertService, database_manager: Any, redis_url: str = None):
self.alert_service = alert_service
self.database_manager = database_manager
self.scheduler = AsyncIOScheduler()
self.scheduler = None
self.check_interval = 300 # 5 minutes
self.job_id = 'inventory_scheduler'
# Leader election
self._redis_url = redis_url
self._leader_election = None
self._redis_client = None
self._scheduler_started = False
async def start(self):
"""Start the inventory scheduler with APScheduler"""
if self.scheduler.running:
logger.warning("Inventory scheduler is already running")
"""Start the inventory scheduler with leader election"""
if self._redis_url:
await self._start_with_leader_election()
else:
# Fallback to standalone mode (for local development or single-pod deployments)
logger.warning("Redis URL not provided, starting inventory scheduler in standalone mode")
await self._start_standalone()
async def _start_with_leader_election(self):
    """Start the scheduler under Redis-based leader election.

    Connects to Redis, creates (but does not start) the APScheduler and
    starts the shared ``LeaderElectionService`` with callbacks that start
    and stop the scheduled jobs as leadership changes.

    If any step fails, the partially initialised leader-election state is
    torn down and the scheduler falls back to standalone mode, so that
    ``is_leader`` / ``get_leader_status`` report the standalone state and no
    previously started scheduler keeps running next to the standalone one.
    """
    import redis.asyncio as redis
    from shared.leader_election import LeaderElectionService

    try:
        # Create Redis connection and verify it is reachable
        self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
        await self._redis_client.ping()

        # Create scheduler (but don't start it yet); jobs start on leadership
        self.scheduler = AsyncIOScheduler()

        # Create leader election
        self._leader_election = LeaderElectionService(
            self._redis_client,
            service_name="inventory-scheduler"
        )

        # Start leader election with callbacks
        await self._leader_election.start(
            on_become_leader=self._on_become_leader,
            on_lose_leader=self._on_lose_leader
        )

        logger.info("Inventory scheduler started with leader election",
                    is_leader=self._leader_election.is_leader,
                    instance_id=self._leader_election.instance_id)
    except Exception as e:
        logger.error("Failed to start with leader election, falling back to standalone",
                     error=str(e))
        await self._discard_leader_election()
        await self._start_standalone()

async def _discard_leader_election(self):
    """Best-effort teardown of leader-election state before a standalone fallback."""
    election, client = self._leader_election, self._redis_client
    # Clear the references first so status helpers report standalone mode
    # even if the teardown below fails.
    self._leader_election = None
    self._redis_client = None

    if election is not None:
        try:
            await election.stop()
        except Exception as stop_error:
            logger.warning("Failed to stop leader election during fallback",
                           error=str(stop_error))

    # Leadership may already have been won (and jobs started) before the
    # failure; stop that scheduler so the standalone one does not duplicate it.
    await self._stop_scheduler()

    if client is not None:
        try:
            await client.close()
        except Exception as close_error:
            logger.warning("Failed to close Redis client during fallback",
                           error=str(close_error))
async def _on_become_leader(self):
"""Leader-election callback: this instance has become the leader.

Registered as ``on_become_leader`` when leader election is started.
Logs the transition and starts the scheduled jobs via ``_start_scheduler``.
"""
logger.info("Inventory scheduler became leader, starting scheduled jobs")
await self._start_scheduler()
async def _on_lose_leader(self):
"""Leader-election callback: this instance has lost leadership.

Registered as ``on_lose_leader`` when leader election is started.
Logs the transition and stops the scheduled jobs via ``_stop_scheduler``.
"""
logger.warning("Inventory scheduler lost leadership, stopping scheduled jobs")
await self._stop_scheduler()
async def _start_scheduler(self):
    """Register the periodic inventory check job and start APScheduler.

    Does nothing (beyond a warning) when the jobs are already started.
    Failures are logged rather than raised, so a scheduling problem does not
    propagate into the leader-election callback that invoked this method.
    """
    if self._scheduler_started:
        logger.warning("Inventory scheduler already started")
        return

    try:
        if self.scheduler is None:
            # Defensive: normally created before leader election starts.
            self.scheduler = AsyncIOScheduler()

        # Add the periodic job. replace_existing=True matters when leadership
        # is lost and regained: the stopped scheduler may still hold the job
        # under the same id, and re-adding it without replacement would
        # conflict when the scheduler is started again.
        trigger = IntervalTrigger(seconds=self.check_interval)
        self.scheduler.add_job(
            self._run_scheduler_task,
            trigger=trigger,
            id=self.job_id,
            name="Inventory Alert Checks",
            replace_existing=True,
            max_instances=1  # Prevent overlapping executions
        )

        # Start scheduler
        if not self.scheduler.running:
            self.scheduler.start()
        self._scheduler_started = True

        logger.info("Inventory scheduler jobs started",
                    interval_seconds=self.check_interval,
                    job_count=len(self.scheduler.get_jobs()))
    except Exception as e:
        logger.error("Failed to start inventory scheduler", error=str(e))
async def _stop_scheduler(self):
    """Shut down APScheduler if the jobs were started by this instance.

    A no-op when the jobs are not marked as started. Shutdown does not wait
    for running jobs (``wait=False``). Errors are logged, not raised.
    """
    if self._scheduler_started:
        try:
            active = self.scheduler
            if active and active.running:
                active.shutdown(wait=False)
            # Mark as stopped even if the scheduler was not actually running
            self._scheduler_started = False
            logger.info("Inventory scheduler jobs stopped")
        except Exception as exc:
            logger.error("Failed to stop inventory scheduler", error=str(exc))
async def _start_standalone(self):
"""Start scheduler without leader election (fallback mode)"""
logger.warning("Starting inventory scheduler in standalone mode (no leader election)")
self.scheduler = AsyncIOScheduler()
# Add the periodic job
trigger = IntervalTrigger(seconds=self.check_interval)
self.scheduler.add_job(
@@ -45,75 +155,63 @@ class InventoryScheduler:
trigger=trigger,
id=self.job_id,
name="Inventory Alert Checks",
max_instances=1 # Prevent overlapping executions
max_instances=1
)
# Start the scheduler
self.scheduler.start()
logger.info("Inventory scheduler started", interval_seconds=self.check_interval)
if not self.scheduler.running:
self.scheduler.start()
self._scheduler_started = True
logger.info("Inventory scheduler started (standalone mode)",
interval_seconds=self.check_interval)
async def stop(self):
"""Stop the inventory scheduler"""
if self.scheduler.running:
self.scheduler.shutdown(wait=True)
logger.info("Inventory scheduler stopped")
else:
logger.info("Inventory scheduler already stopped")
"""Stop the inventory scheduler and leader election"""
# Stop leader election
if self._leader_election:
await self._leader_election.stop()
# Stop scheduler
await self._stop_scheduler()
# Close Redis
if self._redis_client:
await self._redis_client.close()
logger.info("Inventory scheduler stopped")
@property
def is_leader(self) -> bool:
    """Whether this instance currently acts as leader.

    Without a leader-election service (standalone mode) the instance is
    always considered the leader.
    """
    election = self._leader_election
    if not election:
        return True
    return election.is_leader
def get_leader_status(self) -> dict:
    """Return the leader-election status.

    Delegates to the leader-election service when one is configured;
    otherwise reports a fixed standalone status.
    """
    election = self._leader_election
    if not election:
        return {"is_leader": True, "mode": "standalone"}
    return election.get_status()
async def _run_scheduler_task(self):
"""Run scheduled inventory alert checks with leader election"""
# Try to acquire leader lock for this scheduler
lock_name = f"inventory_scheduler:{self.database_manager.database_url if hasattr(self.database_manager, 'database_url') else 'default'}"
lock_id = abs(hash(lock_name)) % (2**31) # Generate a unique integer ID for the lock
acquired = False
"""Run scheduled inventory alert checks"""
start_time = datetime.now()
logger.info("Running scheduled inventory alert checks")
try:
# Try to acquire PostgreSQL advisory lock for leader election
async with self.database_manager.get_session() as session:
result = await session.execute(text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id})
acquired = True # If no exception, lock was acquired
# Run all alert checks
alerts_generated = await self.check_all_conditions()
start_time = datetime.now()
logger.info("Running scheduled inventory alert checks (as leader)")
# Run all alert checks
alerts_generated = await self.check_all_conditions()
duration = (datetime.now() - start_time).total_seconds()
logger.info(
"Completed scheduled inventory alert checks",
alerts_generated=alerts_generated,
duration_seconds=round(duration, 2)
)
duration = (datetime.now() - start_time).total_seconds()
logger.info(
"Completed scheduled inventory alert checks",
alerts_generated=alerts_generated,
duration_seconds=round(duration, 2)
)
except Exception as e:
# If it's a lock acquisition error, log and skip execution (another instance is running)
error_str = str(e).lower()
if "lock" in error_str or "timeout" in error_str or "could not acquire" in error_str:
logger.debug(
"Skipping inventory scheduler execution (not leader)",
lock_name=lock_name
)
return # Not an error, just not the leader
else:
logger.error(
"Error in inventory scheduler task",
error=str(e),
exc_info=True
)
finally:
if acquired:
# Release the lock
try:
async with self.database_manager.get_session() as session:
await session.execute(text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id})
await session.commit()
except Exception as unlock_error:
logger.warning(
"Error releasing leader lock (may have been automatically released)",
error=str(unlock_error)
)
logger.error(
"Error in inventory scheduler task",
error=str(e),
exc_info=True
)
async def check_all_conditions(self) -> int:
"""

View File

@@ -1,11 +1,11 @@
# Notification Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Orchestrator Service Dockerfile
# Stage 1: Copy shared libraries
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Stage 2: Main service
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Orders Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Pos Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -20,28 +20,12 @@ from shared.service_base import StandardFastAPIService
class POSService(StandardFastAPIService):
"""POS Integration Service with standardized setup"""
expected_migration_version = "00001"
async def on_startup(self, app):
"""Custom startup logic including migration verification"""
await self.verify_migrations()
await super().on_startup(app)
async def verify_migrations(self):
"""Verify database schema matches the latest migrations."""
try:
async with self.database_manager.get_session() as session:
result = await session.execute(text("SELECT version_num FROM alembic_version"))
version = result.scalar()
if version != self.expected_migration_version:
self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
self.logger.info(f"Migration verification successful: {version}")
except Exception as e:
self.logger.error(f"Migration verification failed: {e}")
raise
expected_migration_version = "e9976ec9fe9e"
def __init__(self):
# Initialize scheduler reference
self.pos_scheduler = None
# Define expected database tables for health checks
pos_expected_tables = [
'pos_configurations', 'pos_transactions', 'pos_transaction_items',
@@ -87,15 +71,42 @@ class POSService(StandardFastAPIService):
custom_metrics=pos_custom_metrics
)
async def verify_migrations(self):
"""Verify database schema matches the latest migrations."""
try:
async with self.database_manager.get_session() as session:
result = await session.execute(text("SELECT version_num FROM alembic_version"))
version = result.scalar()
if version != self.expected_migration_version:
self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
self.logger.info(f"Migration verification successful: {version}")
except Exception as e:
self.logger.error(f"Migration verification failed: {e}")
raise
async def on_startup(self, app: FastAPI):
"""Custom startup logic for POS service"""
# Start background scheduler for POS-to-Sales sync
# Verify migrations first
await self.verify_migrations()
# Call parent startup
await super().on_startup(app)
# Start background scheduler for POS-to-Sales sync with leader election
try:
from app.scheduler import start_scheduler
start_scheduler()
self.logger.info("Background scheduler started successfully")
from app.scheduler import POSScheduler
self.pos_scheduler = POSScheduler(
redis_url=settings.REDIS_URL, # Pass Redis URL for leader election
sync_interval_minutes=settings.SYNC_INTERVAL_SECONDS // 60 if settings.SYNC_INTERVAL_SECONDS >= 60 else 5
)
await self.pos_scheduler.start()
self.logger.info("POS scheduler started successfully with leader election")
# Store scheduler in app state for status checks
app.state.pos_scheduler = self.pos_scheduler
except Exception as e:
self.logger.error(f"Failed to start background scheduler: {e}", exc_info=True)
self.logger.error(f"Failed to start POS scheduler: {e}", exc_info=True)
# Don't fail startup if scheduler fails
# Custom startup completed
@@ -103,13 +114,13 @@ class POSService(StandardFastAPIService):
async def on_shutdown(self, app: FastAPI):
"""Custom shutdown logic for POS service"""
# Shutdown background scheduler
# Shutdown POS scheduler
try:
from app.scheduler import shutdown_scheduler
shutdown_scheduler()
self.logger.info("Background scheduler stopped successfully")
if self.pos_scheduler:
await self.pos_scheduler.stop()
self.logger.info("POS scheduler stopped successfully")
except Exception as e:
self.logger.error(f"Failed to stop background scheduler: {e}", exc_info=True)
self.logger.error(f"Failed to stop POS scheduler: {e}", exc_info=True)
# Database cleanup is handled by the base class
pass

View File

@@ -5,17 +5,19 @@ Sets up periodic background jobs for:
- Syncing POS transactions to sales service
- Other maintenance tasks as needed
To enable scheduling, add to main.py startup:
Uses Redis-based leader election to ensure only one pod runs scheduled tasks
when running with multiple replicas.
Usage in main.py:
```python
from app.scheduler import start_scheduler, shutdown_scheduler
from app.scheduler import POSScheduler
@app.on_event("startup")
async def startup_event():
start_scheduler()
# On startup
scheduler = POSScheduler(redis_url=settings.REDIS_URL)
await scheduler.start()
@app.on_event("shutdown")
async def shutdown_event():
shutdown_scheduler()
# On shutdown
await scheduler.stop()
```
"""
@@ -23,65 +25,307 @@ import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger
from datetime import datetime
from typing import Optional
logger = structlog.get_logger()
# Global scheduler instance
scheduler = None
class POSScheduler:
    """
    POS Scheduler service that manages background sync jobs.

    Uses Redis-based leader election to ensure only one pod runs
    scheduled jobs in a multi-replica deployment. Without a Redis URL, or
    when leader election cannot be started, the scheduler runs in
    standalone mode and always considers itself the leader.
    """

    def __init__(self, redis_url: Optional[str] = None, sync_interval_minutes: int = 5):
        """
        Initialize POS scheduler.

        Args:
            redis_url: Redis connection URL for leader election
            sync_interval_minutes: Interval for POS-to-sales sync job
        """
        self.scheduler = None
        self.sync_interval_minutes = sync_interval_minutes

        # Leader election
        self._redis_url = redis_url
        self._leader_election = None
        self._redis_client = None
        self._scheduler_started = False

    async def start(self):
        """Start the POS scheduler with leader election"""
        if self._redis_url:
            await self._start_with_leader_election()
        else:
            # Fallback to standalone mode (for local development or single-pod deployments)
            logger.warning("Redis URL not provided, starting POS scheduler in standalone mode")
            await self._start_standalone()

    async def _start_with_leader_election(self):
        """Start with Redis-based leader election for horizontal scaling.

        On any failure the partially initialised election state is discarded
        before falling back to standalone mode, so ``is_leader`` and
        ``trigger_job_now`` behave as in a regular standalone start.
        """
        import redis.asyncio as redis
        from shared.leader_election import LeaderElectionService

        try:
            # Create Redis connection
            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
            await self._redis_client.ping()

            # Create scheduler (but don't start it yet)
            self.scheduler = AsyncIOScheduler()

            # Create leader election
            self._leader_election = LeaderElectionService(
                self._redis_client,
                service_name="pos-scheduler"
            )

            # Start leader election with callbacks
            await self._leader_election.start(
                on_become_leader=self._on_become_leader,
                on_lose_leader=self._on_lose_leader
            )

            logger.info("POS scheduler started with leader election",
                        is_leader=self._leader_election.is_leader,
                        instance_id=self._leader_election.instance_id)
        except Exception as e:
            logger.error("Failed to start with leader election, falling back to standalone",
                         error=str(e))
            await self._discard_leader_election()
            await self._start_standalone()

    async def _discard_leader_election(self):
        """Best-effort teardown of leader-election state before a standalone fallback."""
        election, client = self._leader_election, self._redis_client
        # Clear references first so status helpers report standalone mode
        # even if the teardown below fails.
        self._leader_election = None
        self._redis_client = None

        if election is not None:
            try:
                await election.stop()
            except Exception as stop_error:
                logger.warning("Failed to stop leader election during fallback",
                               error=str(stop_error))

        # Leadership may already have been won (and jobs started) before the
        # failure; stop that scheduler so the standalone one does not duplicate it.
        await self._stop_scheduler()

        if client is not None:
            try:
                await client.close()
            except Exception as close_error:
                logger.warning("Failed to close Redis client during fallback",
                               error=str(close_error))

    async def _on_become_leader(self):
        """Called when this instance becomes the leader"""
        logger.info("POS scheduler became leader, starting scheduled jobs")
        await self._start_scheduler()

    async def _on_lose_leader(self):
        """Called when this instance loses leadership"""
        logger.warning("POS scheduler lost leadership, stopping scheduled jobs")
        await self._stop_scheduler()

    def _register_jobs(self):
        """Register the POS jobs on ``self.scheduler`` (shared by both start modes)."""
        # Import sync job lazily, only when jobs are actually registered
        from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync

        # Job 1: Sync POS transactions to sales service
        self.scheduler.add_job(
            run_pos_to_sales_sync,
            trigger=IntervalTrigger(minutes=self.sync_interval_minutes),
            id='pos_to_sales_sync',
            name='Sync POS Transactions to Sales',
            replace_existing=True,
            max_instances=1,  # Prevent concurrent runs
            coalesce=True,  # Combine multiple missed runs into one
            misfire_grace_time=60  # Allow 60 seconds grace for missed runs
        )

    async def _start_scheduler(self):
        """Start the APScheduler with POS jobs"""
        if self._scheduler_started:
            logger.warning("POS scheduler already started")
            return

        try:
            self._register_jobs()

            # Start scheduler
            if not self.scheduler.running:
                self.scheduler.start()
            self._scheduler_started = True

            jobs = self.scheduler.get_jobs()
            logger.info("POS scheduler jobs started",
                        sync_interval_minutes=self.sync_interval_minutes,
                        job_count=len(jobs),
                        next_run=jobs[0].next_run_time if jobs else None)
        except Exception as e:
            logger.error("Failed to start POS scheduler", error=str(e))

    async def _stop_scheduler(self):
        """Stop the APScheduler"""
        if not self._scheduler_started:
            return

        try:
            if self.scheduler and self.scheduler.running:
                self.scheduler.shutdown(wait=False)
            self._scheduler_started = False
            logger.info("POS scheduler jobs stopped")
        except Exception as e:
            logger.error("Failed to stop POS scheduler", error=str(e))

    async def _start_standalone(self):
        """Start scheduler without leader election (fallback mode)"""
        logger.warning("Starting POS scheduler in standalone mode (no leader election)")
        self.scheduler = AsyncIOScheduler()

        try:
            self._register_jobs()

            if not self.scheduler.running:
                self.scheduler.start()
            self._scheduler_started = True

            jobs = self.scheduler.get_jobs()
            logger.info("POS scheduler started (standalone mode)",
                        sync_interval_minutes=self.sync_interval_minutes,
                        next_run=jobs[0].next_run_time if jobs else None)
        except Exception as e:
            logger.error("Failed to start POS scheduler in standalone mode", error=str(e))

    async def stop(self):
        """Stop the POS scheduler and leader election.

        Each teardown step runs even if an earlier one raises, so a failing
        leader-election stop cannot leave the scheduler running or the Redis
        connection open. Exceptions still propagate to the caller.
        """
        try:
            # Stop leader election
            if self._leader_election:
                await self._leader_election.stop()
        finally:
            try:
                # Stop scheduler
                await self._stop_scheduler()
            finally:
                # Close Redis
                if self._redis_client:
                    await self._redis_client.close()

        logger.info("POS scheduler stopped")

    @property
    def is_leader(self) -> bool:
        """Check if this instance is the leader"""
        return self._leader_election.is_leader if self._leader_election else True

    def get_leader_status(self) -> dict:
        """Get leader election status"""
        if self._leader_election:
            return self._leader_election.get_status()
        return {"is_leader": True, "mode": "standalone"}

    def get_scheduler_status(self) -> dict:
        """
        Get current scheduler status

        Returns:
            Dict with scheduler info and job statuses
        """
        if self.scheduler is None or not self._scheduler_started:
            return {
                "running": False,
                "is_leader": self.is_leader,
                "jobs": []
            }

        jobs = [
            {
                "id": job.id,
                "name": job.name,
                "next_run": job.next_run_time.isoformat() if job.next_run_time else None,
                "trigger": str(job.trigger)
            }
            for job in self.scheduler.get_jobs()
        ]

        return {
            "running": True,
            "is_leader": self.is_leader,
            "jobs": jobs,
            "state": self.scheduler.state
        }

    def trigger_job_now(self, job_id: str) -> bool:
        """
        Manually trigger a scheduled job immediately

        Args:
            job_id: Job identifier (e.g., 'pos_to_sales_sync')

        Returns:
            True if job was triggered, False otherwise
        """
        if self.scheduler is None or not self._scheduler_started:
            logger.error("Cannot trigger job, scheduler not running")
            return False

        if not self.is_leader:
            logger.warning("Cannot trigger job, this instance is not the leader")
            return False

        try:
            job = self.scheduler.get_job(job_id)
            if job:
                # Moving next_run_time to "now" makes the scheduler run it at once
                self.scheduler.modify_job(job_id, next_run_time=datetime.now())
                logger.info("Job triggered manually", job_id=job_id)
                return True
            else:
                logger.warning("Job not found", job_id=job_id)
                return False
        except Exception as e:
            logger.error("Failed to trigger job", job_id=job_id, error=str(e))
            return False
# ================================================================
# Legacy compatibility functions (deprecated - use POSScheduler class)
# ================================================================
# Global scheduler instance for backward compatibility
_scheduler_instance: Optional[POSScheduler] = None
def start_scheduler():
"""
Initialize and start the background scheduler
DEPRECATED: Use POSScheduler class directly for better leader election support.
Jobs configured:
- POS to Sales Sync: Every 5 minutes
Initialize and start the background scheduler (legacy function).
"""
global scheduler
global _scheduler_instance
if scheduler is not None:
if _scheduler_instance is not None:
logger.warning("Scheduler already running")
return
logger.warning("Using deprecated start_scheduler function. "
"Consider migrating to POSScheduler class for leader election support.")
try:
scheduler = AsyncIOScheduler()
# Job 1: Sync POS transactions to sales service
from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
scheduler.add_job(
run_pos_to_sales_sync,
trigger=IntervalTrigger(minutes=5),
id='pos_to_sales_sync',
name='Sync POS Transactions to Sales',
replace_existing=True,
max_instances=1, # Prevent concurrent runs
coalesce=True, # Combine multiple missed runs into one
misfire_grace_time=60 # Allow 60 seconds grace for missed runs
)
scheduler.start()
logger.info("Background scheduler started",
jobs=len(scheduler.get_jobs()),
next_run=scheduler.get_jobs()[0].next_run_time if scheduler.get_jobs() else None)
_scheduler_instance = POSScheduler()
# Note: This is synchronous fallback, no leader election
import asyncio
asyncio.create_task(_scheduler_instance._start_standalone())
except Exception as e:
logger.error("Failed to start scheduler", error=str(e), exc_info=True)
scheduler = None
_scheduler_instance = None
def shutdown_scheduler():
"""Gracefully shutdown the scheduler"""
global scheduler
"""
DEPRECATED: Use POSScheduler class directly.
if scheduler is None:
Gracefully shutdown the scheduler (legacy function).
"""
global _scheduler_instance
if _scheduler_instance is None:
logger.warning("Scheduler not running")
return
try:
scheduler.shutdown(wait=True)
logger.info("Background scheduler stopped")
scheduler = None
import asyncio
asyncio.create_task(_scheduler_instance.stop())
_scheduler_instance = None
except Exception as e:
logger.error("Failed to shutdown scheduler", error=str(e), exc_info=True)
@@ -89,57 +333,25 @@ def shutdown_scheduler():
def get_scheduler_status():
"""
Get current scheduler status
DEPRECATED: Use POSScheduler class directly.
Returns:
Dict with scheduler info and job statuses
Get current scheduler status (legacy function).
"""
if scheduler is None:
if _scheduler_instance is None:
return {
"running": False,
"jobs": []
}
jobs = []
for job in scheduler.get_jobs():
jobs.append({
"id": job.id,
"name": job.name,
"next_run": job.next_run_time.isoformat() if job.next_run_time else None,
"trigger": str(job.trigger)
})
return {
"running": True,
"jobs": jobs,
"state": scheduler.state
}
return _scheduler_instance.get_scheduler_status()
def trigger_job_now(job_id: str):
"""
Manually trigger a scheduled job immediately
DEPRECATED: Use POSScheduler class directly.
Args:
job_id: Job identifier (e.g., 'pos_to_sales_sync')
Returns:
True if job was triggered, False otherwise
Manually trigger a scheduled job immediately (legacy function).
"""
if scheduler is None:
if _scheduler_instance is None:
logger.error("Cannot trigger job, scheduler not running")
return False
try:
job = scheduler.get_job(job_id)
if job:
scheduler.modify_job(job_id, next_run_time=datetime.now())
logger.info("Job triggered manually", job_id=job_id)
return True
else:
logger.warning("Job not found", job_id=job_id)
return False
except Exception as e:
logger.error("Failed to trigger job", job_id=job_id, error=str(e))
return False
return _scheduler_instance.trigger_job_now(job_id)

View File

@@ -1,11 +1,11 @@
# Procurement Service Dockerfile
# Stage 1: Copy shared libraries
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Stage 2: Main service
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -156,21 +156,14 @@ class DeliveryTrackingService:
async def _check_all_tenants(self):
"""
Check deliveries for all active tenants (with leader election).
Check deliveries for all active tenants.
Only one pod executes this - others skip if not leader.
This method is only called by the leader pod (via APScheduler).
Leader election is handled at the scheduler level, not here.
"""
# Try to acquire leader lock
if not await self._try_acquire_leader_lock():
logger.debug(
"Skipping delivery check - not leader",
instance_id=self.instance_id
)
return
logger.info("Starting delivery checks", instance_id=self.instance_id)
try:
logger.info("Starting delivery checks (as leader)", instance_id=self.instance_id)
# Get all active tenants from database
tenants = await self._get_active_tenants()
@@ -194,24 +187,8 @@ class DeliveryTrackingService:
total_alerts=total_alerts
)
finally:
await self._release_leader_lock()
async def _try_acquire_leader_lock(self) -> bool:
"""
Try to acquire leader lock for delivery tracking.
Uses Redis to ensure only one pod runs checks.
Returns True if acquired, False if another pod is leader.
"""
# This simplified version doesn't implement leader election
# In a real implementation, you'd use Redis or database locks
logger.info("Delivery tracking check running", instance_id=self.instance_id)
return True
async def _release_leader_lock(self):
"""Release leader lock"""
logger.debug("Delivery tracking check completed", instance_id=self.instance_id)
except Exception as e:
logger.error("Delivery checks failed", error=str(e), exc_info=True)
async def _get_active_tenants(self) -> List[UUID]:
"""

View File

@@ -1,11 +1,11 @@
# Production Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -2,6 +2,8 @@
Production Scheduler Service
Background task that periodically checks for production alert conditions
and triggers appropriate alerts.
Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
"""
import asyncio
@@ -21,103 +23,144 @@ from app.services.production_alert_service import ProductionAlertService
logger = structlog.get_logger()
class ProductionScheduler:
"""Production scheduler service that checks for alert conditions"""
def __init__(self, alert_service: ProductionAlertService, database_manager: Any):
class ProductionScheduler:
"""Production scheduler service that checks for alert conditions.
Uses Redis-based leader election to ensure only one pod runs the scheduler.
"""
def __init__(
    self,
    alert_service: ProductionAlertService,
    database_manager: Any,
    redis_url: str = None,
):
    """Store the collaborators and set up scheduler, election and cache state.

    Args:
        alert_service: Kept on the instance as ``alert_service``.
        database_manager: Kept on the instance as ``database_manager``.
        redis_url: Redis connection URL, kept as-is. Defaults to ``None``.
    """
    # Injected collaborators and configuration.
    self.alert_service = alert_service
    self.database_manager = database_manager
    self.redis_url = redis_url

    # APScheduler instance plus the settings of its single periodic job.
    self.scheduler = AsyncIOScheduler()
    self.check_interval = 5 * 60  # seconds (5 minutes)
    self.job_id = 'production_scheduler'

    # Leader-election state; nothing is connected or started yet.
    self._leader_election = self._redis_client = None
    self._scheduler_started = False

    # Cache of already-emitted alerts, used to avoid duplicates.
    self._emitted_alerts: set = set()
    self._alert_cache_ttl = 60 * 60  # 1 hour
    self._last_cache_clear = datetime.utcnow()
async def start(self):
"""Start the production scheduler with APScheduler"""
if self.scheduler.running:
logger.warning("Production scheduler is already running")
return
"""Start the production scheduler with leader election"""
try:
# Initialize leader election if Redis URL is provided
if self.redis_url:
await self._setup_leader_election()
else:
# No Redis, start scheduler directly (standalone mode)
logger.warning("No Redis URL provided, starting scheduler in standalone mode")
await self._start_scheduler()
except Exception as e:
logger.error("Failed to setup leader election, starting in standalone mode",
error=str(e))
await self._start_scheduler()
# Add the periodic job
trigger = IntervalTrigger(seconds=self.check_interval)
self.scheduler.add_job(
self._run_scheduler_task,
trigger=trigger,
id=self.job_id,
name="Production Alert Checks",
max_instances=1 # Prevent overlapping executions
async def _setup_leader_election(self):
"""Setup Redis-based leader election"""
from shared.leader_election import LeaderElectionService
import redis.asyncio as redis
self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
await self._redis_client.ping()
self._leader_election = LeaderElectionService(
self._redis_client,
service_name="production-scheduler"
)
# Start the scheduler
self.scheduler.start()
logger.info("Production scheduler started", interval_seconds=self.check_interval)
await self._leader_election.start(
on_become_leader=self._on_become_leader,
on_lose_leader=self._on_lose_leader
)
logger.info("Leader election initialized for production scheduler",
is_leader=self._leader_election.is_leader)
async def _on_become_leader(self):
    """Leadership-gained callback: log the transition, then start the scheduler."""
    logger.info("Became leader for production scheduler - starting scheduler")
    await self._start_scheduler()
async def _on_lose_leader(self):
    """Leadership-lost callback: log a warning, then stop the scheduler."""
    logger.warning("Lost leadership for production scheduler - stopping scheduler")
    await self._stop_scheduler()
async def _start_scheduler(self):
    """Register the periodic alert-check job and start APScheduler.

    Does nothing (beyond a debug log) when this instance has already
    started the scheduler. When the underlying scheduler is not running,
    the interval job is added and the scheduler is started. In both cases
    the instance is then marked as started.
    """
    if self._scheduler_started:
        logger.debug("Production scheduler already started")
        return

    if not self.scheduler.running:
        self.scheduler.add_job(
            self._run_scheduler_task,
            trigger=IntervalTrigger(seconds=self.check_interval),
            id=self.job_id,
            name="Production Alert Checks",
            max_instances=1,  # prevent overlapping executions
            # This method can run more than once per process (leadership
            # can be lost and regained, or an earlier start attempt may
            # have failed after queuing the job). Replace any job already
            # registered under this id instead of raising a conflict.
            replace_existing=True,
        )
        self.scheduler.start()

    self._scheduler_started = True
    logger.info("Production scheduler started", interval_seconds=self.check_interval)
async def _stop_scheduler(self):
    """Shut APScheduler down if this instance had marked it as started."""
    if self._scheduler_started:
        if self.scheduler.running:
            # Do not wait for in-flight jobs to finish.
            self.scheduler.shutdown(wait=False)
        self._scheduler_started = False
        logger.info("Production scheduler stopped")
async def stop(self):
"""Stop the production scheduler"""
if self.scheduler.running:
self.scheduler.shutdown(wait=True)
logger.info("Production scheduler stopped")
else:
logger.info("Production scheduler already stopped")
"""Stop the production scheduler and leader election"""
if self._leader_election:
await self._leader_election.stop()
await self._stop_scheduler()
if self._redis_client:
await self._redis_client.close()
@property
def is_leader(self) -> bool:
    """Report whether this instance currently acts as leader.

    Without a leader-election object the instance counts as leader;
    otherwise the election object's own ``is_leader`` value is returned.
    """
    election = self._leader_election
    if not election:
        return True
    return election.is_leader
async def _run_scheduler_task(self):
"""Run scheduled production alert checks with leader election"""
# Try to acquire leader lock for this scheduler
lock_name = f"production_scheduler:{self.database_manager.database_url if hasattr(self.database_manager, 'database_url') else 'default'}"
lock_id = abs(hash(lock_name)) % (2**31) # Generate a unique integer ID for the lock
acquired = False
"""Run scheduled production alert checks"""
start_time = datetime.now()
logger.info("Running scheduled production alert checks")
try:
# Try to acquire PostgreSQL advisory lock for leader election
async with self.database_manager.get_session() as session:
result = await session.execute(text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id})
acquired = True # If no exception, lock was acquired
# Run all alert checks
alerts_generated = await self.check_all_conditions()
start_time = datetime.now()
logger.info("Running scheduled production alert checks (as leader)")
# Run all alert checks
alerts_generated = await self.check_all_conditions()
duration = (datetime.now() - start_time).total_seconds()
logger.info(
"Completed scheduled production alert checks",
alerts_generated=alerts_generated,
duration_seconds=round(duration, 2)
)
duration = (datetime.now() - start_time).total_seconds()
logger.info(
"Completed scheduled production alert checks",
alerts_generated=alerts_generated,
duration_seconds=round(duration, 2)
)
except Exception as e:
# If it's a lock acquisition error, log and skip execution (another instance is running)
error_str = str(e).lower()
if "lock" in error_str or "timeout" in error_str or "could not acquire" in error_str:
logger.debug(
"Skipping production scheduler execution (not leader)",
lock_name=lock_name
)
return # Not an error, just not the leader
else:
logger.error(
"Error in production scheduler task",
error=str(e),
exc_info=True
)
finally:
if acquired:
# Release the lock
try:
async with self.database_manager.get_session() as session:
await session.execute(text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id})
await session.commit()
except Exception as unlock_error:
logger.warning(
"Error releasing leader lock (may have been automatically released)",
error=str(unlock_error)
)
logger.error(
"Error in production scheduler task",
error=str(e),
exc_info=True
)
async def check_all_conditions(self) -> int:
"""

View File

@@ -1,11 +1,11 @@
# Recipes Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Sales Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Suppliers Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Tenant Dockerfile
# Add this stage at the top of each service Dockerfile
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Then your main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app

View File

@@ -1,11 +1,11 @@
# Training Service Dockerfile with MinIO Support
# Multi-stage build for optimized production image
FROM python:3.11-slim AS shared
FROM localhost:5000/python_3.11-slim AS shared
WORKDIR /shared
COPY shared/ /shared/
# Main service stage
FROM python:3.11-slim
FROM localhost:5000/python_3.11-slim
WORKDIR /app