Add new infra architecture

2026-01-19 11:55:17 +01:00
parent 21d35ea92b
commit 35f164f0cd
311 changed files with 13241 additions and 3700 deletions
--- a/services/ai_insights/Dockerfile
+++ b/services/ai_insights/Dockerfile
@@ -1,11 +1,11 @@
 # AI Insights Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/alert_processor/Dockerfile
+++ b/services/alert_processor/Dockerfile
@@ -1,11 +1,11 @@
 # Alert Processor Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/auth/Dockerfile
+++ b/services/auth/Dockerfile
@@ -1,11 +1,11 @@
 # Auth Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 # Create non-root user for security
 RUN groupadd -r appgroup && useradd -r -g appgroup appuser
--- a/services/demo_session/Dockerfile
+++ b/services/demo_session/Dockerfile
@@ -1,11 +1,11 @@
 # Demo Session Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/distribution/Dockerfile
+++ b/services/distribution/Dockerfile
@@ -1,11 +1,11 @@
 # Distribution Service Dockerfile
 # Stage 1: Copy shared libraries
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Stage 2: Main service
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/distribution/app/main.py
+++ b/services/distribution/app/main.py
@@ -50,9 +50,9 @@ class DistributionService(StandardFastAPIService):

    def __init__(self):
        # Define expected database tables for health checks
+        # Must match tables created in migrations/versions/001_initial_schema.py
        distribution_expected_tables = [
-            'delivery_routes', 'shipments', 'route_assignments', 'delivery_points',
-            'vehicle_assignments', 'delivery_schedule', 'shipment_tracking', 'audit_logs'
+            'delivery_routes', 'shipments', 'delivery_schedules'
        ]

        # Define custom metrics for distribution service
--- a/services/external/Dockerfile
+++ b/services/external/Dockerfile
@@ -1,11 +1,11 @@
 # External Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/forecasting/Dockerfile
+++ b/services/forecasting/Dockerfile
@@ -1,11 +1,11 @@
 # Forecasting Service Dockerfile with MinIO Support
 # Multi-stage build for optimized production image
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/inventory/Dockerfile
+++ b/services/inventory/Dockerfile
@@ -1,11 +1,11 @@
 # Inventory Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/inventory/app/main.py
+++ b/services/inventory/app/main.py
@@ -120,8 +120,12 @@ class InventoryService(StandardFastAPIService):
            await alert_service.start()
            self.logger.info("Inventory alert service started")

-            # Initialize inventory scheduler with alert service and database manager
-            inventory_scheduler = InventoryScheduler(alert_service, self.database_manager)
+            # Initialize inventory scheduler with alert service, database manager, and Redis URL for leader election
+            inventory_scheduler = InventoryScheduler(
+                alert_service,
+                self.database_manager,
+                redis_url=settings.REDIS_URL  # Pass Redis URL for leader election in multi-replica deployments
+            )
            await inventory_scheduler.start()
            self.logger.info("Inventory scheduler started")

--- a/services/inventory/app/services/inventory_scheduler.py
+++ b/services/inventory/app/services/inventory_scheduler.py
@@ -2,6 +2,9 @@
 Inventory Scheduler Service
 Background task that periodically checks for inventory alert conditions
 and triggers appropriate alerts.
+
+Uses Redis-based leader election to ensure only one pod runs scheduled tasks
+when running with multiple replicas.
 """

 import asyncio
@@ -22,22 +25,129 @@ from app.services.inventory_alert_service import InventoryAlertService

 logger = structlog.get_logger()

-class InventoryScheduler:
-    """Inventory scheduler service that checks for alert conditions"""

-    def __init__(self, alert_service: InventoryAlertService, database_manager: Any):
+class InventoryScheduler:
+    """
+    Inventory scheduler service that checks for alert conditions.
+
+    Uses Redis-based leader election to ensure only one pod runs
+    scheduled jobs in a multi-replica deployment.
+    """
+
+    def __init__(self, alert_service: InventoryAlertService, database_manager: Any, redis_url: str = None):
        self.alert_service = alert_service
        self.database_manager = database_manager
-        self.scheduler = AsyncIOScheduler()
+        self.scheduler = None
        self.check_interval = 300  # 5 minutes
        self.job_id = 'inventory_scheduler'

+        # Leader election
+        self._redis_url = redis_url
+        self._leader_election = None
+        self._redis_client = None
+        self._scheduler_started = False
+
    async def start(self):
-        """Start the inventory scheduler with APScheduler"""
-        if self.scheduler.running:
-            logger.warning("Inventory scheduler is already running")
+        """Start the inventory scheduler with leader election"""
+        if self._redis_url:
+            await self._start_with_leader_election()
+        else:
+            # Fallback to standalone mode (for local development or single-pod deployments)
+            logger.warning("Redis URL not provided, starting inventory scheduler in standalone mode")
+            await self._start_standalone()
+
+    async def _start_with_leader_election(self):
+        """Start Redis-based leader election; on any failure tear it down and fall back to standalone mode."""
+        import redis.asyncio as redis
+        from shared.leader_election import LeaderElectionService
+
+        try:
+            # Create Redis connection and fail fast if it is unreachable
+            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
+            await self._redis_client.ping()
+
+            # Create scheduler (but don't start it yet); _on_become_leader starts it
+            self.scheduler = AsyncIOScheduler()
+
+            # Create leader election and start it with the leadership callbacks
+            self._leader_election = LeaderElectionService(self._redis_client, service_name="inventory-scheduler")
+            await self._leader_election.start(
+                on_become_leader=self._on_become_leader,
+                on_lose_leader=self._on_lose_leader
+            )
+
+            logger.info("Inventory scheduler started with leader election",
+                       is_leader=self._leader_election.is_leader,
+                       instance_id=self._leader_election.instance_id)
+
+        except Exception as e:
+            logger.error("Failed to start with leader election, falling back to standalone", error=str(e))
+            if self._leader_election:
+                try:  # Best-effort teardown so a half-started election cannot drive the standalone scheduler
+                    await self._leader_election.stop()
+                except Exception as stop_error:
+                    logger.warning("Failed to stop leader election during fallback", error=str(stop_error))
+                self._leader_election = None  # is_leader / get_leader_status now report standalone mode
+            await self._start_standalone()
+
+    async def _on_become_leader(self):
+        """Called when this instance becomes the leader"""
+        logger.info("Inventory scheduler became leader, starting scheduled jobs")
+        await self._start_scheduler()
+
+    async def _on_lose_leader(self):
+        """Called when this instance loses leadership"""
+        logger.warning("Inventory scheduler lost leadership, stopping scheduled jobs")
+        await self._stop_scheduler()
+
+    async def _start_scheduler(self):
+        """Register the periodic alert-check job and start APScheduler; no-op if already started."""
+        if self._scheduler_started:
+            logger.warning("Inventory scheduler already started")
             return

+        try:
+            # Add the periodic job; replace_existing lets a retry re-register the same job id without raising
+            self.scheduler.add_job(
+                self._run_scheduler_task,
+                trigger=IntervalTrigger(seconds=self.check_interval),
+                id=self.job_id,
+                name="Inventory Alert Checks",
+                replace_existing=True,
+                max_instances=1  # Prevent overlapping executions
+            )
+
+            # Start scheduler
+            if not self.scheduler.running:
+                self.scheduler.start()
+                self._scheduler_started = True
+                logger.info("Inventory scheduler jobs started",
+                           interval_seconds=self.check_interval,
+                           job_count=len(self.scheduler.get_jobs()))
+
+        except Exception as e:
+            logger.error("Failed to start inventory scheduler", error=str(e), exc_info=True)
+
+    async def _stop_scheduler(self):
+        """Stop the APScheduler"""
+        if not self._scheduler_started:
+            return
+
+        try:
+            if self.scheduler and self.scheduler.running:
+                self.scheduler.shutdown(wait=False)
+                self._scheduler_started = False
+                logger.info("Inventory scheduler jobs stopped")
+
+        except Exception as e:
+            logger.error("Failed to stop inventory scheduler", error=str(e))
+
+    async def _start_standalone(self):
+        """Start scheduler without leader election (fallback mode)"""
+        logger.warning("Starting inventory scheduler in standalone mode (no leader election)")
+
+        self.scheduler = AsyncIOScheduler()
+
        # Add the periodic job
        trigger = IntervalTrigger(seconds=self.check_interval)
        self.scheduler.add_job(
@@ -45,75 +155,63 @@ class InventoryScheduler:
            trigger=trigger,
            id=self.job_id,
            name="Inventory Alert Checks",
-            max_instances=1  # Prevent overlapping executions
+            max_instances=1
        )

-        # Start the scheduler
-        self.scheduler.start()
-        logger.info("Inventory scheduler started", interval_seconds=self.check_interval)
+        if not self.scheduler.running:
+            self.scheduler.start()
+            self._scheduler_started = True
+            logger.info("Inventory scheduler started (standalone mode)",
+                       interval_seconds=self.check_interval)

    async def stop(self):
-        """Stop the inventory scheduler"""
-        if self.scheduler.running:
-            self.scheduler.shutdown(wait=True)
-            logger.info("Inventory scheduler stopped")
-        else:
-            logger.info("Inventory scheduler already stopped")
+        """Stop the inventory scheduler and leader election"""
+        # Stop leader election
+        if self._leader_election:
+            await self._leader_election.stop()
+
+        # Stop scheduler
+        await self._stop_scheduler()
+
+        # Close Redis
+        if self._redis_client:
+            await self._redis_client.close()
+
+        logger.info("Inventory scheduler stopped")
+
+    @property
+    def is_leader(self) -> bool:
+        """Check if this instance is the leader"""
+        return self._leader_election.is_leader if self._leader_election else True
+
+    def get_leader_status(self) -> dict:
+        """Get leader election status"""
+        if self._leader_election:
+            return self._leader_election.get_status()
+        return {"is_leader": True, "mode": "standalone"}

    async def _run_scheduler_task(self):
-        """Run scheduled inventory alert checks with leader election"""
-        # Try to acquire leader lock for this scheduler
-        lock_name = f"inventory_scheduler:{self.database_manager.database_url if hasattr(self.database_manager, 'database_url') else 'default'}"
-        lock_id = abs(hash(lock_name)) % (2**31)  # Generate a unique integer ID for the lock
-        acquired = False
+        """Run scheduled inventory alert checks"""
+        start_time = datetime.now()
+        logger.info("Running scheduled inventory alert checks")

        try:
-            # Try to acquire PostgreSQL advisory lock for leader election
-            async with self.database_manager.get_session() as session:
-                result = await session.execute(text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id})
-                acquired = True  # If no exception, lock was acquired
+            # Run all alert checks
+            alerts_generated = await self.check_all_conditions()

-                start_time = datetime.now()
-                logger.info("Running scheduled inventory alert checks (as leader)")
-
-                # Run all alert checks
-                alerts_generated = await self.check_all_conditions()
-
-                duration = (datetime.now() - start_time).total_seconds()
-                logger.info(
-                    "Completed scheduled inventory alert checks",
-                    alerts_generated=alerts_generated,
-                    duration_seconds=round(duration, 2)
-                )
+            duration = (datetime.now() - start_time).total_seconds()
+            logger.info(
+                "Completed scheduled inventory alert checks",
+                alerts_generated=alerts_generated,
+                duration_seconds=round(duration, 2)
+            )

        except Exception as e:
-            # If it's a lock acquisition error, log and skip execution (another instance is running)
-            error_str = str(e).lower()
-            if "lock" in error_str or "timeout" in error_str or "could not acquire" in error_str:
-                logger.debug(
-                    "Skipping inventory scheduler execution (not leader)",
-                    lock_name=lock_name
-                )
-                return  # Not an error, just not the leader
-            else:
-                logger.error(
-                    "Error in inventory scheduler task",
-                    error=str(e),
-                    exc_info=True
-                )
-
-        finally:
-            if acquired:
-                # Release the lock
-                try:
-                    async with self.database_manager.get_session() as session:
-                        await session.execute(text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id})
-                        await session.commit()
-                except Exception as unlock_error:
-                    logger.warning(
-                        "Error releasing leader lock (may have been automatically released)",
-                        error=str(unlock_error)
-                    )
+            logger.error(
+                "Error in inventory scheduler task",
+                error=str(e),
+                exc_info=True
+            )

    async def check_all_conditions(self) -> int:
        """
--- a/services/notification/Dockerfile
+++ b/services/notification/Dockerfile
@@ -1,11 +1,11 @@
 # Notification Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/orchestrator/Dockerfile
+++ b/services/orchestrator/Dockerfile
@@ -1,11 +1,11 @@
 # Orchestrator Service Dockerfile
 # Stage 1: Copy shared libraries
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Stage 2: Main service
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/orders/Dockerfile
+++ b/services/orders/Dockerfile
@@ -1,11 +1,11 @@
 # Orders Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/pos/Dockerfile
+++ b/services/pos/Dockerfile
@@ -1,11 +1,11 @@
 # Pos Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/pos/app/main.py
+++ b/services/pos/app/main.py
@@ -20,28 +20,12 @@ from shared.service_base import StandardFastAPIService
 class POSService(StandardFastAPIService):
    """POS Integration Service with standardized setup"""

-    expected_migration_version = "00001"
-
-    async def on_startup(self, app):
-        """Custom startup logic including migration verification"""
-        await self.verify_migrations()
-        await super().on_startup(app)
-
-    async def verify_migrations(self):
-        """Verify database schema matches the latest migrations."""
-        try:
-            async with self.database_manager.get_session() as session:
-                result = await session.execute(text("SELECT version_num FROM alembic_version"))
-                version = result.scalar()
-                if version != self.expected_migration_version:
-                    self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
-                    raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
-                self.logger.info(f"Migration verification successful: {version}")
-        except Exception as e:
-            self.logger.error(f"Migration verification failed: {e}")
-            raise
+    expected_migration_version = "e9976ec9fe9e"

    def __init__(self):
+        # Initialize scheduler reference
+        self.pos_scheduler = None
+
        # Define expected database tables for health checks
        pos_expected_tables = [
            'pos_configurations', 'pos_transactions', 'pos_transaction_items',
@@ -87,15 +71,42 @@ class POSService(StandardFastAPIService):
            custom_metrics=pos_custom_metrics
        )

+    async def verify_migrations(self):
+        """Verify database schema matches the latest migrations."""
+        try:
+            async with self.database_manager.get_session() as session:
+                result = await session.execute(text("SELECT version_num FROM alembic_version"))
+                version = result.scalar()
+                if version != self.expected_migration_version:
+                    self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
+                    raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
+                self.logger.info(f"Migration verification successful: {version}")
+        except Exception as e:
+            self.logger.error(f"Migration verification failed: {e}")
+            raise
+
    async def on_startup(self, app: FastAPI):
        """Custom startup logic for POS service"""
-        # Start background scheduler for POS-to-Sales sync
+        # Verify migrations first
+        await self.verify_migrations()
+
+        # Call parent startup
+        await super().on_startup(app)
+
+        # Start background scheduler for POS-to-Sales sync with leader election
        try:
-            from app.scheduler import start_scheduler
-            start_scheduler()
-            self.logger.info("Background scheduler started successfully")
+            from app.scheduler import POSScheduler
+            self.pos_scheduler = POSScheduler(
+                redis_url=settings.REDIS_URL,  # Pass Redis URL for leader election
+                sync_interval_minutes=settings.SYNC_INTERVAL_SECONDS // 60 if settings.SYNC_INTERVAL_SECONDS >= 60 else 5
+            )
+            await self.pos_scheduler.start()
+            self.logger.info("POS scheduler started successfully with leader election")
+
+            # Store scheduler in app state for status checks
+            app.state.pos_scheduler = self.pos_scheduler
        except Exception as e:
-            self.logger.error(f"Failed to start background scheduler: {e}", exc_info=True)
+            self.logger.error(f"Failed to start POS scheduler: {e}", exc_info=True)
            # Don't fail startup if scheduler fails

        # Custom startup completed
@@ -103,13 +114,13 @@ class POSService(StandardFastAPIService):

    async def on_shutdown(self, app: FastAPI):
        """Custom shutdown logic for POS service"""
-        # Shutdown background scheduler
+        # Shutdown POS scheduler
        try:
-            from app.scheduler import shutdown_scheduler
-            shutdown_scheduler()
-            self.logger.info("Background scheduler stopped successfully")
+            if self.pos_scheduler:
+                await self.pos_scheduler.stop()
+                self.logger.info("POS scheduler stopped successfully")
        except Exception as e:
-            self.logger.error(f"Failed to stop background scheduler: {e}", exc_info=True)
+            self.logger.error(f"Failed to stop POS scheduler: {e}", exc_info=True)

        # Database cleanup is handled by the base class
        pass
--- a/services/pos/app/scheduler.py
+++ b/services/pos/app/scheduler.py
@@ -5,17 +5,19 @@ Sets up periodic background jobs for:
 - Syncing POS transactions to sales service
 - Other maintenance tasks as needed

-To enable scheduling, add to main.py startup:
+Uses Redis-based leader election to ensure only one pod runs scheduled tasks
+when running with multiple replicas.
+
+Usage in main.py:
 ```python
-from app.scheduler import start_scheduler, shutdown_scheduler
+from app.scheduler import POSScheduler

-@app.on_event("startup")
-async def startup_event():
-    start_scheduler()
+# On startup
+scheduler = POSScheduler(redis_url=settings.REDIS_URL)
+await scheduler.start()

-@app.on_event("shutdown")
-async def shutdown_event():
-    shutdown_scheduler()
+# On shutdown
+await scheduler.stop()
 ```
 """

@@ -23,65 +25,307 @@ import structlog
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 from datetime import datetime
+from typing import Optional

 logger = structlog.get_logger()

-# Global scheduler instance
-scheduler = None
+
+class POSScheduler:
+    """
+    POS Scheduler service that manages background sync jobs.
+
+    Uses Redis-based leader election to ensure only one pod runs
+    scheduled jobs in a multi-replica deployment.
+    """
+
+    def __init__(self, redis_url: Optional[str] = None, sync_interval_minutes: int = 5) -> None:
+        """
+        Initialize POS scheduler.
+
+        Args:
+            redis_url: Redis connection URL for leader election; when empty or None, start() uses standalone mode
+            sync_interval_minutes: Interval, in minutes, for the POS-to-sales sync job
+        """
+        self.scheduler = None  # Created in _start_with_leader_election() / _start_standalone()
+        self.sync_interval_minutes = sync_interval_minutes
+
+        # Leader election
+        self._redis_url = redis_url
+        self._leader_election = None
+        self._redis_client = None
+        self._scheduler_started = False  # Set once the scheduler has been started; cleared by _stop_scheduler()
+
+    async def start(self):
+        """Start the POS scheduler with leader election"""
+        if self._redis_url:
+            await self._start_with_leader_election()
+        else:
+            # Fallback to standalone mode (for local development or single-pod deployments)
+            logger.warning("Redis URL not provided, starting POS scheduler in standalone mode")
+            await self._start_standalone()
+
+    async def _start_with_leader_election(self):
+        """Start Redis-based leader election; on any failure tear it down and fall back to standalone mode."""
+        import redis.asyncio as redis
+        from shared.leader_election import LeaderElectionService
+
+        try:
+            # Create Redis connection and fail fast if it is unreachable
+            self._redis_client = redis.from_url(self._redis_url, decode_responses=False)
+            await self._redis_client.ping()
+
+            # Create scheduler (but don't start it yet); _on_become_leader starts it
+            self.scheduler = AsyncIOScheduler()
+
+            # Create leader election and start it with the leadership callbacks
+            self._leader_election = LeaderElectionService(self._redis_client, service_name="pos-scheduler")
+            await self._leader_election.start(
+                on_become_leader=self._on_become_leader,
+                on_lose_leader=self._on_lose_leader
+            )
+
+            logger.info("POS scheduler started with leader election",
+                       is_leader=self._leader_election.is_leader,
+                       instance_id=self._leader_election.instance_id)
+
+        except Exception as e:
+            logger.error("Failed to start with leader election, falling back to standalone", error=str(e))
+            if self._leader_election:
+                try:  # Best-effort teardown so a half-started election cannot drive the standalone scheduler
+                    await self._leader_election.stop()
+                except Exception as stop_error:
+                    logger.warning("Failed to stop leader election during fallback", error=str(stop_error))
+                self._leader_election = None  # Otherwise trigger_job_now() would refuse with "not the leader"
+            await self._start_standalone()
+
+    async def _on_become_leader(self):
+        """Called when this instance becomes the leader"""
+        logger.info("POS scheduler became leader, starting scheduled jobs")
+        await self._start_scheduler()
+
+    async def _on_lose_leader(self):
+        """Called when this instance loses leadership"""
+        logger.warning("POS scheduler lost leadership, stopping scheduled jobs")
+        await self._stop_scheduler()
+
+    async def _start_scheduler(self):
+        """Register the POS-to-sales sync job and start APScheduler; no-op if already started."""
+        if self._scheduler_started:
+            logger.warning("POS scheduler already started")
+            return
+
+        try:
+            # Import sync job (deferred until the jobs are actually scheduled)
+            from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
+
+            # Job 1: Sync POS transactions to sales service
+            self.scheduler.add_job(
+                run_pos_to_sales_sync,
+                trigger=IntervalTrigger(minutes=self.sync_interval_minutes),
+                id='pos_to_sales_sync',
+                name='Sync POS Transactions to Sales',
+                replace_existing=True,
+                max_instances=1,  # Prevent concurrent runs
+                coalesce=True,  # Combine multiple missed runs into one
+                misfire_grace_time=60  # Allow 60 seconds grace for missed runs
+            )
+
+            # Start scheduler
+            if not self.scheduler.running:
+                self.scheduler.start()
+                self._scheduler_started = True
+                jobs = self.scheduler.get_jobs()  # Query once; reused for the count and the next run time
+                logger.info("POS scheduler jobs started",
+                           sync_interval_minutes=self.sync_interval_minutes,
+                           job_count=len(jobs), next_run=jobs[0].next_run_time if jobs else None)
+
+        except Exception as e:
+            logger.error("Failed to start POS scheduler", error=str(e), exc_info=True)
+
+    async def _stop_scheduler(self):
+        """Stop the APScheduler"""
+        if not self._scheduler_started:
+            return
+
+        try:
+            if self.scheduler and self.scheduler.running:
+                self.scheduler.shutdown(wait=False)
+                self._scheduler_started = False
+                logger.info("POS scheduler jobs stopped")
+
+        except Exception as e:
+            logger.error("Failed to stop POS scheduler", error=str(e))
+
+    async def _start_standalone(self):
+        """Start scheduler without leader election (fallback mode)"""
+        logger.warning("Starting POS scheduler in standalone mode (no leader election)")
+
+        self.scheduler = AsyncIOScheduler()
+
+        try:
+            # Import sync job
+            from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
+
+            self.scheduler.add_job(
+                run_pos_to_sales_sync,
+                trigger=IntervalTrigger(minutes=self.sync_interval_minutes),
+                id='pos_to_sales_sync',
+                name='Sync POS Transactions to Sales',
+                replace_existing=True,
+                max_instances=1,
+                coalesce=True,
+                misfire_grace_time=60
+            )
+
+            if not self.scheduler.running:
+                self.scheduler.start()
+                self._scheduler_started = True
+                logger.info("POS scheduler started (standalone mode)",
+                           sync_interval_minutes=self.sync_interval_minutes,
+                           next_run=self.scheduler.get_jobs()[0].next_run_time if self.scheduler.get_jobs() else None)
+
+        except Exception as e:
+            logger.error("Failed to start POS scheduler in standalone mode", error=str(e))
+
+    async def stop(self):
+        """Stop leader election, the scheduler and the Redis client; later steps run even if an earlier one raises."""
+        try:
+            # Stop leader election
+            if self._leader_election:
+                await self._leader_election.stop()
+        finally:
+            try:
+                await self._stop_scheduler()
+            finally:
+                # Close Redis
+                if self._redis_client:
+                    await self._redis_client.close()
+        logger.info("POS scheduler stopped")
+
+    @property
+    def is_leader(self) -> bool:
+        """Check if this instance is the leader"""
+        return self._leader_election.is_leader if self._leader_election else True
+
+    def get_leader_status(self) -> dict:
+        """Get leader election status"""
+        if self._leader_election:
+            return self._leader_election.get_status()
+        return {"is_leader": True, "mode": "standalone"}
+
+    def get_scheduler_status(self) -> dict:
+        """
+        Get current scheduler status
+
+        Returns:
+            Dict with scheduler info and job statuses
+        """
+        if self.scheduler is None or not self._scheduler_started:
+            return {
+                "running": False,
+                "is_leader": self.is_leader,
+                "jobs": []
+            }
+
+        jobs = []
+        for job in self.scheduler.get_jobs():
+            jobs.append({
+                "id": job.id,
+                "name": job.name,
+                "next_run": job.next_run_time.isoformat() if job.next_run_time else None,
+                "trigger": str(job.trigger)
+            })
+
+        return {
+            "running": True,
+            "is_leader": self.is_leader,
+            "jobs": jobs,
+            "state": self.scheduler.state
+        }
+
+    def trigger_job_now(self, job_id: str) -> bool:
+        """
+        Manually trigger a scheduled job immediately
+
+        Args:
+            job_id: Job identifier (e.g., 'pos_to_sales_sync')
+
+        Returns:
+            True if job was triggered, False otherwise
+        """
+        if self.scheduler is None or not self._scheduler_started:
+            logger.error("Cannot trigger job, scheduler not running")
+            return False
+
+        if not self.is_leader:
+            logger.warning("Cannot trigger job, this instance is not the leader")
+            return False
+
+        try:
+            job = self.scheduler.get_job(job_id)
+            if job:
+                self.scheduler.modify_job(job_id, next_run_time=datetime.now())
+                logger.info("Job triggered manually", job_id=job_id)
+                return True
+            else:
+                logger.warning("Job not found", job_id=job_id)
+                return False
+
+        except Exception as e:
+            logger.error("Failed to trigger job", job_id=job_id, error=str(e))
+            return False
+
+
+# ================================================================
+# Legacy compatibility functions (deprecated - use POSScheduler class)
+# ================================================================
+
+# Global scheduler instance for backward compatibility
+_scheduler_instance: Optional[POSScheduler] = None


 def start_scheduler():
    """
-    Initialize and start the background scheduler
+    DEPRECATED: Use POSScheduler class directly for better leader election support.

-    Jobs configured:
-    - POS to Sales Sync: Every 5 minutes
+    Initialize and start the background scheduler (legacy function).
    """
-    global scheduler
+    global _scheduler_instance

-    if scheduler is not None:
+    if _scheduler_instance is not None:
        logger.warning("Scheduler already running")
        return

+    logger.warning("Using deprecated start_scheduler function. "
+                  "Consider migrating to POSScheduler class for leader election support.")
+
    try:
-        scheduler = AsyncIOScheduler()
-
-        # Job 1: Sync POS transactions to sales service
-        from app.jobs.sync_pos_to_sales import run_pos_to_sales_sync
-
-        scheduler.add_job(
-            run_pos_to_sales_sync,
-            trigger=IntervalTrigger(minutes=5),
-            id='pos_to_sales_sync',
-            name='Sync POS Transactions to Sales',
-            replace_existing=True,
-            max_instances=1,  # Prevent concurrent runs
-            coalesce=True,  # Combine multiple missed runs into one
-            misfire_grace_time=60  # Allow 60 seconds grace for missed runs
-        )
-
-        scheduler.start()
-        logger.info("Background scheduler started",
-                   jobs=len(scheduler.get_jobs()),
-                   next_run=scheduler.get_jobs()[0].next_run_time if scheduler.get_jobs() else None)
+        _scheduler_instance = POSScheduler()
+        # Note: This is synchronous fallback, no leader election
+        import asyncio
+        asyncio.create_task(_scheduler_instance._start_standalone())

    except Exception as e:
        logger.error("Failed to start scheduler", error=str(e), exc_info=True)
-        scheduler = None
+        _scheduler_instance = None


 def shutdown_scheduler():
-    """Gracefully shutdown the scheduler"""
-    global scheduler
+    """
+    DEPRECATED: Use POSScheduler class directly.

-    if scheduler is None:
+    Gracefully shutdown the scheduler (legacy function).
+    """
+    global _scheduler_instance
+
+    if _scheduler_instance is None:
        logger.warning("Scheduler not running")
        return

    try:
-        scheduler.shutdown(wait=True)
-        logger.info("Background scheduler stopped")
-        scheduler = None
+        import asyncio
+        asyncio.create_task(_scheduler_instance.stop())
+        _scheduler_instance = None

    except Exception as e:
        logger.error("Failed to shutdown scheduler", error=str(e), exc_info=True)
@@ -89,57 +333,25 @@ def shutdown_scheduler():

 def get_scheduler_status():
    """
-    Get current scheduler status
+    DEPRECATED: Use POSScheduler class directly.

-    Returns:
-        Dict with scheduler info and job statuses
+    Get current scheduler status (legacy function).
    """
-    if scheduler is None:
+    if _scheduler_instance is None:
        return {
            "running": False,
            "jobs": []
        }
-
-    jobs = []
-    for job in scheduler.get_jobs():
-        jobs.append({
-            "id": job.id,
-            "name": job.name,
-            "next_run": job.next_run_time.isoformat() if job.next_run_time else None,
-            "trigger": str(job.trigger)
-        })
-
-    return {
-        "running": True,
-        "jobs": jobs,
-        "state": scheduler.state
-    }
+    return _scheduler_instance.get_scheduler_status()


 def trigger_job_now(job_id: str):
    """
-    Manually trigger a scheduled job immediately
+    DEPRECATED: Use POSScheduler class directly.

-    Args:
-        job_id: Job identifier (e.g., 'pos_to_sales_sync')
-
-    Returns:
-        True if job was triggered, False otherwise
+    Manually trigger a scheduled job immediately (legacy function).
    """
-    if scheduler is None:
+    if _scheduler_instance is None:
        logger.error("Cannot trigger job, scheduler not running")
        return False
-
-    try:
-        job = scheduler.get_job(job_id)
-        if job:
-            scheduler.modify_job(job_id, next_run_time=datetime.now())
-            logger.info("Job triggered manually", job_id=job_id)
-            return True
-        else:
-            logger.warning("Job not found", job_id=job_id)
-            return False
-
-    except Exception as e:
-        logger.error("Failed to trigger job", job_id=job_id, error=str(e))
-        return False
+    return _scheduler_instance.trigger_job_now(job_id)
--- a/services/procurement/Dockerfile
+++ b/services/procurement/Dockerfile
@@ -1,11 +1,11 @@
 # Procurement Service Dockerfile
 # Stage 1: Copy shared libraries
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Stage 2: Main service
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/procurement/app/services/delivery_tracking_service.py
+++ b/services/procurement/app/services/delivery_tracking_service.py
@@ -156,21 +156,14 @@ class DeliveryTrackingService:

    async def _check_all_tenants(self):
        """
-        Check deliveries for all active tenants (with leader election).
+        Check deliveries for all active tenants.

-        Only one pod executes this - others skip if not leader.
+        This method is only called by the leader pod (via APScheduler).
+        Leader election is handled at the scheduler level, not here.
        """
-        # Try to acquire leader lock
-        if not await self._try_acquire_leader_lock():
-            logger.debug(
-                "Skipping delivery check - not leader",
-                instance_id=self.instance_id
-            )
-            return
+        logger.info("Starting delivery checks", instance_id=self.instance_id)

        try:
-            logger.info("Starting delivery checks (as leader)", instance_id=self.instance_id)
-
            # Get all active tenants from database
            tenants = await self._get_active_tenants()

@@ -194,24 +187,8 @@ class DeliveryTrackingService:
                total_alerts=total_alerts
            )

-        finally:
-            await self._release_leader_lock()
-
-    async def _try_acquire_leader_lock(self) -> bool:
-        """
-        Try to acquire leader lock for delivery tracking.
-
-        Uses Redis to ensure only one pod runs checks.
-        Returns True if acquired, False if another pod is leader.
-        """
-        # This simplified version doesn't implement leader election
-        # In a real implementation, you'd use Redis or database locks
-        logger.info("Delivery tracking check running", instance_id=self.instance_id)
-        return True
-
-    async def _release_leader_lock(self):
-        """Release leader lock"""
-        logger.debug("Delivery tracking check completed", instance_id=self.instance_id)
+        except Exception as e:
+            logger.error("Delivery checks failed", error=str(e), exc_info=True)

    async def _get_active_tenants(self) -> List[UUID]:
        """
--- a/services/production/Dockerfile
+++ b/services/production/Dockerfile
@@ -1,11 +1,11 @@
 # Production Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/production/app/services/production_scheduler.py
+++ b/services/production/app/services/production_scheduler.py
@@ -2,6 +2,8 @@
 Production Scheduler Service
 Background task that periodically checks for production alert conditions
 and triggers appropriate alerts.
+
+Uses shared leader election for horizontal scaling - only one pod runs the scheduler.
 """

 import asyncio
@@ -21,103 +23,144 @@ from app.services.production_alert_service import ProductionAlertService

 logger = structlog.get_logger()

-class ProductionScheduler:
-    """Production scheduler service that checks for alert conditions"""

-    def __init__(self, alert_service: ProductionAlertService, database_manager: Any):
+class ProductionScheduler:
+    """Production scheduler service that checks for alert conditions.
+
+    Uses Redis-based leader election so only one pod runs the scheduler; without redis_url it runs standalone.
+    """
+
+    def __init__(self, alert_service: ProductionAlertService, database_manager: Any, redis_url: str = None):
        self.alert_service = alert_service
        self.database_manager = database_manager
+        self.redis_url = redis_url
        self.scheduler = AsyncIOScheduler()
        self.check_interval = 300  # 5 minutes
        self.job_id = 'production_scheduler'

+        # Leader election
+        self._leader_election = None
+        self._redis_client = None
+        self._scheduler_started = False
+
        # Cache de alertas emitidas para evitar duplicados
        self._emitted_alerts: set = set()
        self._alert_cache_ttl = 3600  # 1 hora
        self._last_cache_clear = datetime.utcnow()

    async def start(self):
-        """Start the production scheduler with APScheduler"""
-        if self.scheduler.running:
-            logger.warning("Production scheduler is already running")
-            return
+        """Start with leader election if redis_url is set; otherwise, or if setup fails, start standalone"""
+        try:
+            # Initialize leader election if Redis URL is provided
+            if self.redis_url:
+                await self._setup_leader_election()
+            else:
+                # No Redis, start scheduler directly (standalone mode)
+                logger.warning("No Redis URL provided, starting scheduler in standalone mode")
+                await self._start_scheduler()
+        except Exception as e:
+            logger.error("Failed to setup leader election, starting in standalone mode",
+                        error=str(e))
+            await self._start_scheduler()

-        # Add the periodic job
-        trigger = IntervalTrigger(seconds=self.check_interval)
-        self.scheduler.add_job(
-            self._run_scheduler_task,
-            trigger=trigger,
-            id=self.job_id,
-            name="Production Alert Checks",
-            max_instances=1  # Prevent overlapping executions
+    async def _setup_leader_election(self):
+        """Setup Redis-based leader election"""
+        from shared.leader_election import LeaderElectionService
+        import redis.asyncio as redis
+
+        self._redis_client = redis.from_url(self.redis_url, decode_responses=False)
+        await self._redis_client.ping()
+
+        self._leader_election = LeaderElectionService(
+            self._redis_client,
+            service_name="production-scheduler"
        )

-        # Start the scheduler
-        self.scheduler.start()
-        logger.info("Production scheduler started", interval_seconds=self.check_interval)
+        await self._leader_election.start(
+            on_become_leader=self._on_become_leader,
+            on_lose_leader=self._on_lose_leader
+        )
+
+        logger.info("Leader election initialized for production scheduler",
+                   is_leader=self._leader_election.is_leader)
+
+    async def _on_become_leader(self):
+        """Called when this instance becomes the leader"""
+        logger.info("Became leader for production scheduler - starting scheduler")
+        await self._start_scheduler()
+
+    async def _on_lose_leader(self):
+        """Called when this instance loses leadership"""
+        logger.warning("Lost leadership for production scheduler - stopping scheduler")
+        await self._stop_scheduler()
+
+    async def _start_scheduler(self):
+        """Start the APScheduler"""
+        if self._scheduler_started:
+            logger.debug("Production scheduler already started")
+            return
+
+        if not self.scheduler.running:
+            trigger = IntervalTrigger(seconds=self.check_interval)
+            self.scheduler.add_job(
+                self._run_scheduler_task,
+                trigger=trigger,
+                id=self.job_id,
+                name="Production Alert Checks",
+                max_instances=1
+            )
+
+            self.scheduler.start()
+            self._scheduler_started = True
+            logger.info("Production scheduler started", interval_seconds=self.check_interval)
+
+    async def _stop_scheduler(self):
+        """Stop the APScheduler"""
+        if not self._scheduler_started:
+            return
+
+        if self.scheduler.running:
+            self.scheduler.shutdown(wait=False)
+            self._scheduler_started = False
+            logger.info("Production scheduler stopped")

    async def stop(self):
-        """Stop the production scheduler"""
-        if self.scheduler.running:
-            self.scheduler.shutdown(wait=True)
-            logger.info("Production scheduler stopped")
-        else:
-            logger.info("Production scheduler already stopped")
+        """Stop the production scheduler and leader election"""
+        if self._leader_election:
+            await self._leader_election.stop()
+
+        await self._stop_scheduler()
+
+        if self._redis_client:
+            await self._redis_client.close()
+
+    @property
+    def is_leader(self) -> bool:
+        """Whether this instance is the leader (always True when leader election is not set up)"""
+        return self._leader_election.is_leader if self._leader_election else True

    async def _run_scheduler_task(self):
-        """Run scheduled production alert checks with leader election"""
-        # Try to acquire leader lock for this scheduler
-        lock_name = f"production_scheduler:{self.database_manager.database_url if hasattr(self.database_manager, 'database_url') else 'default'}"
-        lock_id = abs(hash(lock_name)) % (2**31)  # Generate a unique integer ID for the lock
-        acquired = False
+        """Run scheduled production alert checks"""
+        start_time = datetime.now()
+        logger.info("Running scheduled production alert checks")

        try:
-            # Try to acquire PostgreSQL advisory lock for leader election
-            async with self.database_manager.get_session() as session:
-                result = await session.execute(text("SELECT pg_try_advisory_lock(:lock_id)"), {"lock_id": lock_id})
-                acquired = True  # If no exception, lock was acquired
+            # Run all alert checks
+            alerts_generated = await self.check_all_conditions()

-                start_time = datetime.now()
-                logger.info("Running scheduled production alert checks (as leader)")
-
-                # Run all alert checks
-                alerts_generated = await self.check_all_conditions()
-
-                duration = (datetime.now() - start_time).total_seconds()
-                logger.info(
-                    "Completed scheduled production alert checks",
-                    alerts_generated=alerts_generated,
-                    duration_seconds=round(duration, 2)
-                )
+            duration = (datetime.now() - start_time).total_seconds()
+            logger.info(
+                "Completed scheduled production alert checks",
+                alerts_generated=alerts_generated,
+                duration_seconds=round(duration, 2)
+            )

        except Exception as e:
-            # If it's a lock acquisition error, log and skip execution (another instance is running)
-            error_str = str(e).lower()
-            if "lock" in error_str or "timeout" in error_str or "could not acquire" in error_str:
-                logger.debug(
-                    "Skipping production scheduler execution (not leader)",
-                    lock_name=lock_name
-                )
-                return  # Not an error, just not the leader
-            else:
-                logger.error(
-                    "Error in production scheduler task",
-                    error=str(e),
-                    exc_info=True
-                )
-
-        finally:
-            if acquired:
-                # Release the lock
-                try:
-                    async with self.database_manager.get_session() as session:
-                        await session.execute(text("SELECT pg_advisory_unlock(:lock_id)"), {"lock_id": lock_id})
-                        await session.commit()
-                except Exception as unlock_error:
-                    logger.warning(
-                        "Error releasing leader lock (may have been automatically released)",
-                        error=str(unlock_error)
-                    )
+            logger.error(
+                "Error in production scheduler task",
+                error=str(e),
+                exc_info=True
+            )

    async def check_all_conditions(self) -> int:
        """
--- a/services/recipes/Dockerfile
+++ b/services/recipes/Dockerfile
@@ -1,11 +1,11 @@
 # Recipes Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/sales/Dockerfile
+++ b/services/sales/Dockerfile
@@ -1,11 +1,11 @@
 # Sales Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/suppliers/Dockerfile
+++ b/services/suppliers/Dockerfile
@@ -1,11 +1,11 @@
 # Suppliers Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/tenant/Dockerfile
+++ b/services/tenant/Dockerfile
@@ -1,11 +1,11 @@
 # Tenant Dockerfile
 # Add this stage at the top of each service Dockerfile
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Then your main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app

--- a/services/training/Dockerfile
+++ b/services/training/Dockerfile
@@ -1,11 +1,11 @@
 # Training Service Dockerfile with MinIO Support
 # Multi-stage build for optimized production image
-FROM python:3.11-slim AS shared
+FROM localhost:5000/python_3.11-slim AS shared
 WORKDIR /shared
 COPY shared/ /shared/

 # Main service stage
-FROM python:3.11-slim
+FROM localhost:5000/python_3.11-slim

 WORKDIR /app