# ================================================================
# services/training/app/main.py
# ================================================================
"""
Training Service Main Application

ML training service for bakery demand forecasting.
"""
import asyncio

import uvicorn
from fastapi import FastAPI, Request
from sqlalchemy import text

from app.core.config import settings
from app.core.database import initialize_training_database, cleanup_training_database, database_manager
from app.api import training_jobs, training_operations, models, health, monitoring, websocket_operations, audit
from app.services.training_events import setup_messaging, cleanup_messaging
from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers
from shared.service_base import StandardFastAPIService
from shared.monitoring.system_metrics import SystemMetricsCollector


class TrainingService(StandardFastAPIService):
"""Training Service with standardized setup"""
def __init__(self):
# Define expected database tables for health checks
training_expected_tables = [
'model_training_logs', 'trained_models', 'model_performance_metrics',
'training_job_queue', 'model_artifacts'
]
super().__init__(
service_name="training-service",
app_name="Bakery Training Service",
description="ML training service for bakery demand forecasting",
version="1.0.0",
log_level=settings.LOG_LEVEL,
cors_origins=settings.CORS_ORIGINS_LIST,
api_prefix="",
2025-09-29 13:13:12 +02:00
database_manager=database_manager,
expected_tables=training_expected_tables,
enable_messaging=True
)

    async def _setup_messaging(self):
        """Setup messaging for training service"""
        await setup_messaging()
        self.logger.info("Messaging setup completed")

        # Initialize Redis pub/sub for cross-pod WebSocket broadcasting
        await self._setup_websocket_redis()

        # Set up WebSocket event consumer (listens to RabbitMQ and broadcasts to WebSockets)
        success = await setup_websocket_event_consumer()
        if success:
            self.logger.info("WebSocket event consumer setup completed")
        else:
            self.logger.warning("WebSocket event consumer setup failed")
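
        # Event flow: training workers publish progress events to RabbitMQ;
        # the consumer set up above forwards them to the WebSocket layer, which
        # (via the Redis pub/sub from _setup_websocket_redis) fans them out to
        # clients connected to any pod.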

    async def _setup_websocket_redis(self):
        """
        Initialize Redis pub/sub for WebSocket cross-pod broadcasting.

        CRITICAL FOR HORIZONTAL SCALING:
        Without this, WebSocket clients on Pod A won't receive events
        from training jobs running on Pod B.
        """
        try:
            from app.websocket.manager import websocket_manager

            redis_url = settings.REDIS_URL
            success = await websocket_manager.initialize_redis(redis_url)
            if success:
                self.logger.info("WebSocket Redis pub/sub initialized for horizontal scaling")
            else:
                self.logger.warning(
                    "WebSocket Redis pub/sub failed to initialize. "
                    "WebSocket events will only be delivered to local connections."
                )
        except Exception as e:
            self.logger.error("Failed to setup WebSocket Redis pub/sub",
                              error=str(e))
            # Don't fail startup - WebSockets will work locally without Redis
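
    # For reference, the cross-pod fan-out is expected to look roughly like
    # this (a minimal sketch using redis-py's asyncio client; the channel name
    # and broadcast helper are hypothetical - the real API lives in
    # app.websocket.manager):
    #
    #   import redis.asyncio as redis
    #
    #   r = redis.from_url(settings.REDIS_URL)
    #   pubsub = r.pubsub()
    #   await pubsub.subscribe("training:events")          # hypothetical channel
    #   async for message in pubsub.listen():
    #       if message["type"] == "message":
    #           await broadcast_to_local_sockets(message["data"])  # hypothetical helper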

    async def _cleanup_messaging(self):
        """Cleanup messaging for training service"""
        # Shutdown WebSocket Redis pub/sub
        try:
            from app.websocket.manager import websocket_manager
            await websocket_manager.shutdown()
            self.logger.info("WebSocket Redis pub/sub shutdown completed")
        except Exception as e:
            self.logger.warning("Error shutting down WebSocket Redis", error=str(e))

        await cleanup_websocket_consumers()
        await cleanup_messaging()

    async def verify_migrations(self):
        """Verify database schema matches the latest migrations dynamically."""
        try:
            async with self.database_manager.get_session() as session:
                result = await session.execute(text("SELECT version_num FROM alembic_version"))
                version = result.scalar()

                if not version:
                    self.logger.error("No migration version found in database")
                    raise RuntimeError("Database not initialized - no alembic version found")

                self.logger.info(f"Migration verification successful: {version}")
                return version
        except Exception as e:
            self.logger.error(f"Migration verification failed: {e}")
            raise
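
    # Note: alembic_version is the bookkeeping table Alembic itself maintains;
    # version_num holds the current head revision id (typically a single row).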

    async def on_startup(self, app: FastAPI):
        """Custom startup logic including migration verification"""
        await self.verify_migrations()

        # Initialize system metrics collection (keep a reference so the
        # collector stays alive for the lifetime of the service)
        self.system_metrics = SystemMetricsCollector("training")
        self.logger.info("System metrics collection started")

        # Recover stale jobs from previous pod crashes.
        # This is important for horizontal scaling - jobs may be left in a
        # 'running' state if a pod crashes. We mark them as failed so they
        # can be retried.
        await self._recover_stale_jobs()

        self.logger.info("Training service startup completed")

    async def _recover_stale_jobs(self):
        """
        Recover stale training jobs on startup.

        When a pod crashes mid-training, jobs are left in 'running' or 'pending'
        state. This method finds jobs that haven't been updated in a while and
        marks them as failed so users can retry them.
        """
        try:
            from app.repositories.training_log_repository import TrainingLogRepository

            async with self.database_manager.get_session() as session:
                log_repo = TrainingLogRepository(session)

                # Recover jobs that haven't been updated in 60 minutes.
                # This is conservative - most training jobs complete within 30 minutes.
                recovered = await log_repo.recover_stale_jobs(stale_threshold_minutes=60)

                if recovered:
                    self.logger.warning(
                        "Recovered stale training jobs on startup",
                        recovered_count=len(recovered),
                        job_ids=[j.job_id for j in recovered]
                    )
                else:
                    self.logger.info("No stale training jobs to recover")
        except Exception as e:
            # Don't fail startup if recovery fails - just log the error
            self.logger.error("Failed to recover stale jobs on startup", error=str(e))
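
    # The repository call above is expected to boil down to an update along
    # these lines (a sketch only - column and status names are assumptions
    # about TrainingLogRepository.recover_stale_jobs, not the actual schema):
    #
    #   UPDATE model_training_logs
    #      SET status = 'failed',
    #          error_message = 'Recovered: pod restarted mid-training'
    #    WHERE status IN ('pending', 'running')
    #      AND updated_at < NOW() - INTERVAL '60 minutes'
    #   RETURNING job_id;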

    async def on_shutdown(self, app: FastAPI):
        """Custom shutdown logic for training service"""
        await cleanup_training_database()
        self.logger.info("Training database cleanup completed")

    def get_service_features(self):
        """Return training-specific features"""
        return [
            "ml_model_training",
            "demand_forecasting",
            "model_performance_tracking",
            "training_job_queue",
            "model_artifacts_management",
            "websocket_support",
            "messaging_integration"
        ]

    def setup_custom_middleware(self):
        """Setup custom middleware for training service"""

        # Request middleware for logging and metrics
        @self.app.middleware("http")
        async def process_request(request: Request, call_next):
            """Process requests with logging and metrics"""
            start_time = asyncio.get_event_loop().time()
            try:
                response = await call_next(request)
                duration = asyncio.get_event_loop().time() - start_time
                self.logger.info(
                    "Request completed",
                    method=request.method,
                    path=request.url.path,
                    status_code=response.status_code,
                    duration_ms=round(duration * 1000, 2)
                )
                return response
            except Exception as e:
                duration = asyncio.get_event_loop().time() - start_time
                self.logger.error(
                    "Request failed",
                    method=request.method,
                    path=request.url.path,
                    error=str(e),
                    duration_ms=round(duration * 1000, 2)
                )
                raise
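
    # Example of the structured log this middleware emits (exact shape depends
    # on the shared logger configuration):
    #   {"event": "Request completed", "method": "GET", "path": "/health",
    #    "status_code": 200, "duration_ms": 12.34}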

    def setup_custom_endpoints(self):
        """Setup custom endpoints for training service"""
        # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz.
        # The /metrics endpoint is not needed, as metrics are pushed automatically.
        # @self.app.get("/metrics")
        # async def get_metrics():
        #     """Prometheus metrics endpoint"""
        #     if self.metrics_collector:
        #         return self.metrics_collector.get_metrics()
        #     return {"status": "metrics not available"}

        @self.app.get("/")
        async def root():
            return {"service": "training-service", "version": "1.0.0"}


# Create service instance
service = TrainingService()

# Create FastAPI app with standardized setup
app = service.create_app(
    docs_url="/docs",
    redoc_url="/redoc"
)

# Setup standard endpoints
service.setup_standard_endpoints()

# Setup custom middleware
service.setup_custom_middleware()

# Setup custom endpoints
service.setup_custom_endpoints()

# Include API routers
# IMPORTANT: Register audit router FIRST to avoid route matching conflicts
service.add_router(audit.router)
service.add_router(training_jobs.router, tags=["training-jobs"])
service.add_router(training_operations.router, tags=["training-operations"])
service.add_router(models.router, tags=["models"])
service.add_router(health.router, tags=["health"])
service.add_router(monitoring.router, tags=["monitoring"])
service.add_router(websocket_operations.router, tags=["websocket"])

if __name__ == "__main__":
    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=settings.PORT,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower()
    )
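
# For local development the service can also be launched with the uvicorn CLI,
# e.g. (port shown is illustrative; deployments use settings.PORT as above):
#
#   uvicorn app.main:app --reload --port 8000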