# ================================================================
# services/training/app/main.py
# ================================================================
"""
Training Service Main Application

ML training service for bakery demand forecasting.
"""
import asyncio

import uvicorn
from fastapi import FastAPI, Request
from sqlalchemy import text

from app.core.config import settings
from app.core.database import initialize_training_database, cleanup_training_database, database_manager
from app.api import training_jobs, training_operations, models, health, monitoring, websocket_operations, audit
from app.services.training_events import setup_messaging, cleanup_messaging
from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers
from shared.service_base import StandardFastAPIService
from shared.monitoring.system_metrics import SystemMetricsCollector


class TrainingService(StandardFastAPIService):
"""Training Service with standardized setup"""
def __init__(self):
# Define expected database tables for health checks
training_expected_tables = [
'model_training_logs', 'trained_models', 'model_performance_metrics',
'training_job_queue', 'model_artifacts'
]
super().__init__(
service_name="training-service",
app_name="Bakery Training Service",
description="ML training service for bakery demand forecasting",
version="1.0.0",
log_level=settings.LOG_LEVEL,
cors_origins=settings.CORS_ORIGINS_LIST,
api_prefix="",
2025-09-29 13:13:12 +02:00
database_manager=database_manager,
expected_tables=training_expected_tables,
enable_messaging=True
)

    async def _setup_messaging(self):
        """Setup messaging for training service"""
        await setup_messaging()
        self.logger.info("Messaging setup completed")

        # Initialize Redis pub/sub for cross-pod WebSocket broadcasting
        await self._setup_websocket_redis()

        # Set up WebSocket event consumer (listens to RabbitMQ and broadcasts to WebSockets)
        success = await setup_websocket_event_consumer()
        if success:
            self.logger.info("WebSocket event consumer setup completed")
        else:
            self.logger.warning("WebSocket event consumer setup failed")
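
        # Event flow: training workers publish progress events to RabbitMQ;
        # the consumer set up above forwards them to the WebSocket layer, which
        # (via the Redis pub/sub from _setup_websocket_redis) fans them out to
        # clients connected to any pod.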

    async def _setup_websocket_redis(self):
        """
        Initialize Redis pub/sub for WebSocket cross-pod broadcasting.

        CRITICAL FOR HORIZONTAL SCALING:
        Without this, WebSocket clients on Pod A won't receive events
        from training jobs running on Pod B.
        """
        try:
            from app.websocket.manager import websocket_manager

            redis_url = settings.REDIS_URL
            success = await websocket_manager.initialize_redis(redis_url)
            if success:
                self.logger.info("WebSocket Redis pub/sub initialized for horizontal scaling")
            else:
                self.logger.warning(
                    "WebSocket Redis pub/sub failed to initialize. "
                    "WebSocket events will only be delivered to local connections."
                )
        except Exception as e:
            self.logger.error("Failed to setup WebSocket Redis pub/sub",
                              error=str(e))
            # Don't fail startup - WebSockets will work locally without Redis
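
    # For reference, the cross-pod fan-out is expected to look roughly like
    # this (a minimal sketch using redis-py's asyncio client; the channel name
    # and broadcast helper are hypothetical - the real API lives in
    # app.websocket.manager):
    #
    #   import redis.asyncio as redis
    #
    #   r = redis.from_url(settings.REDIS_URL)
    #   pubsub = r.pubsub()
    #   await pubsub.subscribe("training:events")          # hypothetical channel
    #   async for message in pubsub.listen():
    #       if message["type"] == "message":
    #           await broadcast_to_local_sockets(message["data"])  # hypothetical helper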

    async def _cleanup_messaging(self):
        """Cleanup messaging for training service"""
        # Shutdown WebSocket Redis pub/sub
        try:
            from app.websocket.manager import websocket_manager
            await websocket_manager.shutdown()
            self.logger.info("WebSocket Redis pub/sub shutdown completed")
        except Exception as e:
            self.logger.warning("Error shutting down WebSocket Redis", error=str(e))

        await cleanup_websocket_consumers()
        await cleanup_messaging()

    async def verify_migrations(self):
        """Verify database schema matches the latest migrations dynamically."""
        try:
            async with self.database_manager.get_session() as session:
                result = await session.execute(text("SELECT version_num FROM alembic_version"))
                version = result.scalar()

                if not version:
                    self.logger.error("No migration version found in database")
                    raise RuntimeError("Database not initialized - no alembic version found")

                self.logger.info(f"Migration verification successful: {version}")
                return version
        except Exception as e:
            self.logger.error(f"Migration verification failed: {e}")
            raise
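
    # Note: alembic_version is the bookkeeping table Alembic itself maintains;
    # version_num holds the current head revision id (typically a single row).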

    async def on_startup(self, app: FastAPI):
        """Custom startup logic including migration verification"""
        await self.verify_migrations()

        # Initialize system metrics collection (keep a reference so the
        # collector stays alive for the lifetime of the service)
        self.system_metrics = SystemMetricsCollector("training")
        self.logger.info("System metrics collection started")

        # Recover stale jobs from previous pod crashes.
        # This is important for horizontal scaling - jobs may be left in a
        # 'running' state if a pod crashes. We mark them as failed so they
        # can be retried.
        await self._recover_stale_jobs()

        self.logger.info("Training service startup completed")

    async def _recover_stale_jobs(self):
        """
        Recover stale training jobs on startup.

        When a pod crashes mid-training, jobs are left in 'running' or 'pending'
        state. This method finds jobs that haven't been updated in a while and
        marks them as failed so users can retry them.
        """
        try:
            from app.repositories.training_log_repository import TrainingLogRepository

            async with self.database_manager.get_session() as session:
                log_repo = TrainingLogRepository(session)

                # Recover jobs that haven't been updated in 60 minutes.
                # This is conservative - most training jobs complete within 30 minutes.
                recovered = await log_repo.recover_stale_jobs(stale_threshold_minutes=60)

                if recovered:
                    self.logger.warning(
                        "Recovered stale training jobs on startup",
                        recovered_count=len(recovered),
                        job_ids=[j.job_id for j in recovered]
                    )
                else:
                    self.logger.info("No stale training jobs to recover")
        except Exception as e:
            # Don't fail startup if recovery fails - just log the error
            self.logger.error("Failed to recover stale jobs on startup", error=str(e))
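
    # The repository call above is expected to boil down to an update along
    # these lines (a sketch only - column and status names are assumptions
    # about TrainingLogRepository.recover_stale_jobs, not the actual schema):
    #
    #   UPDATE model_training_logs
    #      SET status = 'failed',
    #          error_message = 'Recovered: pod restarted mid-training'
    #    WHERE status IN ('pending', 'running')
    #      AND updated_at < NOW() - INTERVAL '60 minutes'
    #   RETURNING job_id;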

    async def on_shutdown(self, app: FastAPI):
        """Custom shutdown logic for training service"""
        await cleanup_training_database()
        self.logger.info("Training database cleanup completed")

    def get_service_features(self):
        """Return training-specific features"""
        return [
            "ml_model_training",
            "demand_forecasting",
            "model_performance_tracking",
            "training_job_queue",
            "model_artifacts_management",
            "websocket_support",
            "messaging_integration"
        ]

    def setup_custom_middleware(self):
        """Setup custom middleware for training service"""

        # Request middleware for logging and metrics
        @self.app.middleware("http")
        async def process_request(request: Request, call_next):
            """Process requests with logging and metrics"""
            start_time = asyncio.get_event_loop().time()
            try:
                response = await call_next(request)
                duration = asyncio.get_event_loop().time() - start_time
                self.logger.info(
                    "Request completed",
                    method=request.method,
                    path=request.url.path,
                    status_code=response.status_code,
                    duration_ms=round(duration * 1000, 2)
                )
                return response
            except Exception as e:
                duration = asyncio.get_event_loop().time() - start_time
                self.logger.error(
                    "Request failed",
                    method=request.method,
                    path=request.url.path,
                    error=str(e),
                    duration_ms=round(duration * 1000, 2)
                )
                raise
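
    # Example of the structured log this middleware emits (exact shape depends
    # on the shared logger configuration):
    #   {"event": "Request completed", "method": "GET", "path": "/health",
    #    "status_code": 200, "duration_ms": 12.34}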

    def setup_custom_endpoints(self):
        """Setup custom endpoints for training service"""
        # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz.
        # The /metrics endpoint is not needed, as metrics are pushed automatically.
        # @self.app.get("/metrics")
        # async def get_metrics():
        #     """Prometheus metrics endpoint"""
        #     if self.metrics_collector:
        #         return self.metrics_collector.get_metrics()
        #     return {"status": "metrics not available"}

        @self.app.get("/")
        async def root():
            return {"service": "training-service", "version": "1.0.0"}


# Create service instance
service = TrainingService()

# Create FastAPI app with standardized setup
app = service.create_app(
    docs_url="/docs",
    redoc_url="/redoc"
)

# Setup standard endpoints
service.setup_standard_endpoints()

# Setup custom middleware
service.setup_custom_middleware()

# Setup custom endpoints
service.setup_custom_endpoints()

# Include API routers
# IMPORTANT: Register audit router FIRST to avoid route matching conflicts
service.add_router(audit.router)
service.add_router(training_jobs.router, tags=["training-jobs"])
service.add_router(training_operations.router, tags=["training-operations"])
service.add_router(models.router, tags=["models"])
service.add_router(health.router, tags=["health"])
service.add_router(monitoring.router, tags=["monitoring"])
service.add_router(websocket_operations.router, tags=["websocket"])

if __name__ == "__main__":
    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=settings.PORT,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower()
    )
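
# For local development the service can also be launched with the uvicorn CLI,
# e.g. (port shown is illustrative; deployments use settings.PORT as above):
#
#   uvicorn app.main:app --reload --port 8000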