2025-07-19 16:59:37 +02:00
|
|
|
# services/training/app/main.py
|
2025-07-17 13:09:24 +02:00
|
|
|
"""
|
2025-07-19 16:59:37 +02:00
|
|
|
Training Service Main Application
|
|
|
|
|
Enhanced with proper error handling, monitoring, and lifecycle management
|
2025-07-17 13:09:24 +02:00
|
|
|
"""
|
|
|
|
|
|
2025-07-18 14:41:39 +02:00
|
|
|
import structlog
|
2025-07-19 16:59:37 +02:00
|
|
|
import asyncio
|
|
|
|
|
from contextlib import asynccontextmanager
|
|
|
|
|
from fastapi import FastAPI, Request
|
2025-07-17 13:09:24 +02:00
|
|
|
from fastapi.middleware.cors import CORSMiddleware
|
2025-07-19 16:59:37 +02:00
|
|
|
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
|
|
|
|
from fastapi.responses import JSONResponse
|
|
|
|
|
import uvicorn
|
2025-07-17 13:09:24 +02:00
|
|
|
|
|
|
|
|
from app.core.config import settings
|
2025-07-19 16:59:37 +02:00
|
|
|
from app.core.database import database_manager, get_db_health
|
2025-07-17 13:09:24 +02:00
|
|
|
from app.api import training, models
|
2025-07-18 14:41:39 +02:00
|
|
|
from app.services.messaging import setup_messaging, cleanup_messaging
|
2025-07-17 13:09:24 +02:00
|
|
|
from shared.monitoring.logging import setup_logging
|
|
|
|
|
from shared.monitoring.metrics import MetricsCollector
|
2025-07-19 16:59:37 +02:00
|
|
|
from shared.auth.decorators import require_auth
|
2025-07-17 13:09:24 +02:00
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Structured logging is configured first so the logger created right after
# it picks up the service-wide settings.
_SERVICE_NAME = "training-service"
setup_logging(_SERVICE_NAME, settings.LOG_LEVEL)
logger = structlog.get_logger()

# Single metrics collector instance for this module.
metrics_collector = MetricsCollector(_SERVICE_NAME)
|
|
|
|
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager for startup and shutdown events.

    Startup creates the database tables, sets up messaging, starts the
    metrics server on port 8080 and then sets ``app.state.ready`` to True.
    Shutdown clears the ready flag, cleans up messaging and closes the
    database connections.

    Args:
        app: The FastAPI application whose ``state.ready`` flag is managed.

    Raises:
        Exception: Any error raised during startup is logged and re-raised
            after ``app.state.ready`` has been set to False.
    """
    # Startup
    logger.info("Starting Training Service", version="1.0.0")

    try:
        # Initialize database
        logger.info("Initializing database connection")
        await database_manager.create_tables()
        logger.info("Database initialized successfully")

        # Initialize messaging
        logger.info("Setting up messaging")
        await setup_messaging()
        logger.info("Messaging setup completed")

        # Start metrics server
        logger.info("Starting metrics server")
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")

        # Mark service as ready
        app.state.ready = True
        logger.info("Training Service startup completed successfully")
    except Exception as e:
        logger.error("Failed to start Training Service", error=str(e))
        app.state.ready = False
        raise

    # The yield is deliberately outside the startup ``try``: an exception
    # raised into this context manager while the application is running must
    # not be reported as a startup failure, and the ``finally`` guarantees
    # that the shutdown steps below run in that case as well.
    try:
        yield
    finally:
        # Shutdown
        app.state.ready = False
        logger.info("Shutting down Training Service")

        # Each cleanup step is guarded on its own so that a failure while
        # cleaning up messaging cannot prevent the database connections
        # from being closed.
        try:
            logger.info("Cleaning up messaging")
            await cleanup_messaging()
        except Exception as e:
            logger.error("Error during shutdown", step="messaging", error=str(e))

        try:
            logger.info("Closing database connections")
            await database_manager.close_connections()
        except Exception as e:
            logger.error("Error during shutdown", step="database", error=str(e))

        logger.info("Training Service shutdown completed")
|
|
|
|
|
|
|
|
|
|
# Build the FastAPI application. The interactive documentation routes are
# only exposed when DEBUG is enabled.
_docs_url, _redoc_url = ("/docs", "/redoc") if settings.DEBUG else (None, None)

app = FastAPI(
    title="Training Service",
    description="ML model training service for bakery demand forecasting",
    version="1.0.0",
    docs_url=_docs_url,
    redoc_url=_redoc_url,
    lifespan=lifespan,
)

# The service reports itself as not ready until the lifespan startup
# sequence flips this flag.
app.state.ready = False
|
|
|
|
|
|
|
|
|
|
# Security middleware: outside DEBUG mode only the listed Host headers
# are accepted.
if not settings.DEBUG:
    _trusted_hosts = [
        "localhost",
        "127.0.0.1",
        "training-service",
        "*.bakery-forecast.local",
    ]
    app.add_middleware(TrustedHostMiddleware, allowed_hosts=_trusted_hosts)

# CORS middleware: every origin is allowed in DEBUG mode, otherwise only the
# explicit allow-list below.
if settings.DEBUG:
    _cors_origins = ["*"]
else:
    _cors_origins = [
        "http://localhost:3000",
        "http://localhost:8000",
        "https://dashboard.bakery-forecast.es",
    ]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
)
|
|
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Request logging middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """
    Log every incoming request together with its outcome and timing.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that forwards the request down the stack and
            returns the response.

    Returns:
        The response produced by ``call_next``, unchanged.

    Raises:
        Exception: Anything raised while handling the request is logged,
            counted in ``http_requests_failed_total`` and re-raised.
    """
    clock = asyncio.get_event_loop()
    started_at = clock.time()

    http_method = request.method
    url_path = request.url.path
    peer = request.client

    logger.info(
        "Request started",
        method=http_method,
        path=url_path,
        client_ip=peer.host if peer else "unknown",
    )

    try:
        response = await call_next(request)
        elapsed = clock.time() - started_at
        status = response.status_code

        logger.info(
            "Request completed",
            method=http_method,
            path=url_path,
            status_code=status,
            duration_ms=round(elapsed * 1000, 2),
        )

        # The metric keeps the raw duration; only the log line is
        # converted to milliseconds.
        metrics_collector.record_request(
            method=http_method,
            endpoint=url_path,
            status_code=status,
            duration=elapsed,
        )

        return response

    except Exception as exc:
        elapsed = clock.time() - started_at

        logger.error(
            "Request failed",
            method=http_method,
            path=url_path,
            error=str(exc),
            duration_ms=round(elapsed * 1000, 2),
        )

        metrics_collector.increment_counter("http_requests_failed_total")
        raise
|
2025-07-17 13:09:24 +02:00
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Exception handlers
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """
    Global exception handler for unhandled errors.

    Logs the exception with a freshly generated ``error_id``, increments the
    ``unhandled_exceptions_total`` counter and returns a generic 500 response
    carrying the same ``error_id`` so a client report can be matched to the
    log entry.

    Args:
        request: The request that was being processed when the error occurred.
        exc: The unhandled exception.

    Returns:
        A ``JSONResponse`` with status 500 and the keys ``detail`` and
        ``error_id``.
    """
    # Imported locally so this handler does not depend on changes to the
    # module-level import block.
    import uuid

    # The identifier must be a real value: the return value of a logging
    # call is not an id and would serialise as null in the response.
    error_id = str(uuid.uuid4())

    logger.error(
        "Unhandled exception",
        path=request.url.path,
        method=request.method,
        error=str(exc),
        error_id=error_id,
        exc_info=True,
    )

    metrics_collector.increment_counter("unhandled_exceptions_total")

    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error_id": error_id,
        },
    )
|
2025-07-17 13:09:24 +02:00
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Include API routers. Outside DEBUG mode every route of both routers gets
# the auth dependency attached.
# NOTE(review): FastAPI documents `dependencies` as a sequence of
# `Depends(...)` objects - confirm that `require_auth` is usable as-is here.
for _router, _prefix, _tag in (
    (training.router, "/training", "training"),
    (models.router, "/models", "models"),
):
    app.include_router(
        _router,
        prefix=_prefix,
        tags=[_tag],
        dependencies=[] if settings.DEBUG else [require_auth],
    )
|
|
|
|
|
|
|
|
|
|
# Health check endpoints
@app.get("/health")
async def health_check():
    """
    Basic health check endpoint.

    Returns:
        A dict with ``status`` ("healthy" once ``app.state.ready`` is set,
        otherwise "starting"), the service name, the version and a UTC
        ISO-8601 ``timestamp`` of when the check was served.
    """
    # Imported locally so this endpoint does not depend on changes to the
    # module-level import block.
    from datetime import datetime, timezone

    return {
        "status": "healthy" if app.state.ready else "starting",
        "service": "training-service",
        "version": "1.0.0",
        # A real timestamp: the return value of a logging call is not a
        # time value and would serialise as null.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
|
|
|
|
|
|
|
|
|
|
@app.get("/health/ready")
async def readiness_check():
    """
    Kubernetes readiness probe.

    Returns:
        A plain "ready" payload once ``app.state.ready`` is set; until then
        a 503 ``JSONResponse`` with status "not_ready".
    """
    if app.state.ready:
        return {"status": "ready", "service": "training-service"}

    not_ready_body = {"status": "not_ready", "message": "Service is starting up"}
    return JSONResponse(status_code=503, content=not_ready_body)
|
|
|
|
|
|
|
|
|
|
@app.get("/health/live")
async def liveness_check():
    """
    Kubernetes liveness probe.

    Checks database connectivity through ``get_db_health``.

    Returns:
        An "alive" payload when the database check passes; otherwise a 503
        ``JSONResponse`` whose ``reason`` is "database_unavailable" (check
        returned a falsy value) or "database_error" (check raised).
    """
    failure_reason = None

    try:
        if not await get_db_health():
            failure_reason = "database_unavailable"
    except Exception as exc:
        logger.error("Database health check failed", error=str(exc))
        failure_reason = "database_error"

    if failure_reason is not None:
        return JSONResponse(
            status_code=503,
            content={"status": "unhealthy", "reason": failure_reason},
        )

    return {"status": "alive", "service": "training-service"}
|
|
|
|
|
|
|
|
|
|
@app.get("/metrics")
async def get_metrics():
    """
    Expose service metrics.

    Returns:
        A dict with the active-jobs gauge, the completed/failed/trained
        counters and the uptime gauge, each read from the metrics collector
        with 0 passed as the second argument.
    """
    read_gauge = metrics_collector.get_gauge
    read_counter = metrics_collector.get_counter

    snapshot = {"training_jobs_active": read_gauge("training_jobs_active", 0)}
    for counter_name in (
        "training_jobs_completed",
        "training_jobs_failed",
        "models_trained_total",
    ):
        snapshot[counter_name] = read_counter(counter_name, 0)
    snapshot["uptime_seconds"] = read_gauge("uptime_seconds", 0)

    return snapshot
|
|
|
|
|
|
|
|
|
|
@app.get("/")
async def root():
    """
    Root endpoint with service information.

    Returns:
        A dict naming the service, its version and description, where the
        docs live (or a notice that they are disabled outside DEBUG mode)
        and the health endpoint path.
    """
    if settings.DEBUG:
        docs_hint = "/docs"
    else:
        docs_hint = "Documentation disabled in production"

    service_info = {
        "service": "training-service",
        "version": "1.0.0",
        "description": "ML model training service for bakery demand forecasting",
    }
    service_info["docs"] = docs_hint
    service_info["health"] = "/health"
    return service_info
|
|
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Development server configuration: only used when this module is executed
# directly. Auto-reload and the access log follow the DEBUG setting.
if __name__ == "__main__":
    _server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": settings.DEBUG,
        "log_level": settings.LOG_LEVEL.lower(),
        "access_log": settings.DEBUG,
        "server_header": False,
        "date_header": False,
    }
    uvicorn.run("app.main:app", **_server_options)
|