Files
bakery-ia/services/training/app/main.py
2025-07-19 16:59:37 +02:00

282 lines
8.4 KiB
Python

# services/training/app/main.py
"""
Training Service Main Application
Enhanced with proper error handling, monitoring, and lifecycle management
"""
import asyncio
import uuid
from contextlib import asynccontextmanager
from datetime import datetime, timezone

import structlog
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.responses import JSONResponse

from app.core.config import settings
from app.core.database import database_manager, get_db_health
from app.api import training, models
from app.services.messaging import setup_messaging, cleanup_messaging
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
from shared.auth.decorators import require_auth
# Set up structured logging for "training-service" at the level taken from
# settings.LOG_LEVEL.
setup_logging("training-service", settings.LOG_LEVEL)
# Module-level structlog logger used throughout this file.
logger = structlog.get_logger()
# Metrics collector for this service; it is used by the lifespan hook, the
# request-logging middleware, the global exception handler and /metrics.
metrics_collector = MetricsCollector("training-service")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager for startup and shutdown events.

    Startup creates the database tables, sets up messaging and starts the
    metrics server on port 8080, then sets ``app.state.ready`` to True.
    Control is handed to the application at ``yield``.  On the way out the
    ready flag is cleared, messaging is cleaned up and the database
    connections are closed.

    Args:
        app: The FastAPI application whose ``state.ready`` flag is managed.

    Raises:
        Exception: Any startup failure is logged and re-raised.  Shutdown
            errors are logged but not propagated (best effort).
    """
    # Startup
    logger.info("Starting Training Service", version="1.0.0")
    try:
        # Initialize database
        logger.info("Initializing database connection")
        await database_manager.create_tables()
        logger.info("Database initialized successfully")

        # Initialize messaging
        logger.info("Setting up messaging")
        await setup_messaging()
        logger.info("Messaging setup completed")

        # Start metrics server
        logger.info("Starting metrics server")
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")
    except Exception as e:
        logger.error("Failed to start Training Service", error=str(e))
        app.state.ready = False
        raise

    # Mark service as ready
    app.state.ready = True
    logger.info("Training Service startup completed successfully")

    try:
        # The yield lives outside the startup try-block so that an exception
        # raised into the generator while the app is serving is not reported
        # as a startup failure, and so the cleanup below always runs.
        yield
    finally:
        # Stop advertising readiness as soon as shutdown begins.
        app.state.ready = False

        # Shutdown
        logger.info("Shutting down Training Service")
        shutdown_steps = (
            ("Cleaning up messaging", cleanup_messaging),
            ("Closing database connections", database_manager.close_connections),
        )
        clean_shutdown = True
        for message, step in shutdown_steps:
            logger.info(message)
            # Each step is guarded on its own so a messaging failure does
            # not prevent the database connections from being closed.
            try:
                await step()
            except Exception as e:
                clean_shutdown = False
                logger.error("Error during shutdown", error=str(e))
        if clean_shutdown:
            logger.info("Training Service shutdown completed")
# Build the FastAPI application; interactive docs are only served in DEBUG.
_docs_url, _redoc_url = ("/docs", "/redoc") if settings.DEBUG else (None, None)

app = FastAPI(
    title="Training Service",
    description="ML model training service for bakery demand forecasting",
    version="1.0.0",
    docs_url=_docs_url,
    redoc_url=_redoc_url,
    lifespan=lifespan,
)

# The service starts out "not ready"; the lifespan hook flips this flag.
app.state.ready = False
# Host-header validation is only enforced outside DEBUG mode.
if not settings.DEBUG:
    _trusted_hosts = [
        "localhost",
        "127.0.0.1",
        "training-service",
        "*.bakery-forecast.local",
    ]
    app.add_middleware(TrustedHostMiddleware, allowed_hosts=_trusted_hosts)
# CORS: any origin in DEBUG, otherwise a fixed allow-list.
# NOTE(review): in DEBUG this pairs wildcard origins with
# allow_credentials=True -- confirm that combination is intended.
if settings.DEBUG:
    _cors_origins = ["*"]
else:
    _cors_origins = [
        "http://localhost:3000",
        "http://localhost:8000",
        "https://dashboard.bakery-forecast.es",
    ]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
)
# Request logging middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """
    Log every incoming request together with its outcome and timing.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that forwards the request down the stack and
            returns the response.

    Returns:
        The response produced by ``call_next``, unchanged.

    Raises:
        Exception: Anything raised while processing is logged, counted in
            ``http_requests_failed_total`` and re-raised.
    """
    # We are inside a coroutine, so a loop is guaranteed to be running;
    # get_running_loop() is the documented accessor for that situation.
    loop_time = asyncio.get_running_loop().time
    start_time = loop_time()

    # Log request
    logger.info(
        "Request started",
        method=request.method,
        path=request.url.path,
        client_ip=request.client.host if request.client else "unknown",
    )

    # Process request
    try:
        response = await call_next(request)

        # Calculate duration
        duration = loop_time() - start_time

        # Log response
        logger.info(
            "Request completed",
            method=request.method,
            path=request.url.path,
            status_code=response.status_code,
            duration_ms=round(duration * 1000, 2),
        )

        # Update metrics
        metrics_collector.record_request(
            method=request.method,
            endpoint=request.url.path,
            status_code=response.status_code,
            duration=duration,
        )
        return response
    except Exception as e:
        duration = loop_time() - start_time
        logger.error(
            "Request failed",
            method=request.method,
            path=request.url.path,
            error=str(e),
            duration_ms=round(duration * 1000, 2),
        )
        metrics_collector.increment_counter("http_requests_failed_total")
        raise
# Exception handlers
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """
    Global exception handler for unhandled errors.

    Logs the exception, bumps the ``unhandled_exceptions_total`` counter and
    answers with a generic 500 payload.

    Args:
        request: The request during which the exception was raised.
        exc: The unhandled exception.

    Returns:
        A ``JSONResponse`` with status 500 whose body contains a generic
        ``detail`` message and an ``error_id`` that also appears in the
        corresponding log entry.
    """
    # A fresh identifier lets a client-reported error be matched to its log
    # entry.  The previous code used the return value of a logger call here,
    # which is not an identifier and produced an extra log line.
    error_id = str(uuid.uuid4())

    logger.error(
        "Unhandled exception",
        path=request.url.path,
        method=request.method,
        error=str(exc),
        error_id=error_id,
        exc_info=True,
    )
    metrics_collector.increment_counter("unhandled_exceptions_total")

    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error_id": error_id,
        },
    )
# Register the API routers.  Authentication is attached to every route
# unless the service runs in DEBUG mode.
# NOTE(review): require_auth is passed as-is rather than wrapped in
# fastapi.Depends -- confirm it is a valid router dependency.
_router_dependencies = [] if settings.DEBUG else [require_auth]

app.include_router(
    training.router,
    prefix="/training",
    tags=["training"],
    dependencies=list(_router_dependencies),
)
app.include_router(
    models.router,
    prefix="/models",
    tags=["models"],
    dependencies=list(_router_dependencies),
)
# Health check endpoints
@app.get("/health")
async def health_check():
    """
    Basic health check endpoint.

    Returns:
        A dict with the service status ("healthy" when ``app.state.ready``
        is set, otherwise "starting"), the service name and version, and
        the current UTC time as an ISO 8601 string.
    """
    return {
        "status": "healthy" if app.state.ready else "starting",
        "service": "training-service",
        "version": "1.0.0",
        # This used to hold the return value of a logger call, which is not
        # a timestamp; report the actual current time instead.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
@app.get("/health/ready")
async def readiness_check():
    """
    Kubernetes readiness probe.

    Returns:
        A "ready" payload when ``app.state.ready`` is set, otherwise a 503
        ``JSONResponse`` saying the service is still starting up.
    """
    if app.state.ready:
        return {"status": "ready", "service": "training-service"}

    not_ready_body = {"status": "not_ready", "message": "Service is starting up"}
    return JSONResponse(status_code=503, content=not_ready_body)
@app.get("/health/live")
async def liveness_check():
    """
    Kubernetes liveness probe backed by a database connectivity check.

    Returns:
        An "alive" payload when ``get_db_health()`` reports healthy;
        otherwise a 503 ``JSONResponse`` whose ``reason`` says whether the
        database was reported unavailable or the check itself raised.
    """
    failure_reason = None
    try:
        if not await get_db_health():
            failure_reason = "database_unavailable"
    except Exception as exc:
        logger.error("Database health check failed", error=str(exc))
        failure_reason = "database_error"

    if failure_reason is None:
        return {"status": "alive", "service": "training-service"}

    return JSONResponse(
        status_code=503,
        content={"status": "unhealthy", "reason": failure_reason},
    )
@app.get("/metrics")
async def get_metrics():
    """
    Expose service metrics.

    Returns:
        A dict mapping each metric name to the value read from the metrics
        collector, with 0 passed as the second argument to each lookup.
    """
    read_gauge = metrics_collector.get_gauge
    read_counter = metrics_collector.get_counter

    # (metric name, reader) pairs, in the order they appear in the response.
    metric_sources = (
        ("training_jobs_active", read_gauge),
        ("training_jobs_completed", read_counter),
        ("training_jobs_failed", read_counter),
        ("models_trained_total", read_counter),
        ("uptime_seconds", read_gauge),
    )
    return {name: reader(name, 0) for name, reader in metric_sources}
@app.get("/")
async def root():
    """
    Root endpoint with service information.

    Returns:
        A dict with the service name, version and description, where the
        docs live (or a notice that they are disabled outside DEBUG), and
        the health endpoint path.
    """
    if settings.DEBUG:
        docs_location = "/docs"
    else:
        docs_location = "Documentation disabled in production"

    service_info = {
        "service": "training-service",
        "version": "1.0.0",
        "description": "ML model training service for bakery demand forecasting",
        "docs": docs_location,
        "health": "/health",
    }
    return service_info
# Run a development server when this module is executed directly.
if __name__ == "__main__":
    # Auto-reload and access logging both follow settings.DEBUG.
    server_options = {
        "host": "0.0.0.0",
        "port": 8000,
        "reload": settings.DEBUG,
        "log_level": settings.LOG_LEVEL.lower(),
        "access_log": settings.DEBUG,
        "server_header": False,
        "date_header": False,
    }
    uvicorn.run("app.main:app", **server_options)