Files
bakery-ia/services/training/app/main.py

282 lines
8.4 KiB
Python
Raw Normal View History

2025-07-19 16:59:37 +02:00
# services/training/app/main.py
"""
2025-07-19 16:59:37 +02:00
Training Service Main Application
Enhanced with proper error handling, monitoring, and lifecycle management
"""
2025-07-18 14:41:39 +02:00
import structlog
2025-07-19 16:59:37 +02:00
import asyncio
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
2025-07-19 16:59:37 +02:00
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.responses import JSONResponse
import uvicorn
from app.core.config import settings
2025-07-19 16:59:37 +02:00
from app.core.database import database_manager, get_db_health
from app.api import training, models
2025-07-18 14:41:39 +02:00
from app.services.messaging import setup_messaging, cleanup_messaging
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
2025-07-19 16:59:37 +02:00
from shared.auth.decorators import require_auth
2025-07-19 16:59:37 +02:00
# Setup structured logging
# Runs at import time with the service name "training-service" and the
# level taken from settings.LOG_LEVEL.
setup_logging("training-service", settings.LOG_LEVEL)
2025-07-18 14:41:39 +02:00
# Module-level structlog logger used by the lifespan hooks, the request
# middleware, the exception handler and the health endpoints in this file.
logger = structlog.get_logger()
2025-07-19 16:59:37 +02:00
# Initialize metrics collector
# Single collector instance for this service: its metrics server is started
# during application startup, and the request middleware, the global
# exception handler and the /metrics endpoint all record to / read from it.
metrics_collector = MetricsCollector("training-service")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager for startup and shutdown events.

    Startup creates the database tables, sets up messaging, starts the
    metrics server on port 8080 and then marks ``app.state.ready`` as True.
    Control is handed to the running application at the ``yield``; the
    ``finally`` clause after it performs shutdown.

    Args:
        app: The FastAPI application whose ``state.ready`` flag is managed.

    Raises:
        Exception: Any startup failure is logged, ``app.state.ready`` is set
            to False and the original exception is re-raised.
    """
    # Startup
    logger.info("Starting Training Service", version="1.0.0")

    try:
        # Initialize database
        logger.info("Initializing database connection")
        await database_manager.create_tables()
        logger.info("Database initialized successfully")

        # Initialize messaging
        logger.info("Setting up messaging")
        await setup_messaging()
        logger.info("Messaging setup completed")

        # Start metrics server
        logger.info("Starting metrics server")
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")

        # Mark service as ready
        app.state.ready = True
        logger.info("Training Service startup completed successfully")
    except Exception as e:
        logger.error("Failed to start Training Service", error=str(e))
        app.state.ready = False
        raise

    # The yield is deliberately outside the startup try-block: an exception
    # thrown into the generator while the app is running must not be reported
    # as a startup failure, and must not skip the cleanup below.
    try:
        yield
    finally:
        # Shutdown
        logger.info("Shutting down Training Service")

        shutdown_steps = (
            ("Cleaning up messaging", cleanup_messaging),
            ("Closing database connections", database_manager.close_connections),
        )
        shutdown_clean = True
        for message, step in shutdown_steps:
            logger.info(message)
            # Each step is isolated so that a failure in one (e.g. messaging)
            # does not leave the remaining resources (e.g. DB connections) open.
            try:
                await step()
            except Exception as e:
                shutdown_clean = False
                logger.error("Error during shutdown", error=str(e))

        if shutdown_clean:
            logger.info("Training Service shutdown completed")
# Create FastAPI app with lifespan
# Interactive documentation (/docs, /redoc) is only exposed when DEBUG is set.
app = FastAPI(
    title="Training Service",
    description="ML model training service for bakery demand forecasting",
    version="1.0.0",
    docs_url="/docs" if settings.DEBUG else None,
    redoc_url="/redoc" if settings.DEBUG else None,
    lifespan=lifespan,
)

# Initialize app state
# The lifespan handler sets this flag to True once startup has completed.
app.state.ready = False

# Security middleware
# Host-header validation is only enforced outside DEBUG mode.
if not settings.DEBUG:
    _trusted_hosts = [
        "localhost",
        "127.0.0.1",
        "training-service",
        "*.bakery-forecast.local",
    ]
    app.add_middleware(TrustedHostMiddleware, allowed_hosts=_trusted_hosts)

# CORS middleware
# NOTE(review): in DEBUG the origin list is a wildcard while credentials are
# still allowed - confirm DEBUG is never enabled in a deployed environment.
if settings.DEBUG:
    _cors_origins = ["*"]
else:
    _cors_origins = [
        "http://localhost:3000",
        "http://localhost:8000",
        "https://dashboard.bakery-forecast.es",
    ]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
)
2025-07-19 16:59:37 +02:00
# Request logging middleware
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log all incoming requests with timing.

    Emits a "Request started" entry, forwards the request to the next
    handler, then emits "Request completed" and records request metrics.
    If the downstream handler raises, "Request failed" is logged, the
    ``http_requests_failed_total`` counter is incremented and the exception
    is re-raised unchanged.

    Args:
        request: The incoming HTTP request.
        call_next: Callable that passes the request on and returns the response.

    Returns:
        The response produced by ``call_next``.
    """
    # get_running_loop() is the documented way to reach the event loop from
    # inside a coroutine; fetch its monotonic clock once and reuse it.
    clock = asyncio.get_running_loop().time
    start_time = clock()

    # Log request
    logger.info(
        "Request started",
        method=request.method,
        path=request.url.path,
        client_ip=request.client.host if request.client else "unknown"
    )

    # Process request
    try:
        response = await call_next(request)

        # Calculate duration
        duration = clock() - start_time

        # Log response
        logger.info(
            "Request completed",
            method=request.method,
            path=request.url.path,
            status_code=response.status_code,
            duration_ms=round(duration * 1000, 2)
        )

        # Update metrics
        metrics_collector.record_request(
            method=request.method,
            endpoint=request.url.path,
            status_code=response.status_code,
            duration=duration
        )

        return response

    except Exception as e:
        duration = clock() - start_time

        logger.error(
            "Request failed",
            method=request.method,
            path=request.url.path,
            error=str(e),
            duration_ms=round(duration * 1000, 2)
        )

        metrics_collector.increment_counter("http_requests_failed_total")
        raise
2025-07-19 16:59:37 +02:00
# Exception handlers
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Global exception handler for unhandled errors.

    Logs the exception together with a freshly generated error id,
    increments the ``unhandled_exceptions_total`` counter and returns a
    generic HTTP 500 response carrying the same error id, so a client report
    can be matched to the log entry.

    Args:
        request: The request whose handling raised.
        exc: The unhandled exception.

    Returns:
        A JSONResponse with status 500 and the keys ``detail`` and ``error_id``.
    """
    # Local import keeps this fix self-contained in the handler.
    import uuid

    # Previously "error_id" was the return value of a logger call, which is
    # not an identifier; generate a real correlation id instead.
    error_id = uuid.uuid4().hex

    logger.error(
        "Unhandled exception",
        path=request.url.path,
        method=request.method,
        error=str(exc),
        error_id=error_id,
        exc_info=True
    )

    metrics_collector.increment_counter("unhandled_exceptions_total")

    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error_id": error_id
        }
    )
2025-07-19 16:59:37 +02:00
# Include API routers
# Both routers are mounted the same way: authentication is attached as a
# router-level dependency unless DEBUG is set.
# NOTE(review): FastAPI documents `dependencies` as a list of Depends(...)
# objects - confirm that `require_auth` already is one.
for _router, _prefix, _tag in (
    (training.router, "/training", "training"),
    (models.router, "/models", "models"),
):
    app.include_router(
        _router,
        prefix=_prefix,
        tags=[_tag],
        # A fresh list per router, as in the two original separate calls.
        dependencies=[] if settings.DEBUG else [require_auth],
    )
# Health check endpoints
@app.get("/health")
async def health_check():
    """Basic health check endpoint.

    Returns:
        A dict with ``status`` ("healthy" once ``app.state.ready`` is set,
        otherwise "starting"), the service name, the version and the current
        UTC time as an ISO-8601 string.
    """
    # Local import keeps this fix self-contained in the endpoint.
    from datetime import datetime, timezone

    return {
        "status": "healthy" if app.state.ready else "starting",
        "service": "training-service",
        "version": "1.0.0",
        # Previously this was the return value of a logger call, which is
        # not a timestamp and wrote a log line on every probe.
        "timestamp": datetime.now(timezone.utc).isoformat()
    }
@app.get("/health/ready")
async def readiness_check():
    """Kubernetes readiness probe.

    Answers 200 once ``app.state.ready`` is set and 503 until then.
    """
    if app.state.ready:
        return {"status": "ready", "service": "training-service"}

    # Startup has not completed yet: report not-ready with HTTP 503.
    not_ready_body = {"status": "not_ready", "message": "Service is starting up"}
    return JSONResponse(status_code=503, content=not_ready_body)
@app.get("/health/live")
async def liveness_check():
    """Kubernetes liveness probe.

    Reports "alive" when the database health check succeeds. Returns HTTP 503
    with reason "database_unavailable" when the check reports unhealthy, and
    with reason "database_error" when the check itself raises.
    """
    # Check database connectivity
    try:
        database_ok = bool(await get_db_health())
    except Exception as err:
        logger.error("Database health check failed", error=str(err))
        return JSONResponse(
            status_code=503,
            content={"status": "unhealthy", "reason": "database_error"},
        )

    if database_ok:
        return {"status": "alive", "service": "training-service"}

    return JSONResponse(
        status_code=503,
        content={"status": "unhealthy", "reason": "database_unavailable"},
    )
@app.get("/metrics")
async def get_metrics():
    """Expose service metrics.

    Returns a dict keyed by metric name; each value is read from the shared
    metrics collector with 0 passed as the second argument.
    """
    read_gauge = metrics_collector.get_gauge
    read_counter = metrics_collector.get_counter

    # (metric name, reader) pairs, in the order they appear in the response.
    metric_readers = (
        ("training_jobs_active", read_gauge),
        ("training_jobs_completed", read_counter),
        ("training_jobs_failed", read_counter),
        ("models_trained_total", read_counter),
        ("uptime_seconds", read_gauge),
    )
    return {metric: read(metric, 0) for metric, read in metric_readers}
@app.get("/")
async def root():
    """Root endpoint with service information.

    The ``docs`` entry points at /docs only when DEBUG is set; otherwise it
    carries a notice that documentation is disabled.
    """
    if settings.DEBUG:
        docs_entry = "/docs"
    else:
        docs_entry = "Documentation disabled in production"

    service_info = {
        "service": "training-service",
        "version": "1.0.0",
        "description": "ML model training service for bakery demand forecasting",
        "docs": docs_entry,
        "health": "/health",
    }
    return service_info
2025-07-19 16:59:37 +02:00
# Development server configuration
# Only runs when this module is executed directly; auto-reload and access
# logging follow settings.DEBUG.
if __name__ == "__main__":
    server_options = dict(
        host="0.0.0.0",
        port=8000,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower(),
        access_log=settings.DEBUG,
        server_header=False,
        date_header=False,
    )
    uvicorn.run("app.main:app", **server_options)