Add all the code for training service
This commit is contained in:
@@ -1,81 +1,282 @@
|
||||
# services/training/app/main.py
|
||||
"""
|
||||
Training Service Main Application

Handles ML model training for bakery demand forecasting.
Enhanced with proper error handling, monitoring, and lifecycle management.
|
||||
"""
|
||||
|
||||
import structlog
|
||||
from fastapi import FastAPI, BackgroundTasks
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
import uvicorn
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.database import database_manager
|
||||
from app.core.database import database_manager, get_db_health
|
||||
from app.api import training, models
|
||||
from app.services.messaging import setup_messaging, cleanup_messaging
|
||||
from shared.monitoring.logging import setup_logging
|
||||
from shared.monitoring.metrics import MetricsCollector
|
||||
from shared.auth.decorators import require_auth
|
||||
|
||||
# Setup structured logging
setup_logging("training-service", settings.LOG_LEVEL)
logger = structlog.get_logger()

# Initialize metrics collector.
# The FastAPI application itself is created further down, after the lifespan
# manager it is wired to has been defined; an earlier throwaway instance that
# was immediately rebound has been dropped.
metrics_collector = MetricsCollector("training-service")
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager for startup and shutdown events.

    Startup creates the database tables, sets up messaging, starts the
    metrics server on port 8080 and finally sets ``app.state.ready`` to True.
    Shutdown cleans up messaging and closes the database connections.

    Args:
        app: The FastAPI application whose ``state.ready`` flag is maintained.

    Raises:
        Exception: Any error raised during startup is logged, ``ready`` is set
            to False, and the error is re-raised.
    """
    # Startup
    logger.info("Starting Training Service", version="1.0.0")

    try:
        # Initialize database
        logger.info("Initializing database connection")
        await database_manager.create_tables()
        logger.info("Database initialized successfully")

        # Initialize messaging
        logger.info("Setting up messaging")
        await setup_messaging()
        logger.info("Messaging setup completed")

        # Start metrics server
        logger.info("Starting metrics server")
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")

        # Mark service as ready
        app.state.ready = True
        logger.info("Training Service startup completed successfully")
    except Exception as e:
        logger.error("Failed to start Training Service", error=str(e))
        app.state.ready = False
        raise

    # The yield is deliberately outside the startup try-block so that an
    # exception surfacing here is not reported as a startup failure, and the
    # finally-clause guarantees the shutdown steps always run.
    try:
        yield
    finally:
        # Shutdown
        logger.info("Shutting down Training Service")
        shutdown_clean = True

        # Each cleanup step is guarded separately so a failure in one does
        # not prevent the other from running.
        try:
            logger.info("Cleaning up messaging")
            await cleanup_messaging()
        except Exception as e:
            shutdown_clean = False
            logger.error("Error during shutdown", error=str(e))

        try:
            logger.info("Closing database connections")
            await database_manager.close_connections()
        except Exception as e:
            shutdown_clean = False
            logger.error("Error during shutdown", error=str(e))

        if shutdown_clean:
            logger.info("Training Service shutdown completed")
|
||||
|
||||
# Create FastAPI app with lifespan.
# Interactive docs are only exposed when DEBUG is enabled.
app = FastAPI(
    title="Training Service",
    description="ML model training service for bakery demand forecasting",
    version="1.0.0",
    docs_url="/docs" if settings.DEBUG else None,
    redoc_url="/redoc" if settings.DEBUG else None,
    lifespan=lifespan,
)

# Initialize app state; the lifespan manager flips this once startup succeeds.
app.state.ready = False

# Security middleware (host header validation is skipped in DEBUG)
if not settings.DEBUG:
    app.add_middleware(
        TrustedHostMiddleware,
        allowed_hosts=["localhost", "127.0.0.1", "training-service", "*.bakery-forecast.local"],
    )

# CORS middleware.
# Each keyword is passed exactly once: repeating `allow_origins` /
# `allow_methods` in the same call is a SyntaxError.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"] if settings.DEBUG else [
        "http://localhost:3000",
        "http://localhost:8000",
        "https://dashboard.bakery-forecast.es",
    ],
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
)

# The API routers are registered once, further down in this module, together
# with their auth dependency. Registering them here as well (without the
# dependency) would expose the same paths unauthenticated.
|
||||
# Request logging middleware
|
||||
@app.middleware("http")
async def log_requests(request: Request, call_next):
    """
    HTTP middleware that logs every request and records its timing.

    Emits a "Request started" log entry, forwards the request to the next
    handler, then logs "Request completed" and records request metrics.
    If anything in that path raises, the failure is logged, a failure
    counter is incremented, and the exception is re-raised.
    """
    # Event-loop clock used for all duration measurements in this request.
    clock = asyncio.get_event_loop().time
    started_at = clock()

    http_method = request.method
    url_path = request.url.path
    caller = request.client

    logger.info(
        "Request started",
        method=http_method,
        path=url_path,
        client_ip=caller.host if caller else "unknown",
    )

    try:
        response = await call_next(request)
        elapsed = clock() - started_at
        status = response.status_code

        logger.info(
            "Request completed",
            method=http_method,
            path=url_path,
            status_code=status,
            duration_ms=round(elapsed * 1000, 2),
        )

        metrics_collector.record_request(
            method=http_method,
            endpoint=url_path,
            status_code=status,
            duration=elapsed,
        )

        return response

    except Exception as e:
        elapsed = clock() - started_at

        logger.error(
            "Request failed",
            method=http_method,
            path=url_path,
            error=str(e),
            duration_ms=round(elapsed * 1000, 2),
        )

        metrics_collector.increment_counter("http_requests_failed_total")
        raise
|
||||
|
||||
@app.on_event("startup")
async def startup_event():
    """
    Startup hook: logs that the service is starting.

    NOTE(review): the application is constructed with ``lifespan=``; confirm
    whether this ``on_event`` hook is still invoked alongside it.
    """
    logger.info("Starting Training Service")
|
||||
# Exception handlers
|
||||
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """
    Global exception handler for unhandled errors.

    Logs the failure with a freshly generated ``error_id``, increments the
    ``unhandled_exceptions_total`` counter, and returns a generic 500
    response carrying the same ``error_id`` so a client report can be matched
    to the log entry.

    Args:
        request: The request being processed when the exception escaped.
        exc: The unhandled exception.

    Returns:
        A ``JSONResponse`` with status 500; no exception details are exposed.
    """
    # Local import keeps this block self-contained.
    import uuid

    # A real correlation id: the return value of a logging call is not an
    # identifier and is not guaranteed to be JSON-serialisable.
    error_id = str(uuid.uuid4())

    logger.error(
        "Unhandled exception",
        path=request.url.path,
        method=request.method,
        error=str(exc),
        error_id=error_id,
        exc_info=True,
    )

    metrics_collector.increment_counter("unhandled_exceptions_total")

    # Startup work (table creation, messaging setup, metrics server) does not
    # belong here; running it on every unhandled error would be wrong.
    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error_id": error_id,
        },
    )
|
||||
|
||||
@app.on_event("shutdown")
async def shutdown_event():
    """
    Shutdown hook: cleans up the message publisher and logs around it.

    NOTE(review): the application is constructed with ``lifespan=``, whose
    shutdown path also calls ``cleanup_messaging``; confirm whether this hook
    is still needed.
    """
    logger.info("Shutting down Training Service")

    # Release the message publisher
    await cleanup_messaging()

    logger.info("Training Service shutdown complete")
|
||||
# Include API routers
|
||||
# Register the training and model routers. Outside DEBUG each router is
# given `require_auth` as a router-level dependency; in DEBUG none is set.
# NOTE(review): FastAPI documents `dependencies` as a list of `Depends(...)`
# objects - confirm that `require_auth` can be passed directly like this.
app.include_router(
    training.router,
    tags=["training"],
    prefix="/training",
    dependencies=[] if settings.DEBUG else [require_auth],
)

app.include_router(
    models.router,
    tags=["models"],
    prefix="/models",
    dependencies=[] if settings.DEBUG else [require_auth],
)
|
||||
|
||||
# Health check endpoints
|
||||
@app.get("/health")
async def health_check():
    """
    Basic health check endpoint.

    Returns:
        A dict with ``status`` ("healthy" once ``app.state.ready`` is set,
        otherwise "starting"), the service name and version, and the current
        UTC time as an ISO-8601 ``timestamp``.
    """
    # Local import keeps this block self-contained.
    from datetime import datetime, timezone

    return {
        "status": "healthy" if app.state.ready else "starting",
        "service": "training-service",
        "version": "1.0.0",
        # An actual timestamp: the return value of a logging call is not one.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
|
||||
|
||||
@app.get("/health/ready")
async def readiness_check():
    """
    Kubernetes readiness probe.

    Responds with a "ready" payload when ``app.state.ready`` is truthy and
    with HTTP 503 ("not_ready") otherwise.
    """
    if app.state.ready:
        return {"status": "ready", "service": "training-service"}

    return JSONResponse(
        status_code=503,
        content={"status": "not_ready", "message": "Service is starting up"},
    )
|
||||
|
||||
@app.get("/health/live")
async def liveness_check():
    """
    Kubernetes liveness probe.

    Calls ``get_db_health()``: a falsy result yields HTTP 503 with reason
    "database_unavailable", an exception is logged and yields HTTP 503 with
    reason "database_error", and otherwise an "alive" payload is returned.
    """
    try:
        database_ok = await get_db_health()
    except Exception as e:
        logger.error("Database health check failed", error=str(e))
        return JSONResponse(
            status_code=503,
            content={"status": "unhealthy", "reason": "database_error"},
        )

    if database_ok:
        return {"status": "alive", "service": "training-service"}

    return JSONResponse(
        status_code=503,
        content={"status": "unhealthy", "reason": "database_unavailable"},
    )
|
||||
|
||||
@app.get("/metrics")
async def get_metrics():
    """
    Expose service metrics as a JSON object.

    Reads two gauges and three counters from ``metrics_collector``, passing
    0 as the second argument to every lookup.
    """
    snapshot = {
        "training_jobs_active": metrics_collector.get_gauge("training_jobs_active", 0),
    }

    # Counters, in the order they appear in the response.
    for counter_name in (
        "training_jobs_completed",
        "training_jobs_failed",
        "models_trained_total",
    ):
        snapshot[counter_name] = metrics_collector.get_counter(counter_name, 0)

    snapshot["uptime_seconds"] = metrics_collector.get_gauge("uptime_seconds", 0)
    return snapshot
|
||||
|
||||
@app.get("/")
async def root():
    """
    Root endpoint with service information.

    The ``docs`` entry is the docs path when ``settings.DEBUG`` is set and a
    "disabled" notice otherwise.
    """
    if settings.DEBUG:
        docs_entry = "/docs"
    else:
        docs_entry = "Documentation disabled in production"

    service_info = {
        "service": "training-service",
        "version": "1.0.0",
        "description": "ML model training service for bakery demand forecasting",
        "docs": docs_entry,
        "health": "/health",
    }
    return service_info
|
||||
|
||||
# Development server configuration
|
||||
if __name__ == "__main__":
    import uvicorn

    # Start exactly one server. `uvicorn.run` blocks until the server exits,
    # so a second call after it would only bring up another server once the
    # first one had stopped.
    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=8000,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower(),
        access_log=settings.DEBUG,
        server_header=False,
        date_header=False,
    )
|
||||
Reference in New Issue
Block a user