# ================================================================
# services/training/app/main.py - FIXED VERSION
# ================================================================
"""
2025-07-19 16:59:37 +02:00
Training Service Main Application
Enhanced with proper error handling, monitoring, and lifecycle management
"""
2025-07-18 14:41:39 +02:00
import structlog
2025-07-19 16:59:37 +02:00
import asyncio
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
2025-07-19 16:59:37 +02:00
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from fastapi.responses import JSONResponse
import uvicorn
from app.core.config import settings
from app.core.database import initialize_training_database, cleanup_training_database, get_db_health, get_comprehensive_db_health
from app.api import training, models
2025-08-08 09:08:41 +02:00
from app.api.websocket import websocket_router
2025-07-18 14:41:39 +02:00
from app.services.messaging import setup_messaging, cleanup_messaging
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
2025-07-19 21:16:25 +02:00
# REMOVED: from shared.auth.decorators import require_auth
2025-07-19 16:59:37 +02:00
# Setup structured logging
setup_logging("training-service", settings.LOG_LEVEL)
2025-07-18 14:41:39 +02:00
logger = structlog.get_logger()
2025-07-19 16:59:37 +02:00
# Initialize metrics collector
metrics_collector = MetricsCollector("training-service")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager for startup and shutdown events
    """
    # Startup
    logger.info("Starting Training Service", version="1.0.0")
    try:
        # Initialize database
        logger.info("Initializing database connection")
        await initialize_training_database()
        logger.info("Database initialized successfully")

        # Initialize messaging
        logger.info("Setting up messaging")
        await setup_messaging()
        logger.info("Messaging setup completed")

        # Start metrics server
        logger.info("Starting metrics server")
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")

        # Store metrics collector in app state
        app.state.metrics_collector = metrics_collector

        # Mark service as ready
        app.state.ready = True
        logger.info("Training Service startup completed successfully")

        yield
    except Exception as e:
        logger.error("Failed to start Training Service", error=str(e))
        app.state.ready = False
        raise

    # Shutdown
    logger.info("Shutting down Training Service")
    try:
        # Stop metrics server
        if hasattr(app.state, 'metrics_collector'):
            await app.state.metrics_collector.shutdown()

        # Cleanup messaging
        await cleanup_messaging()
        logger.info("Messaging cleanup completed")

        # Close database connections
        await cleanup_training_database()
        logger.info("Database connections closed")
    except Exception as e:
        logger.error("Error during shutdown", error=str(e))

    logger.info("Training Service shutdown completed")

# Create FastAPI application with lifespan
app = FastAPI(
    title="Bakery Training Service",
    description="ML training service for bakery demand forecasting",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)

# Add middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.CORS_ORIGINS_LIST,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request middleware for logging and metrics
@app.middleware("http")
async def process_request(request: Request, call_next):
    """Process requests with logging and metrics"""
    start_time = asyncio.get_event_loop().time()
    try:
        response = await call_next(request)
        duration = asyncio.get_event_loop().time() - start_time

        logger.info(
            "Request completed",
            method=request.method,
            path=request.url.path,
            status_code=response.status_code,
            duration_ms=round(duration * 1000, 2)
        )

        # Update metrics
        metrics_collector.record_request(
            method=request.method,
            endpoint=request.url.path,
            status_code=response.status_code,
            duration=duration
        )
        return response
    except Exception as e:
        duration = asyncio.get_event_loop().time() - start_time
        logger.error(
            "Request failed",
            method=request.method,
            path=request.url.path,
            error=str(e),
            duration_ms=round(duration * 1000, 2)
        )
        metrics_collector.increment_counter("http_requests_failed_total")
        raise

# Exception handlers
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Global exception handler for unhandled errors"""
    # Generate an identifier so the response can be correlated with the log entry
    error_id = str(uuid.uuid4())
    logger.error(
        "Unhandled exception",
        error_id=error_id,
        path=request.url.path,
        method=request.method,
        error=str(exc),
        exc_info=True
    )
    metrics_collector.increment_counter("unhandled_exceptions_total")

    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error_id": error_id
        }
    )

# Include API routers
app.include_router(training.router, prefix="/api/v1", tags=["training"])
app.include_router(models.router, prefix="/api/v1", tags=["models"])
app.include_router(websocket_router, prefix="/api/v1/ws", tags=["websocket"])

# Health check endpoints
@app.get("/health")
async def health_check():
    """Basic health check endpoint"""
    return {
        "status": "healthy" if getattr(app.state, "ready", False) else "starting",
        "service": "training-service",
        "version": "1.0.0",
        "timestamp": datetime.now(timezone.utc).isoformat()
    }

@app.get("/health/ready")
async def readiness_check():
"""Kubernetes readiness probe endpoint with comprehensive database checks"""
try:
# Get comprehensive database health including table verification
db_health = await get_comprehensive_db_health()
checks = {
"database_connectivity": db_health["connectivity"],
"database_tables": db_health["tables_exist"],
"application": getattr(app.state, 'ready', False)
}
# Include detailed database info for debugging
database_details = {
"status": db_health["status"],
"tables_verified": db_health["tables_verified"],
"missing_tables": db_health["missing_tables"],
"errors": db_health["errors"]
}
# Service is ready only if all checks pass
all_ready = all(checks.values()) and db_health["status"] == "healthy"
if all_ready:
return {
"status": "ready",
"checks": checks,
"database": database_details
}
else:
return JSONResponse(
status_code=503,
content={
"status": "not ready",
"checks": checks,
"database": database_details
}
)
except Exception as e:
logger.error("Readiness check failed", error=str(e))
2025-07-19 16:59:37 +02:00
return JSONResponse(
status_code=503,
content={
"status": "not ready",
"error": f"Health check failed: {str(e)}"
}
)
@app.get("/health/database")
async def database_health_check():
"""Detailed database health endpoint for debugging"""
try:
db_health = await get_comprehensive_db_health()
status_code = 200 if db_health["status"] == "healthy" else 503
return JSONResponse(status_code=status_code, content=db_health)
except Exception as e:
logger.error("Database health check failed", error=str(e))
return JSONResponse(
status_code=503,
content={
"status": "unhealthy",
"error": f"Health check failed: {str(e)}"
}
2025-07-19 16:59:37 +02:00
)
@app.get("/metrics")
async def get_metrics():
2025-07-19 21:16:25 +02:00
"""Prometheus metrics endpoint"""
if hasattr(app.state, 'metrics_collector'):
return app.state.metrics_collector.get_metrics()
return {"status": "metrics not available"}
@app.get("/health/live")
async def liveness_check():
return {"status": "alive"}
@app.get("/")
async def root():
return {"service": "training-service", "version": "1.0.0"}
if __name__ == "__main__":
    uvicorn.run(
        "app.main:app",
        host="0.0.0.0",
        port=settings.PORT,
        reload=settings.DEBUG,
        log_level=settings.LOG_LEVEL.lower()
    )
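
# Example usage (illustrative sketch; assumes settings.PORT resolves to 8000):
#   uvicorn app.main:app --host 0.0.0.0 --port 8000
#   curl http://localhost:8000/health/ready   # 200 once database and messaging checks pass
#   curl http://localhost:8000/metrics        # service metrics endpoint defined above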