2025-07-19 21:16:25 +02:00
|
|
|
# ================================================================
|
|
|
|
|
# services/training/app/main.py - FIXED VERSION
|
|
|
|
|
# ================================================================
|
2025-07-17 13:09:24 +02:00
|
|
|
"""
|
2025-07-19 16:59:37 +02:00
|
|
|
Training Service Main Application
|
|
|
|
|
Enhanced with proper error handling, monitoring, and lifecycle management
|
2025-07-17 13:09:24 +02:00
|
|
|
"""
|
|
|
|
|
|
2025-07-18 14:41:39 +02:00
|
|
|
import structlog
|
2025-07-19 16:59:37 +02:00
|
|
|
import asyncio
|
|
|
|
|
from contextlib import asynccontextmanager
|
|
|
|
|
from fastapi import FastAPI, Request
|
2025-07-17 13:09:24 +02:00
|
|
|
from fastapi.middleware.cors import CORSMiddleware
|
2025-07-19 16:59:37 +02:00
|
|
|
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
|
|
|
|
from fastapi.responses import JSONResponse
|
|
|
|
|
import uvicorn
|
2025-07-17 13:09:24 +02:00
|
|
|
|
|
|
|
|
from app.core.config import settings
|
2025-07-19 16:59:37 +02:00
|
|
|
from app.core.database import database_manager, get_db_health
|
2025-07-17 13:09:24 +02:00
|
|
|
from app.api import training, models
|
2025-07-18 14:41:39 +02:00
|
|
|
from app.services.messaging import setup_messaging, cleanup_messaging
|
2025-07-17 13:09:24 +02:00
|
|
|
from shared.monitoring.logging import setup_logging
|
|
|
|
|
from shared.monitoring.metrics import MetricsCollector
|
2025-07-19 21:16:25 +02:00
|
|
|
# NOTE: shared.auth.decorators.require_auth is intentionally not imported here;
# authentication is handled by the API Gateway, not by this service.
|
2025-07-17 13:09:24 +02:00
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Configure structured logging first, then create the module-level logger.
setup_logging("training-service", settings.LOG_LEVEL)
logger = structlog.get_logger()

# Module-level metrics collector shared by the lifespan hook, the request
# middleware and the exception handler defined in this module.
metrics_collector = MetricsCollector("training-service")
|
|
|
|
|
|
|
|
|
|
async def _run_shutdown_step(step, cleanup, done_message=None):
    """Await one shutdown callable, logging (not raising) any failure.

    Args:
        step: Short label for the step, attached to the error log entry.
        cleanup: Zero-argument callable whose result is awaited.
        done_message: Optional info message logged when the step succeeds.
    """
    try:
        await cleanup()
    except Exception as e:
        # Best-effort: a failing step must not prevent the remaining ones.
        logger.error("Error during shutdown", step=step, error=str(e))
    else:
        if done_message:
            logger.info(done_message)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager for startup and shutdown events.

    Startup creates the database tables, sets up messaging, starts the
    metrics server on port 8080, stores the collector on ``app.state`` and
    sets ``app.state.ready`` to True. If any startup step raises, the error
    is logged, ``app.state.ready`` is set to False and the exception is
    re-raised.

    Shutdown runs in a ``finally`` so it also executes when an exception or
    cancellation is thrown in at the ``yield``. Each cleanup step is guarded
    independently so that one failing step does not skip the others.
    """
    # Startup
    logger.info("Starting Training Service", version="1.0.0")

    # Define the flag up front so readers of app.state.ready never hit a
    # missing attribute while startup is still in progress.
    app.state.ready = False

    try:
        # Initialize database
        logger.info("Initializing database connection")
        await database_manager.create_tables()
        logger.info("Database initialized successfully")

        # Initialize messaging
        logger.info("Setting up messaging")
        await setup_messaging()
        logger.info("Messaging setup completed")

        # Start metrics server
        logger.info("Starting metrics server")
        metrics_collector.start_metrics_server(8080)
        logger.info("Metrics server started on port 8080")

        # Store metrics collector in app state
        app.state.metrics_collector = metrics_collector

        # Mark service as ready
        app.state.ready = True
        logger.info("Training Service startup completed successfully")
    except Exception as e:
        logger.error("Failed to start Training Service", error=str(e))
        app.state.ready = False
        raise

    # The yield is deliberately outside the startup try/except: errors raised
    # while the application is running are not startup failures.
    try:
        yield
    finally:
        # Shutdown
        app.state.ready = False
        logger.info("Shutting down Training Service")

        # Stop metrics server
        if hasattr(app.state, 'metrics_collector'):
            await _run_shutdown_step(
                "metrics",
                app.state.metrics_collector.shutdown,
            )

        # Cleanup messaging
        await _run_shutdown_step(
            "messaging",
            cleanup_messaging,
            "Messaging cleanup completed",
        )

        # Close database connections
        await _run_shutdown_step(
            "database",
            database_manager.close_connections,
            "Database connections closed",
        )

        logger.info("Training Service shutdown completed")
|
2025-07-19 16:59:37 +02:00
|
|
|
|
2025-07-19 21:16:25 +02:00
|
|
|
# Build the FastAPI application; startup and shutdown are delegated to
# the ``lifespan`` context manager.
app = FastAPI(
    lifespan=lifespan,
    title="Bakery Training Service",
    version="1.0.0",
    description="ML training service for bakery demand forecasting",
    docs_url="/docs",
    redoc_url="/redoc",
)
|
|
|
|
|
|
2025-07-19 21:16:25 +02:00
|
|
|
# Register middleware: CORS first, then trusted-host validation.
# Both are configured from application settings.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=settings.CORS_ORIGINS_LIST,
    allow_headers=["*"],
    allow_methods=["*"],
)
app.add_middleware(TrustedHostMiddleware, allowed_hosts=settings.ALLOWED_HOSTS)
|
|
|
|
|
|
|
|
|
|
# HTTP middleware: per-request logging and metrics
@app.middleware("http")
async def process_request(request: Request, call_next):
    """Time each request, then log the outcome and record metrics.

    On success the response is returned after a "Request completed" log
    entry and a ``record_request`` metric. If anything in that path raises,
    a "Request failed" entry is logged, the failure counter is incremented
    and the exception is re-raised unchanged.
    """
    loop = asyncio.get_event_loop()
    started_at = loop.time()

    try:
        response = await call_next(request)
        elapsed = loop.time() - started_at
        elapsed_ms = round(elapsed * 1000, 2)

        logger.info(
            "Request completed",
            method=request.method,
            path=request.url.path,
            status_code=response.status_code,
            duration_ms=elapsed_ms,
        )

        # Metrics receive the raw duration in seconds, not the rounded ms.
        metrics_collector.record_request(
            method=request.method,
            endpoint=request.url.path,
            status_code=response.status_code,
            duration=elapsed,
        )

        return response

    except Exception as exc:
        elapsed = loop.time() - started_at
        elapsed_ms = round(elapsed * 1000, 2)

        logger.error(
            "Request failed",
            method=request.method,
            path=request.url.path,
            error=str(exc),
            duration_ms=elapsed_ms,
        )

        metrics_collector.increment_counter("http_requests_failed_total")
        raise
|
2025-07-17 13:09:24 +02:00
|
|
|
|
2025-07-19 16:59:37 +02:00
|
|
|
# Exception handlers
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Global exception handler for unhandled errors.

    Logs the exception with a freshly generated ``error_id``, increments the
    ``unhandled_exceptions_total`` counter and returns a generic HTTP 500
    JSON body carrying the same ``error_id``, so a client report can be
    matched to the log entry. The exception message itself is not returned
    to the client.
    """
    # Local import keeps this fix self-contained within the handler.
    import uuid

    # Previously the response used the return value of a logger call as the
    # id, which is not an identifier; generate a real correlation id instead.
    error_id = str(uuid.uuid4())

    logger.error(
        "Unhandled exception",
        error_id=error_id,
        path=request.url.path,
        method=request.method,
        error=str(exc),
        exc_info=True,
    )

    metrics_collector.increment_counter("unhandled_exceptions_total")

    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error_id": error_id,
        },
    )
|
2025-07-17 13:09:24 +02:00
|
|
|
|
2025-07-19 21:16:25 +02:00
|
|
|
# Mount the API routers. No auth dependencies are attached here:
# authentication is handled by the API Gateway.
app.include_router(training.router, prefix="/training", tags=["training"])
app.include_router(models.router, prefix="/models", tags=["models"])
|
|
|
|
|
|
|
|
|
|
# Health check endpoints
|
2025-07-17 13:09:24 +02:00
|
|
|
@app.get("/health")
async def health_check():
    """Basic health check endpoint.

    Returns a JSON object with the service name, version, a UTC ISO-8601
    timestamp and a ``status`` of ``"healthy"`` when ``app.state.ready`` is
    truthy, otherwise ``"starting"``.
    """
    # Local import keeps this fix self-contained within the endpoint.
    from datetime import datetime, timezone

    # getattr with a default: the flag may not exist if startup never ran.
    is_ready = getattr(app.state, "ready", False)

    return {
        "status": "healthy" if is_ready else "starting",
        "service": "training-service",
        "version": "1.0.0",
        # Previously this was the return value of a logger call, not a time.
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
|
|
|
|
|
|
|
|
|
|
@app.get("/health/ready")
async def readiness_check():
    """Readiness probe: report database health and the app ready flag.

    Responds with ``{"status": "ready", ...}`` when every check is truthy,
    otherwise with HTTP 503 and ``{"status": "not ready", ...}``. The
    individual check results are included under ``"checks"`` in both cases.
    """
    database_ok = await get_db_health()
    application_ok = getattr(app.state, 'ready', False)
    checks = {
        "database": database_ok,
        "application": application_ok,
    }

    # Guard clause: any falsy check makes the service not ready.
    if not all(checks.values()):
        return JSONResponse(
            status_code=503,
            content={"status": "not ready", "checks": checks},
        )

    return {"status": "ready", "checks": checks}
|
|
|
|
|
|
|
|
|
|
@app.get("/metrics")
async def get_metrics():
    """Metrics endpoint backed by the collector stored on ``app.state``.

    Returns whatever the collector's ``get_metrics()`` produces, or a small
    status object when no collector has been attached to the application.
    """
    if not hasattr(app.state, 'metrics_collector'):
        return {"status": "metrics not available"}

    collector = app.state.metrics_collector
    return collector.get_metrics()
|
2025-07-17 13:09:24 +02:00
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn using values from settings.
    run_options = {
        "host": "0.0.0.0",
        "port": settings.PORT,
        "reload": settings.DEBUG,
        "log_level": settings.LOG_LEVEL.lower(),
    }
    uvicorn.run("app.main:app", **run_options)
|