Initial microservices setup from artifacts

Urtzi Alfaro
2025-07-17 13:09:24 +02:00
commit 347ff51bd7
200 changed files with 9559 additions and 0 deletions

View File

@@ -0,0 +1,33 @@
"""
Models API endpoints
"""
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List
import logging
from app.core.database import get_db
from app.core.auth import verify_token
from app.schemas.training import TrainedModelResponse
from app.services.training_service import TrainingService
logger = logging.getLogger(__name__)
router = APIRouter()
training_service = TrainingService()
@router.get("/", response_model=List[TrainedModelResponse])
async def get_trained_models(
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Get trained models"""
try:
return await training_service.get_trained_models(user_data, db)
except Exception as e:
logger.error(f"Get trained models error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to get trained models"
)
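
A minimal client-side sketch of calling this endpoint. The base URL and token are placeholder assumptions for illustration; they are not defined in this commit:

import asyncio
import httpx

async def list_models():
    # Assumed base URL; the service itself listens on port 8000
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get(
            "/models/",
            headers={"Authorization": "Bearer <token>"},  # hypothetical token
        )
        resp.raise_for_status()
        return resp.json()

print(asyncio.run(list_models()))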

View File

@@ -0,0 +1,77 @@
"""
Training API endpoints
"""
from fastapi import APIRouter, Depends, HTTPException, status, Query
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional
import logging
from app.core.database import get_db
from app.core.auth import verify_token
from app.schemas.training import TrainingRequest, TrainingJobResponse, TrainedModelResponse
from app.services.training_service import TrainingService
logger = logging.getLogger(__name__)
router = APIRouter()
training_service = TrainingService()
@router.post("/train", response_model=TrainingJobResponse)
async def start_training(
request: TrainingRequest,
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Start training job"""
try:
return await training_service.start_training(request, user_data, db)
except ValueError as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=str(e)
)
except Exception as e:
logger.error(f"Training start error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to start training"
)
@router.get("/status/{job_id}", response_model=TrainingJobResponse)
async def get_training_status(
job_id: str,
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Get training job status"""
try:
return await training_service.get_training_status(job_id, user_data, db)
except ValueError as e:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e)
)
except Exception as e:
logger.error(f"Get training status error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to get training status"
)
@router.get("/jobs", response_model=List[TrainingJobResponse])
async def get_training_jobs(
limit: int = Query(10, ge=1, le=100),
offset: int = Query(0, ge=0),
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Get training jobs"""
try:
return await training_service.get_training_jobs(user_data, limit, offset, db)
except Exception as e:
logger.error(f"Get training jobs error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to get training jobs"
)
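
A sketch of the expected request flow: start a job, then poll its status until it finishes. Host, token, and payload values are assumptions for illustration:

import asyncio
import httpx

async def run_training():
    headers = {"Authorization": "Bearer <token>"}  # hypothetical token
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        job = (await client.post(
            "/training/train",
            json={"force_retrain": False, "training_days": 365},
            headers=headers,
        )).json()
        # Poll until the job leaves the queued/running states
        while job["status"] in ("queued", "running"):
            await asyncio.sleep(5)
            job = (await client.get(f"/training/status/{job['id']}", headers=headers)).json()
        return job

print(asyncio.run(run_training()))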

View File

@@ -0,0 +1,38 @@
"""
Authentication utilities for training service
"""
import httpx
from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBearer
import logging
from app.core.config import settings
logger = logging.getLogger(__name__)
security = HTTPBearer()
async def verify_token(token: str = Depends(security)):
"""Verify token with auth service"""
try:
async with httpx.AsyncClient() as client:
response = await client.post(
f"{settings.AUTH_SERVICE_URL}/auth/verify",
headers={"Authorization": f"Bearer {token.credentials}"}
)
if response.status_code == 200:
return response.json()
else:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid authentication credentials"
)
except httpx.RequestError as e:
logger.error(f"Auth service unavailable: {e}")
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Authentication service unavailable"
)
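
The routers bind whatever JSON `/auth/verify` returns to `user_data`. The auth service is outside this diff, so the shape below is only an assumed illustration:

# Hypothetical verify_token return value; the real payload is defined by the
# auth service, which is not part of this commit
user_data = {
    "user_id": "u-123",
    "tenant_id": "t-456",
    "email": "owner@bakery.example",
}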

View File

@@ -0,0 +1,44 @@
"""
Training service configuration
"""
import os
from pydantic import BaseSettings
class Settings(BaseSettings):
"""Application settings"""
# Basic settings
APP_NAME: str = "Training Service"
VERSION: str = "1.0.0"
DEBUG: bool = os.getenv("DEBUG", "False").lower() == "true"
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
# Database settings
DATABASE_URL: str = os.getenv("DATABASE_URL", "postgresql+asyncpg://training_user:training_pass123@training-db:5432/training_db")
# Redis settings
REDIS_URL: str = os.getenv("REDIS_URL", "redis://redis:6379/1")
# RabbitMQ settings
RABBITMQ_URL: str = os.getenv("RABBITMQ_URL", "amqp://bakery:forecast123@rabbitmq:5672/")
# Service URLs
AUTH_SERVICE_URL: str = os.getenv("AUTH_SERVICE_URL", "http://auth-service:8000")
DATA_SERVICE_URL: str = os.getenv("DATA_SERVICE_URL", "http://data-service:8000")
# ML Settings
MODEL_STORAGE_PATH: str = os.getenv("MODEL_STORAGE_PATH", "/app/models")
MAX_TRAINING_TIME_MINUTES: int = int(os.getenv("MAX_TRAINING_TIME_MINUTES", "30"))
MIN_TRAINING_DATA_DAYS: int = int(os.getenv("MIN_TRAINING_DATA_DAYS", "30"))
# Prophet Settings
PROPHET_SEASONALITY_MODE: str = os.getenv("PROPHET_SEASONALITY_MODE", "additive")
PROPHET_DAILY_SEASONALITY: bool = os.getenv("PROPHET_DAILY_SEASONALITY", "true").lower() == "true"
PROPHET_WEEKLY_SEASONALITY: bool = os.getenv("PROPHET_WEEKLY_SEASONALITY", "true").lower() == "true"
PROPHET_YEARLY_SEASONALITY: bool = os.getenv("PROPHET_YEARLY_SEASONALITY", "true").lower() == "true"
class Config:
env_file = ".env"
settings = Settings()
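
Because the defaults are computed with os.getenv at class-definition time, environment overrides must be in place before the module is imported; a small sketch:

import os

# Exports must precede the import of app.core.config, since the default
# expressions above read os.getenv when the class body is evaluated
os.environ["DEBUG"] = "true"
os.environ["MODEL_STORAGE_PATH"] = "/tmp/models"

from app.core.config import settings

print(settings.DEBUG, settings.MODEL_STORAGE_PATH)  # True /tmp/models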

View File

@@ -0,0 +1,12 @@
"""
Database configuration for training service
"""
from shared.database.base import DatabaseManager
from app.core.config import settings
# Initialize database manager
database_manager = DatabaseManager(settings.DATABASE_URL)
# Alias for convenience
get_db = database_manager.get_db
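
shared.database.base is not part of this diff; the code above (and main.py's create_tables call) assumes roughly this interface. A hypothetical sketch, not the shared library's actual implementation:

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase


class Base(DeclarativeBase):
    """Stand-in for the shared declarative base (hypothetical)"""


class DatabaseManager:
    def __init__(self, database_url: str):
        self.engine = create_async_engine(database_url)
        self.session_factory = async_sessionmaker(self.engine, expire_on_commit=False)

    async def get_db(self):
        # Yield one session per request; FastAPI closes it after the response
        async with self.session_factory() as session:
            yield session

    async def create_tables(self):
        async with self.engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)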

View File

@@ -0,0 +1,81 @@
"""
Training Service
Handles ML model training for bakery demand forecasting
"""
import logging
from fastapi import FastAPI, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from app.core.config import settings
from app.core.database import database_manager
from app.api import training, models
from app.services.messaging import message_publisher
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
# Setup logging
setup_logging("training-service", settings.LOG_LEVEL)
logger = logging.getLogger(__name__)
# Create FastAPI app
app = FastAPI(
title="Training Service",
description="ML model training service for bakery demand forecasting",
version="1.0.0"
)
# Initialize metrics collector
metrics_collector = MetricsCollector("training-service")
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(training.router, prefix="/training", tags=["training"])
app.include_router(models.router, prefix="/models", tags=["models"])
@app.on_event("startup")
async def startup_event():
"""Application startup"""
logger.info("Starting Training Service")
# Create database tables
await database_manager.create_tables()
# Initialize message publisher
await message_publisher.connect()
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Training Service started successfully")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown"""
logger.info("Shutting down Training Service")
# Cleanup message publisher
await message_publisher.disconnect()
logger.info("Training Service shutdown complete")
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "training-service",
"version": "1.0.0"
}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
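
`@app.on_event` still works but is deprecated in recent FastAPI releases in favour of lifespan handlers. A minimal equivalent of the startup/shutdown logic above, should the project migrate (a sketch, not part of this commit):

from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: mirrors startup_event above
    await database_manager.create_tables()
    await message_publisher.connect()
    metrics_collector.start_metrics_server(8080)
    yield
    # Shutdown: mirrors shutdown_event above
    await message_publisher.disconnect()


app = FastAPI(title="Training Service", version="1.0.0", lifespan=lifespan)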

View File

@@ -0,0 +1,174 @@
"""
ML Training implementation
"""
import asyncio
import logging
from typing import Dict, Any, List
import pandas as pd
from datetime import datetime
import joblib
import os
from prophet import Prophet
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from app.core.config import settings
logger = logging.getLogger(__name__)
class MLTrainer:
"""ML training implementation"""
def __init__(self):
self.model_storage_path = settings.MODEL_STORAGE_PATH
os.makedirs(self.model_storage_path, exist_ok=True)
async def train_models(self, training_data: Dict[str, Any], job_id: str, db) -> Dict[str, Any]:
"""Train models for all products"""
models_result = {}
# Get sales data
sales_data = training_data.get("sales_data", [])
external_data = training_data.get("external_data", {})
# Group by product
products_data = self._group_by_product(sales_data)
# Train model for each product
for product_name, product_sales in products_data.items():
try:
model_result = await self._train_product_model(
product_name,
product_sales,
external_data,
job_id
)
models_result[product_name] = model_result
except Exception as e:
logger.error(f"Failed to train model for {product_name}: {e}")
continue
return models_result
def _group_by_product(self, sales_data: List[Dict]) -> Dict[str, List[Dict]]:
"""Group sales data by product"""
products = {}
for sale in sales_data:
product_name = sale.get("product_name")
if product_name not in products:
products[product_name] = []
products[product_name].append(sale)
return products
async def _train_product_model(self, product_name: str, sales_data: List[Dict], external_data: Dict, job_id: str) -> Dict[str, Any]:
"""Train Prophet model for a single product"""
# Convert to DataFrame
df = pd.DataFrame(sales_data)
df['date'] = pd.to_datetime(df['date'])
# Aggregate daily sales
daily_sales = df.groupby('date')['quantity_sold'].sum().reset_index()
daily_sales.columns = ['ds', 'y']
# Add external features
daily_sales = self._add_external_features(daily_sales, external_data)
# Train Prophet model
model = Prophet(
seasonality_mode=settings.PROPHET_SEASONALITY_MODE,
daily_seasonality=settings.PROPHET_DAILY_SEASONALITY,
weekly_seasonality=settings.PROPHET_WEEKLY_SEASONALITY,
yearly_seasonality=settings.PROPHET_YEARLY_SEASONALITY
)
# Add regressors
model.add_regressor('temperature')
model.add_regressor('humidity')
model.add_regressor('precipitation')
model.add_regressor('traffic_volume')
# Fit model
model.fit(daily_sales)
# Save model
model_path = os.path.join(
self.model_storage_path,
f"{job_id}_{product_name}_prophet_model.pkl"
)
joblib.dump(model, model_path)
return {
"type": "prophet",
"path": model_path,
"training_samples": len(daily_sales),
"features": ["temperature", "humidity", "precipitation", "traffic_volume"],
"hyperparameters": {
"seasonality_mode": settings.PROPHET_SEASONALITY_MODE,
"daily_seasonality": settings.PROPHET_DAILY_SEASONALITY,
"weekly_seasonality": settings.PROPHET_WEEKLY_SEASONALITY,
"yearly_seasonality": settings.PROPHET_YEARLY_SEASONALITY
}
}
def _add_external_features(self, daily_sales: pd.DataFrame, external_data: Dict) -> pd.DataFrame:
"""Add external features to sales data"""
# Add weather data
weather_data = external_data.get("weather", [])
if weather_data:
weather_df = pd.DataFrame(weather_data)
weather_df['ds'] = pd.to_datetime(weather_df['date'])
daily_sales = daily_sales.merge(weather_df[['ds', 'temperature', 'humidity', 'precipitation']], on='ds', how='left')
# Add traffic data
traffic_data = external_data.get("traffic", [])
if traffic_data:
traffic_df = pd.DataFrame(traffic_data)
traffic_df['ds'] = pd.to_datetime(traffic_df['date'])
daily_sales = daily_sales.merge(traffic_df[['ds', 'traffic_volume']], on='ds', how='left')
# Fill missing values
daily_sales['temperature'] = daily_sales['temperature'].fillna(daily_sales['temperature'].mean())
daily_sales['humidity'] = daily_sales['humidity'].fillna(daily_sales['humidity'].mean())
daily_sales['precipitation'] = daily_sales['precipitation'].fillna(0)
daily_sales['traffic_volume'] = daily_sales['traffic_volume'].fillna(daily_sales['traffic_volume'].mean())
return daily_sales
async def validate_models(self, models_result: Dict[str, Any], db) -> Dict[str, Any]:
"""Validate trained models"""
validation_results = {}
for product_name, model_data in models_result.items():
try:
# Load model
model_path = model_data.get("path")
model = joblib.load(model_path)
# Mock validation for now (in production, you'd use actual validation data)
validation_results[product_name] = {
"mape": np.random.uniform(10, 25), # Mock MAPE between 10-25%
"rmse": np.random.uniform(8, 15), # Mock RMSE
"mae": np.random.uniform(5, 12), # Mock MAE
"r2_score": np.random.uniform(0.7, 0.9) # Mock R2 score
}
except Exception as e:
logger.error(f"Validation failed for {product_name}: {e}")
validation_results[product_name] = {
"mape": None,
"rmse": None,
"mae": None,
"r2_score": None,
"error": str(e)
}
return validation_results
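
A sketch of consuming a model saved by _train_product_model. Prophet requires a value for every registered regressor on future dates, so the constants below are placeholder assumptions (real values would come from weather/traffic forecasts):

import joblib
import pandas as pd

model = joblib.load("/app/models/<job_id>_<product>_prophet_model.pkl")

# Every regressor column must be populated for the horizon being predicted
future = model.make_future_dataframe(periods=7)
future["temperature"] = 20.0      # placeholder value
future["humidity"] = 60.0         # placeholder value
future["precipitation"] = 0.0     # placeholder value
future["traffic_volume"] = 1000.0 # placeholder value

forecast = model.predict(future)
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail(7))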

View File

@@ -0,0 +1,91 @@
"""
Training schemas
"""
from pydantic import BaseModel, Field, validator
from typing import Optional, Dict, Any, List
from datetime import datetime
from enum import Enum
class TrainingJobStatus(str, Enum):
"""Training job status enum"""
QUEUED = "queued"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class TrainingRequest(BaseModel):
"""Training request schema"""
tenant_id: Optional[str] = None # Will be set from auth
force_retrain: bool = Field(default=False, description="Force retrain even if recent models exist")
products: Optional[List[str]] = Field(default=None, description="Specific products to train, or None for all")
training_days: Optional[int] = Field(default=730, ge=30, le=1095, description="Number of days of historical data to use")
@validator('training_days')
def validate_training_days(cls, v):
if v < 30:
raise ValueError('Minimum training days is 30')
if v > 1095:
raise ValueError('Maximum training days is 1095 (3 years)')
return v
class TrainingJobResponse(BaseModel):
"""Training job response schema"""
id: str
tenant_id: str
status: TrainingJobStatus
progress: int
current_step: Optional[str]
started_at: datetime
completed_at: Optional[datetime]
duration_seconds: Optional[int]
models_trained: Optional[Dict[str, Any]]
metrics: Optional[Dict[str, Any]]
error_message: Optional[str]
class Config:
from_attributes = True
class TrainedModelResponse(BaseModel):
"""Trained model response schema"""
id: str
product_name: str
model_type: str
model_version: str
mape: Optional[float]
rmse: Optional[float]
mae: Optional[float]
r2_score: Optional[float]
training_samples: Optional[int]
features_used: Optional[List[str]]
is_active: bool
created_at: datetime
last_used_at: Optional[datetime]
class Config:
from_attributes = True
class TrainingProgress(BaseModel):
"""Training progress update schema"""
job_id: str
progress: int
current_step: str
estimated_completion: Optional[datetime]
class TrainingMetrics(BaseModel):
"""Training metrics schema"""
total_jobs: int
successful_jobs: int
failed_jobs: int
average_duration: float
models_trained: int
active_models: int
class ModelValidationResult(BaseModel):
"""Model validation result schema"""
product_name: str
is_valid: bool
accuracy_score: float
validation_error: Optional[str]
recommendations: List[str]
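
A small usage sketch of the request schema's validation behaviour (assuming Pydantic v2, consistent with `from_attributes` above; v1 would use `.dict()`):

from pydantic import ValidationError

# Valid request: train two specific products on one year of history
req = TrainingRequest(products=["baguette", "croissant"], training_days=365)
print(req.model_dump())

# Out-of-range values are rejected by the Field bounds / validator
try:
    TrainingRequest(training_days=10)
except ValidationError as e:
    print(e.errors()[0]["msg"])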

View File

@@ -0,0 +1,50 @@
"""
Messaging service for training service
"""
from shared.messaging.rabbitmq import RabbitMQClient
from app.core.config import settings
# Global message publisher
message_publisher = RabbitMQClient(settings.RABBITMQ_URL)
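
shared.messaging.rabbitmq is outside this diff; main.py confirms only connect() and disconnect(), so the publish call below is an assumed method name and signature, shown purely for illustration:

import asyncio

from app.services.messaging import message_publisher

async def notify_completion(job_id: str) -> None:
    await message_publisher.connect()
    # `publish` and its signature are assumptions; only connect()/disconnect()
    # appear in this commit's code
    await message_publisher.publish(
        "training.job.completed",
        {"job_id": job_id, "status": "completed"},
    )
    await message_publisher.disconnect()

asyncio.run(notify_completion("job-123"))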
View File

# services/training/Dockerfile
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
g++ \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy shared libraries
COPY --from=shared /shared /app/shared
# Copy application code
COPY . .
# Create model storage directory
RUN mkdir -p /app/models
# Add shared libraries to Python path
ENV PYTHONPATH="/app:/app/shared:$PYTHONPATH"
# Expose port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
# Run application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]