Initial microservices setup from artifacts

Urtzi Alfaro
2025-07-17 13:09:24 +02:00
commit 347ff51bd7
200 changed files with 9559 additions and 0 deletions

View File

@@ -0,0 +1,33 @@
"""
Models API endpoints
"""
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List
import logging
from app.core.database import get_db
from app.core.auth import verify_token
from app.schemas.training import TrainedModelResponse
from app.services.training_service import TrainingService
logger = logging.getLogger(__name__)
router = APIRouter()
training_service = TrainingService()
@router.get("/", response_model=List[TrainedModelResponse])
async def get_trained_models(
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Get trained models"""
try:
return await training_service.get_trained_models(user_data, db)
except Exception as e:
logger.error(f"Get trained models error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to get trained models"
)
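
A minimal client-side sketch of calling this endpoint. The base URL and token are placeholder assumptions for illustration; they are not defined in this commit:

import asyncio
import httpx

async def list_models():
    # Assumed base URL; the service itself listens on port 8000
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        resp = await client.get(
            "/models/",
            headers={"Authorization": "Bearer <token>"},  # hypothetical token
        )
        resp.raise_for_status()
        return resp.json()

print(asyncio.run(list_models()))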

View File

@@ -0,0 +1,77 @@
"""
Training API endpoints
"""
from fastapi import APIRouter, Depends, HTTPException, status, Query
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional
import logging
from app.core.database import get_db
from app.core.auth import verify_token
from app.schemas.training import TrainingRequest, TrainingJobResponse, TrainedModelResponse
from app.services.training_service import TrainingService
logger = logging.getLogger(__name__)
router = APIRouter()
training_service = TrainingService()
@router.post("/train", response_model=TrainingJobResponse)
async def start_training(
request: TrainingRequest,
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Start training job"""
try:
return await training_service.start_training(request, user_data, db)
except ValueError as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=str(e)
)
except Exception as e:
logger.error(f"Training start error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to start training"
)
@router.get("/status/{job_id}", response_model=TrainingJobResponse)
async def get_training_status(
job_id: str,
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Get training job status"""
try:
return await training_service.get_training_status(job_id, user_data, db)
except ValueError as e:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=str(e)
)
except Exception as e:
logger.error(f"Get training status error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to get training status"
)
@router.get("/jobs", response_model=List[TrainingJobResponse])
async def get_training_jobs(
limit: int = Query(10, ge=1, le=100),
offset: int = Query(0, ge=0),
user_data: dict = Depends(verify_token),
db: AsyncSession = Depends(get_db)
):
"""Get training jobs"""
try:
return await training_service.get_training_jobs(user_data, limit, offset, db)
except Exception as e:
logger.error(f"Get training jobs error: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="Failed to get training jobs"
)
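
A sketch of the expected request flow: start a job, then poll its status until it finishes. Host, token, and payload values are assumptions for illustration:

import asyncio
import httpx

async def run_training():
    headers = {"Authorization": "Bearer <token>"}  # hypothetical token
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        job = (await client.post(
            "/training/train",
            json={"force_retrain": False, "training_days": 365},
            headers=headers,
        )).json()
        # Poll until the job leaves the queued/running states
        while job["status"] in ("queued", "running"):
            await asyncio.sleep(5)
            job = (await client.get(f"/training/status/{job['id']}", headers=headers)).json()
        return job

print(asyncio.run(run_training()))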

View File

@@ -0,0 +1,38 @@
"""
Authentication utilities for training service
"""
import httpx
from fastapi import HTTPException, status, Depends
from fastapi.security import HTTPBearer
import logging
from app.core.config import settings
logger = logging.getLogger(__name__)
security = HTTPBearer()
async def verify_token(token: str = Depends(security)):
"""Verify token with auth service"""
try:
async with httpx.AsyncClient() as client:
response = await client.post(
f"{settings.AUTH_SERVICE_URL}/auth/verify",
headers={"Authorization": f"Bearer {token.credentials}"}
)
if response.status_code == 200:
return response.json()
else:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid authentication credentials"
)
except httpx.RequestError as e:
logger.error(f"Auth service unavailable: {e}")
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Authentication service unavailable"
)
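
The routers bind whatever JSON `/auth/verify` returns to `user_data`. The auth service is outside this diff, so the shape below is only an assumed illustration:

# Hypothetical verify_token return value; the real payload is defined by the
# auth service, which is not part of this commit
user_data = {
    "user_id": "u-123",
    "tenant_id": "t-456",
    "email": "owner@bakery.example",
}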

View File

@@ -0,0 +1,44 @@
"""
Training service configuration
"""
import os
from pydantic import BaseSettings
class Settings(BaseSettings):
"""Application settings"""
# Basic settings
APP_NAME: str = "Training Service"
VERSION: str = "1.0.0"
DEBUG: bool = os.getenv("DEBUG", "False").lower() == "true"
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO")
# Database settings
DATABASE_URL: str = os.getenv("DATABASE_URL", "postgresql+asyncpg://training_user:training_pass123@training-db:5432/training_db")
# Redis settings
REDIS_URL: str = os.getenv("REDIS_URL", "redis://redis:6379/1")
# RabbitMQ settings
RABBITMQ_URL: str = os.getenv("RABBITMQ_URL", "amqp://bakery:forecast123@rabbitmq:5672/")
# Service URLs
AUTH_SERVICE_URL: str = os.getenv("AUTH_SERVICE_URL", "http://auth-service:8000")
DATA_SERVICE_URL: str = os.getenv("DATA_SERVICE_URL", "http://data-service:8000")
# ML Settings
MODEL_STORAGE_PATH: str = os.getenv("MODEL_STORAGE_PATH", "/app/models")
MAX_TRAINING_TIME_MINUTES: int = int(os.getenv("MAX_TRAINING_TIME_MINUTES", "30"))
MIN_TRAINING_DATA_DAYS: int = int(os.getenv("MIN_TRAINING_DATA_DAYS", "30"))
# Prophet Settings
PROPHET_SEASONALITY_MODE: str = os.getenv("PROPHET_SEASONALITY_MODE", "additive")
PROPHET_DAILY_SEASONALITY: bool = os.getenv("PROPHET_DAILY_SEASONALITY", "true").lower() == "true"
PROPHET_WEEKLY_SEASONALITY: bool = os.getenv("PROPHET_WEEKLY_SEASONALITY", "true").lower() == "true"
PROPHET_YEARLY_SEASONALITY: bool = os.getenv("PROPHET_YEARLY_SEASONALITY", "true").lower() == "true"
class Config:
env_file = ".env"
settings = Settings()
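
Because the defaults are computed with os.getenv at class-definition time, environment overrides must be in place before the module is imported; a small sketch:

import os

# Exports must precede the import of app.core.config, since the default
# expressions above read os.getenv when the class body is evaluated
os.environ["DEBUG"] = "true"
os.environ["MODEL_STORAGE_PATH"] = "/tmp/models"

from app.core.config import settings

print(settings.DEBUG, settings.MODEL_STORAGE_PATH)  # True /tmp/models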

View File

@@ -0,0 +1,12 @@
"""
Database configuration for training service
"""
from shared.database.base import DatabaseManager
from app.core.config import settings
# Initialize database manager
database_manager = DatabaseManager(settings.DATABASE_URL)
# Alias for convenience
get_db = database_manager.get_db
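
shared.database.base is not part of this diff; the code above (and main.py's create_tables call) assumes roughly this interface. A hypothetical sketch, not the shared library's actual implementation:

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase


class Base(DeclarativeBase):
    """Stand-in for the shared declarative base (hypothetical)"""


class DatabaseManager:
    def __init__(self, database_url: str):
        self.engine = create_async_engine(database_url)
        self.session_factory = async_sessionmaker(self.engine, expire_on_commit=False)

    async def get_db(self):
        # Yield one session per request; FastAPI closes it after the response
        async with self.session_factory() as session:
            yield session

    async def create_tables(self):
        async with self.engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)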

View File

@@ -0,0 +1,81 @@
"""
Training Service
Handles ML model training for bakery demand forecasting
"""
import logging
from fastapi import FastAPI, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from app.core.config import settings
from app.core.database import database_manager
from app.api import training, models
from app.services.messaging import message_publisher
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector
# Setup logging
setup_logging("training-service", settings.LOG_LEVEL)
logger = logging.getLogger(__name__)
# Create FastAPI app
app = FastAPI(
title="Training Service",
description="ML model training service for bakery demand forecasting",
version="1.0.0"
)
# Initialize metrics collector
metrics_collector = MetricsCollector("training-service")
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(training.router, prefix="/training", tags=["training"])
app.include_router(models.router, prefix="/models", tags=["models"])
@app.on_event("startup")
async def startup_event():
"""Application startup"""
logger.info("Starting Training Service")
# Create database tables
await database_manager.create_tables()
# Initialize message publisher
await message_publisher.connect()
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Training Service started successfully")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown"""
logger.info("Shutting down Training Service")
# Cleanup message publisher
await message_publisher.disconnect()
logger.info("Training Service shutdown complete")
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "training-service",
"version": "1.0.0"
}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
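
`@app.on_event` still works but is deprecated in recent FastAPI releases in favour of lifespan handlers. A minimal equivalent of the startup/shutdown logic above, should the project migrate (a sketch, not part of this commit):

from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup: mirrors startup_event above
    await database_manager.create_tables()
    await message_publisher.connect()
    metrics_collector.start_metrics_server(8080)
    yield
    # Shutdown: mirrors shutdown_event above
    await message_publisher.disconnect()


app = FastAPI(title="Training Service", version="1.0.0", lifespan=lifespan)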

View File

@@ -0,0 +1,174 @@
"""
ML Training implementation
"""
import asyncio
import logging
from typing import Dict, Any, List
import pandas as pd
from datetime import datetime
import joblib
import os
from prophet import Prophet
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from app.core.config import settings
logger = logging.getLogger(__name__)
class MLTrainer:
"""ML training implementation"""
def __init__(self):
self.model_storage_path = settings.MODEL_STORAGE_PATH
os.makedirs(self.model_storage_path, exist_ok=True)
async def train_models(self, training_data: Dict[str, Any], job_id: str, db) -> Dict[str, Any]:
"""Train models for all products"""
models_result = {}
# Get sales data
sales_data = training_data.get("sales_data", [])
external_data = training_data.get("external_data", {})
# Group by product
products_data = self._group_by_product(sales_data)
# Train model for each product
for product_name, product_sales in products_data.items():
try:
model_result = await self._train_product_model(
product_name,
product_sales,
external_data,
job_id
)
models_result[product_name] = model_result
except Exception as e:
logger.error(f"Failed to train model for {product_name}: {e}")
continue
return models_result
def _group_by_product(self, sales_data: List[Dict]) -> Dict[str, List[Dict]]:
"""Group sales data by product"""
products = {}
for sale in sales_data:
product_name = sale.get("product_name")
if product_name not in products:
products[product_name] = []
products[product_name].append(sale)
return products
async def _train_product_model(self, product_name: str, sales_data: List[Dict], external_data: Dict, job_id: str) -> Dict[str, Any]:
"""Train Prophet model for a single product"""
# Convert to DataFrame
df = pd.DataFrame(sales_data)
df['date'] = pd.to_datetime(df['date'])
# Aggregate daily sales
daily_sales = df.groupby('date')['quantity_sold'].sum().reset_index()
daily_sales.columns = ['ds', 'y']
# Add external features
daily_sales = self._add_external_features(daily_sales, external_data)
# Train Prophet model
model = Prophet(
seasonality_mode=settings.PROPHET_SEASONALITY_MODE,
daily_seasonality=settings.PROPHET_DAILY_SEASONALITY,
weekly_seasonality=settings.PROPHET_WEEKLY_SEASONALITY,
yearly_seasonality=settings.PROPHET_YEARLY_SEASONALITY
)
# Add regressors
model.add_regressor('temperature')
model.add_regressor('humidity')
model.add_regressor('precipitation')
model.add_regressor('traffic_volume')
# Fit model
model.fit(daily_sales)
# Save model
model_path = os.path.join(
self.model_storage_path,
f"{job_id}_{product_name}_prophet_model.pkl"
)
joblib.dump(model, model_path)
return {
"type": "prophet",
"path": model_path,
"training_samples": len(daily_sales),
"features": ["temperature", "humidity", "precipitation", "traffic_volume"],
"hyperparameters": {
"seasonality_mode": settings.PROPHET_SEASONALITY_MODE,
"daily_seasonality": settings.PROPHET_DAILY_SEASONALITY,
"weekly_seasonality": settings.PROPHET_WEEKLY_SEASONALITY,
"yearly_seasonality": settings.PROPHET_YEARLY_SEASONALITY
}
}
def _add_external_features(self, daily_sales: pd.DataFrame, external_data: Dict) -> pd.DataFrame:
"""Add external features to sales data"""
# Add weather data
weather_data = external_data.get("weather", [])
if weather_data:
weather_df = pd.DataFrame(weather_data)
weather_df['ds'] = pd.to_datetime(weather_df['date'])
daily_sales = daily_sales.merge(weather_df[['ds', 'temperature', 'humidity', 'precipitation']], on='ds', how='left')
# Add traffic data
traffic_data = external_data.get("traffic", [])
if traffic_data:
traffic_df = pd.DataFrame(traffic_data)
traffic_df['ds'] = pd.to_datetime(traffic_df['date'])
daily_sales = daily_sales.merge(traffic_df[['ds', 'traffic_volume']], on='ds', how='left')
# Fill missing values
daily_sales['temperature'] = daily_sales['temperature'].fillna(daily_sales['temperature'].mean())
daily_sales['humidity'] = daily_sales['humidity'].fillna(daily_sales['humidity'].mean())
daily_sales['precipitation'] = daily_sales['precipitation'].fillna(0)
daily_sales['traffic_volume'] = daily_sales['traffic_volume'].fillna(daily_sales['traffic_volume'].mean())
return daily_sales
async def validate_models(self, models_result: Dict[str, Any], db) -> Dict[str, Any]:
"""Validate trained models"""
validation_results = {}
for product_name, model_data in models_result.items():
try:
# Load model
model_path = model_data.get("path")
model = joblib.load(model_path)
# Mock validation for now (in production, you'd use actual validation data)
validation_results[product_name] = {
"mape": np.random.uniform(10, 25), # Mock MAPE between 10-25%
"rmse": np.random.uniform(8, 15), # Mock RMSE
"mae": np.random.uniform(5, 12), # Mock MAE
"r2_score": np.random.uniform(0.7, 0.9) # Mock R2 score
}
except Exception as e:
logger.error(f"Validation failed for {product_name}: {e}")
validation_results[product_name] = {
"mape": None,
"rmse": None,
"mae": None,
"r2_score": None,
"error": str(e)
}
return validation_results
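
A sketch of consuming a model saved by _train_product_model. Prophet requires a value for every registered regressor on future dates, so the constants below are placeholder assumptions (real values would come from weather/traffic forecasts):

import joblib
import pandas as pd

model = joblib.load("/app/models/<job_id>_<product>_prophet_model.pkl")

# Every regressor column must be populated for the horizon being predicted
future = model.make_future_dataframe(periods=7)
future["temperature"] = 20.0      # placeholder value
future["humidity"] = 60.0         # placeholder value
future["precipitation"] = 0.0     # placeholder value
future["traffic_volume"] = 1000.0 # placeholder value

forecast = model.predict(future)
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail(7))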

View File

@@ -0,0 +1,91 @@
"""
Training schemas
"""
from pydantic import BaseModel, Field, validator
from typing import Optional, Dict, Any, List
from datetime import datetime
from enum import Enum
class TrainingJobStatus(str, Enum):
"""Training job status enum"""
QUEUED = "queued"
RUNNING = "running"
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
class TrainingRequest(BaseModel):
"""Training request schema"""
tenant_id: Optional[str] = None # Will be set from auth
force_retrain: bool = Field(default=False, description="Force retrain even if recent models exist")
products: Optional[List[str]] = Field(default=None, description="Specific products to train, or None for all")
training_days: Optional[int] = Field(default=730, ge=30, le=1095, description="Number of days of historical data to use")
@validator('training_days')
def validate_training_days(cls, v):
if v < 30:
raise ValueError('Minimum training days is 30')
if v > 1095:
raise ValueError('Maximum training days is 1095 (3 years)')
return v
class TrainingJobResponse(BaseModel):
"""Training job response schema"""
id: str
tenant_id: str
status: TrainingJobStatus
progress: int
current_step: Optional[str]
started_at: datetime
completed_at: Optional[datetime]
duration_seconds: Optional[int]
models_trained: Optional[Dict[str, Any]]
metrics: Optional[Dict[str, Any]]
error_message: Optional[str]
class Config:
from_attributes = True
class TrainedModelResponse(BaseModel):
"""Trained model response schema"""
id: str
product_name: str
model_type: str
model_version: str
mape: Optional[float]
rmse: Optional[float]
mae: Optional[float]
r2_score: Optional[float]
training_samples: Optional[int]
features_used: Optional[List[str]]
is_active: bool
created_at: datetime
last_used_at: Optional[datetime]
class Config:
from_attributes = True
class TrainingProgress(BaseModel):
"""Training progress update schema"""
job_id: str
progress: int
current_step: str
estimated_completion: Optional[datetime]
class TrainingMetrics(BaseModel):
"""Training metrics schema"""
total_jobs: int
successful_jobs: int
failed_jobs: int
average_duration: float
models_trained: int
active_models: int
class ModelValidationResult(BaseModel):
"""Model validation result schema"""
product_name: str
is_valid: bool
accuracy_score: float
validation_error: Optional[str]
recommendations: List[str]
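
A small usage sketch of the request schema's validation behaviour (assuming Pydantic v2, consistent with `from_attributes` above; v1 would use `.dict()`):

from pydantic import ValidationError

# Valid request: train two specific products on one year of history
req = TrainingRequest(products=["baguette", "croissant"], training_days=365)
print(req.model_dump())

# Out-of-range values are rejected by the Field bounds / validator
try:
    TrainingRequest(training_days=10)
except ValidationError as e:
    print(e.errors()[0]["msg"])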

View File

@@ -0,0 +1,50 @@
"""
Messaging service for training service
"""
from shared.messaging.rabbitmq import RabbitMQClient
from app.core.config import settings
# Global message publisher
message_publisher = RabbitMQClient(settings.RABBITMQ_URL)
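
shared.messaging.rabbitmq is outside this diff; main.py confirms only connect() and disconnect(), so the publish call below is an assumed method name and signature, shown purely for illustration:

import asyncio

from app.services.messaging import message_publisher

async def notify_completion(job_id: str) -> None:
    await message_publisher.connect()
    # `publish` and its signature are assumptions; only connect()/disconnect()
    # appear in this commit's code
    await message_publisher.publish(
        "training.job.completed",
        {"job_id": job_id, "status": "completed"},
    )
    await message_publisher.disconnect()

asyncio.run(notify_completion("job-123"))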
View File

# services/training/Dockerfile
FROM python:3.11-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
g++ \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy shared libraries
COPY --from=shared /shared /app/shared
# Copy application code
COPY . .
# Create model storage directory
RUN mkdir -p /app/models
# Add shared libraries to Python path
ENV PYTHONPATH="/app:/app/shared:$PYTHONPATH"
# Expose port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
# Run application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]