Add all the code for training service

Urtzi Alfaro
2025-07-19 16:59:37 +02:00
parent 42097202d2
commit f3071c00bd
21 changed files with 7504 additions and 764 deletions


@@ -1,174 +1,372 @@
# services/training/app/ml/trainer.py
"""
ML Trainer for Training Service
Orchestrates the complete training process
"""
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import asyncio
import uuid
from pathlib import Path

from app.ml.prophet_manager import BakeryProphetManager
from app.ml.data_processor import BakeryDataProcessor
from app.core.config import settings

logger = logging.getLogger(__name__)

class BakeryMLTrainer:
    """
    Main ML trainer that orchestrates the complete training process.
    Replaces the old Celery-based training system with a clean async implementation.
    """

    def __init__(self):
        self.prophet_manager = BakeryProphetManager()
        self.data_processor = BakeryDataProcessor()

    async def train_tenant_models(self,
                                  tenant_id: str,
                                  sales_data: List[Dict],
                                  weather_data: List[Dict] = None,
                                  traffic_data: List[Dict] = None,
                                  job_id: str = None) -> Dict[str, Any]:
        """
        Train models for all products of a tenant.

        Args:
            tenant_id: Tenant identifier
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier

        Returns:
            Dictionary with training results for each product
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")

        try:
            # Convert input data to DataFrames
            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()

            # Validate input data
            await self._validate_input_data(sales_df, tenant_id)

            # Get unique products
            products = sales_df['product_name'].unique().tolist()
            logger.info(f"Training models for {len(products)} products: {products}")

            # Process data for each product
            processed_data = await self._process_all_products(
                sales_df, weather_df, traffic_df, products
            )

            # Train models for each product
            training_results = await self._train_all_models(
                tenant_id, processed_data, job_id
            )

            # Calculate overall training summary
            summary = self._calculate_training_summary(training_results)

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "status": "completed",
                "products_trained": len([r for r in training_results.values() if r.get('status') == 'success']),
                "products_failed": len([r for r in training_results.values() if r.get('status') == 'error']),
                "total_products": len(products),
                "training_results": training_results,
                "summary": summary,
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Training job {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Training job {job_id} failed: {str(e)}")
            raise
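
    # Usage sketch (illustrative only, not part of this commit): a FastAPI route or
    # background worker could drive a full tenant run roughly like this. The record
    # shape is an assumption inferred from _validate_input_data below.
    #
    #   trainer = BakeryMLTrainer()
    #   result = await trainer.train_tenant_models(
    #       tenant_id="tenant_123",
    #       sales_data=[{"date": "2025-01-01", "product_name": "baguette", "quantity": 42}],
    #   )
    #   print(result["summary"]["success_rate"])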

    async def train_single_product(self,
                                   tenant_id: str,
                                   product_name: str,
                                   sales_data: List[Dict],
                                   weather_data: List[Dict] = None,
                                   traffic_data: List[Dict] = None,
                                   job_id: str = None) -> Dict[str, Any]:
        """
        Train model for a single product.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier

        Returns:
            Training result for the product
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting single product training {job_id} for {product_name}")

        try:
            # Convert input data to DataFrames
            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()

            # Filter sales data for the specific product
            product_sales = sales_df[sales_df['product_name'] == product_name].copy()

            # Validate product data
            if product_sales.empty:
                raise ValueError(f"No sales data found for product: {product_name}")

            # Prepare training data
            processed_data = await self.data_processor.prepare_training_data(
                sales_data=product_sales,
                weather_data=weather_df,
                traffic_data=traffic_df,
                product_name=product_name
            )

            # Train the model
            model_info = await self.prophet_manager.train_bakery_model(
                tenant_id=tenant_id,
                product_name=product_name,
                df=processed_data,
                job_id=job_id
            )

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "product_name": product_name,
                "status": "success",
                "model_info": model_info,
                "data_points": len(processed_data),
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Single product training {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Single product training {job_id} failed: {str(e)}")
            raise

    async def evaluate_model_performance(self,
                                         tenant_id: str,
                                         product_name: str,
                                         model_path: str,
                                         test_data: List[Dict]) -> Dict[str, Any]:
        """
        Evaluate model performance on test data.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            model_path: Path to the trained model
            test_data: Test data for evaluation

        Returns:
            Performance metrics
        """
        try:
            logger.info(f"Evaluating model performance for {product_name}")

            # Convert test data to DataFrame
            test_df = pd.DataFrame(test_data)

            # Prepare test data
            test_prepared = await self.data_processor.prepare_prediction_features(
                future_dates=test_df['ds'],
                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
            )

            # Get regressor columns
            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]

            # Generate predictions
            forecast = await self.prophet_manager.generate_forecast(
                model_path=model_path,
                future_dates=test_prepared,
                regressor_columns=regressor_columns
            )

            # Calculate performance metrics if we have actual values
            metrics = {}
            if 'y' in test_df.columns:
                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

                y_true = test_df['y'].values
                y_pred = forecast['yhat'].values

                metrics = {
                    "mae": float(mean_absolute_error(y_true, y_pred)),
                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
                    "mape": float(np.mean(np.abs((y_true - y_pred) / y_true)) * 100),
                    "r2_score": float(r2_score(y_true, y_pred))
                }

            result = {
                "tenant_id": tenant_id,
                "product_name": product_name,
                "evaluation_metrics": metrics,
                "forecast_samples": len(forecast),
                "evaluated_at": datetime.now().isoformat()
            }

            return result

        except Exception as e:
            logger.error(f"Model evaluation failed: {str(e)}")
            raise
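
    # Note on the metrics above: MAPE divides by y_true, so it is undefined when
    # any actual value is zero. A guarded variant (an assumption, not what this
    # method currently does) would mask zero actuals first:
    #
    #   nonzero = y_true != 0
    #   mape = float(np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100)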

    async def _validate_input_data(self, sales_df: pd.DataFrame, tenant_id: str):
        """Validate input sales data"""
        if sales_df.empty:
            raise ValueError(f"No sales data provided for tenant {tenant_id}")

        required_columns = ['date', 'product_name', 'quantity']
        missing_columns = [col for col in required_columns if col not in sales_df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check for valid dates
        try:
            sales_df['date'] = pd.to_datetime(sales_df['date'])
        except Exception:
            raise ValueError("Invalid date format in sales data")

        # Check for valid quantities
        if sales_df['quantity'].dtype not in ['int64', 'float64']:
            raise ValueError("Quantity column must be numeric")

    async def _process_all_products(self,
                                    sales_df: pd.DataFrame,
                                    weather_df: pd.DataFrame,
                                    traffic_df: pd.DataFrame,
                                    products: List[str]) -> Dict[str, pd.DataFrame]:
        """Process data for all products"""
        processed_data = {}

        for product_name in products:
            try:
                logger.info(f"Processing data for product: {product_name}")

                # Filter sales data for this product
                product_sales = sales_df[sales_df['product_name'] == product_name].copy()

                # Process the product data
                processed_product_data = await self.data_processor.prepare_training_data(
                    sales_data=product_sales,
                    weather_data=weather_df,
                    traffic_data=traffic_df,
                    product_name=product_name
                )

                processed_data[product_name] = processed_product_data
                logger.info(f"Processed {len(processed_product_data)} data points for {product_name}")

            except Exception as e:
                logger.error(f"Failed to process data for {product_name}: {str(e)}")
                # Continue with other products
                continue

        return processed_data
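
    # Design note: products are processed sequentially and failures are skipped, so
    # one bad product cannot abort the whole job. If prepare_training_data were
    # I/O-bound, a bounded asyncio.gather fan-out would be a possible variation
    # (an assumption, not current behavior).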

    async def _train_all_models(self,
                                tenant_id: str,
                                processed_data: Dict[str, pd.DataFrame],
                                job_id: str) -> Dict[str, Any]:
        """Train models for all processed products"""
        training_results = {}

        for product_name, product_data in processed_data.items():
            try:
                logger.info(f"Training model for product: {product_name}")

                # Check if we have enough data
                if len(product_data) < settings.MIN_TRAINING_DATA_DAYS:
                    training_results[product_name] = {
                        'status': 'skipped',
                        'reason': 'insufficient_data',
                        'data_points': len(product_data),
                        'min_required': settings.MIN_TRAINING_DATA_DAYS
                    }
                    continue

                # Train the model
                model_info = await self.prophet_manager.train_bakery_model(
                    tenant_id=tenant_id,
                    product_name=product_name,
                    df=product_data,
                    job_id=job_id
                )

                training_results[product_name] = {
                    'status': 'success',
                    'model_info': model_info,
                    'data_points': len(product_data),
                    'trained_at': datetime.now().isoformat()
                }

                logger.info(f"Successfully trained model for {product_name}")

            except Exception as e:
                logger.error(f"Failed to train model for {product_name}: {str(e)}")
                training_results[product_name] = {
                    'status': 'error',
                    'error_message': str(e),
                    'data_points': len(product_data) if product_data is not None else 0
                }

        return training_results
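
    # Shape of one training_results entry (illustrative values):
    #
    #   "baguette": {"status": "success", "model_info": {...},
    #                "data_points": 180, "trained_at": "2025-07-19T16:59:37"}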

    def _calculate_training_summary(self, training_results: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate summary statistics from training results"""
        total_products = len(training_results)
        successful_products = len([r for r in training_results.values() if r.get('status') == 'success'])
        failed_products = len([r for r in training_results.values() if r.get('status') == 'error'])
        skipped_products = len([r for r in training_results.values() if r.get('status') == 'skipped'])

        # Calculate average training metrics for successful models
        successful_results = [r for r in training_results.values() if r.get('status') == 'success']
        avg_metrics = {}

        if successful_results:
            metrics_list = [r['model_info'].get('training_metrics', {}) for r in successful_results]
            if metrics_list and all(metrics_list):
                avg_metrics = {
                    'avg_mae': np.mean([m.get('mae', 0) for m in metrics_list]),
                    'avg_rmse': np.mean([m.get('rmse', 0) for m in metrics_list]),
                    'avg_mape': np.mean([m.get('mape', 0) for m in metrics_list]),
                    'avg_r2': np.mean([m.get('r2_score', 0) for m in metrics_list])
                }

        return {
            'total_products': total_products,
            'successful_products': successful_products,
            'failed_products': failed_products,
            'skipped_products': skipped_products,
            'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
            'average_metrics': avg_metrics
        }
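
    # Worked example for the summary: 8 successes out of 10 products yields
    # round(8 / 10 * 100, 2) == 80.0 for 'success_rate'.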