Add all the code for training service

Urtzi Alfaro
2025-07-19 16:59:37 +02:00
parent 42097202d2
commit f3071c00bd
21 changed files with 7504 additions and 764 deletions


@@ -0,0 +1,493 @@
# services/training/app/ml/data_processor.py
"""
Data Processor for Training Service
Handles data preparation and feature engineering for ML training
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime, timedelta
import logging
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
logger = logging.getLogger(__name__)
class BakeryDataProcessor:
"""
Enhanced data processor for bakery forecasting training service.
Handles data cleaning, feature engineering, and preparation for ML models.
"""
def __init__(self):
self.scalers = {} # Store scalers for each feature
self.imputers = {} # Store imputers for missing value handling
async def prepare_training_data(self,
sales_data: pd.DataFrame,
weather_data: pd.DataFrame,
traffic_data: pd.DataFrame,
product_name: str) -> pd.DataFrame:
"""
Prepare comprehensive training data for a specific product.
Args:
sales_data: Historical sales data for the product
weather_data: Weather data
traffic_data: Traffic data
product_name: Product name for logging
Returns:
DataFrame ready for Prophet training with 'ds' and 'y' columns plus features
"""
try:
logger.info(f"Preparing training data for product: {product_name}")
# Convert and validate sales data
sales_clean = await self._process_sales_data(sales_data, product_name)
# Aggregate to daily level
daily_sales = await self._aggregate_daily_sales(sales_clean)
# Add temporal features
daily_sales = self._add_temporal_features(daily_sales)
# Merge external data sources
daily_sales = self._merge_weather_features(daily_sales, weather_data)
daily_sales = self._merge_traffic_features(daily_sales, traffic_data)
# Engineer additional features
daily_sales = self._engineer_features(daily_sales)
# Handle missing values
daily_sales = self._handle_missing_values(daily_sales)
# Prepare for Prophet (rename columns and validate)
prophet_data = self._prepare_prophet_format(daily_sales)
logger.info(f"Prepared {len(prophet_data)} data points for {product_name}")
return prophet_data
except Exception as e:
logger.error(f"Error preparing training data for {product_name}: {str(e)}")
raise
async def prepare_prediction_features(self,
future_dates: pd.DatetimeIndex,
weather_forecast: pd.DataFrame = None,
traffic_forecast: pd.DataFrame = None) -> pd.DataFrame:
"""
Create features for future predictions.
Args:
future_dates: Future dates to predict
weather_forecast: Weather forecast data
traffic_forecast: Traffic forecast data
Returns:
DataFrame with features for prediction
"""
try:
# Create base future dataframe
future_df = pd.DataFrame({'ds': future_dates})
# Add temporal features
future_df = self._add_temporal_features(
future_df.rename(columns={'ds': 'date'})
).rename(columns={'date': 'ds'})
# Add weather features
if weather_forecast is not None and not weather_forecast.empty:
weather_features = weather_forecast.copy()
if 'date' in weather_features.columns:
weather_features = weather_features.rename(columns={'date': 'ds'})
future_df = future_df.merge(weather_features, on='ds', how='left')
# Add traffic features
if traffic_forecast is not None and not traffic_forecast.empty:
traffic_features = traffic_forecast.copy()
if 'date' in traffic_features.columns:
traffic_features = traffic_features.rename(columns={'date': 'ds'})
future_df = future_df.merge(traffic_features, on='ds', how='left')
# Engineer additional features
future_df = self._engineer_features(future_df.rename(columns={'ds': 'date'}))
future_df = future_df.rename(columns={'date': 'ds'})
# Handle missing values in future data
numeric_columns = future_df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
if future_df[col].isna().any():
# Use reasonable defaults for Madrid
if col == 'temperature':
future_df[col] = future_df[col].fillna(15.0) # Default Madrid temp
elif col == 'precipitation':
future_df[col] = future_df[col].fillna(0.0) # Default no rain
elif col == 'humidity':
future_df[col] = future_df[col].fillna(60.0) # Default humidity
elif col == 'traffic_volume':
future_df[col] = future_df[col].fillna(100.0) # Default traffic
else:
future_df[col] = future_df[col].fillna(future_df[col].median())
return future_df
except Exception as e:
logger.error(f"Error creating prediction features: {e}")
# Return minimal features if error
return pd.DataFrame({'ds': future_dates})
async def _process_sales_data(self, sales_data: pd.DataFrame, product_name: str) -> pd.DataFrame:
"""Process and clean sales data"""
sales_clean = sales_data.copy()
# Ensure date column exists and is datetime
if 'date' not in sales_clean.columns:
raise ValueError("Sales data must have a 'date' column")
sales_clean['date'] = pd.to_datetime(sales_clean['date'])
# Ensure quantity column exists and is numeric
if 'quantity' not in sales_clean.columns:
raise ValueError("Sales data must have a 'quantity' column")
sales_clean['quantity'] = pd.to_numeric(sales_clean['quantity'], errors='coerce')
# Remove rows with invalid quantities
sales_clean = sales_clean.dropna(subset=['quantity'])
sales_clean = sales_clean[sales_clean['quantity'] >= 0] # No negative sales
# Filter for the specific product if product_name column exists
if 'product_name' in sales_clean.columns:
sales_clean = sales_clean[sales_clean['product_name'] == product_name]
return sales_clean
async def _aggregate_daily_sales(self, sales_data: pd.DataFrame) -> pd.DataFrame:
"""Aggregate sales to daily level"""
daily_sales = sales_data.groupby('date').agg({
'quantity': 'sum'
}).reset_index()
# Ensure we have data for all dates in the range
date_range = pd.date_range(
start=daily_sales['date'].min(),
end=daily_sales['date'].max(),
freq='D'
)
full_date_df = pd.DataFrame({'date': date_range})
daily_sales = full_date_df.merge(daily_sales, on='date', how='left')
daily_sales['quantity'] = daily_sales['quantity'].fillna(0) # Fill missing days with 0 sales
return daily_sales
def _add_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add temporal features like day of week, month, etc."""
df = df.copy()
# Ensure we have a date column
if 'date' not in df.columns:
raise ValueError("DataFrame must have a 'date' column")
df['date'] = pd.to_datetime(df['date'])
# Day of week (0=Monday, 6=Sunday)
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# Month and season
df['month'] = df['date'].dt.month
df['season'] = df['month'].apply(self._get_season)
# Week of year
df['week_of_year'] = df['date'].dt.isocalendar().week
# Quarter
df['quarter'] = df['date'].dt.quarter
# Holiday indicators (basic Spanish holidays)
df['is_holiday'] = df['date'].apply(self._is_spanish_holiday).astype(int)
# School calendar effects (approximate)
df['is_school_holiday'] = df['date'].apply(self._is_school_holiday).astype(int)
return df
def _merge_weather_features(self,
daily_sales: pd.DataFrame,
weather_data: pd.DataFrame) -> pd.DataFrame:
"""Merge weather features with sales data"""
if weather_data.empty:
# Add default weather columns with neutral values
daily_sales['temperature'] = 15.0 # Mild temperature
daily_sales['precipitation'] = 0.0 # No rain
daily_sales['humidity'] = 60.0 # Moderate humidity
daily_sales['wind_speed'] = 5.0 # Light wind
return daily_sales
try:
weather_clean = weather_data.copy()
# Ensure weather data has date column
if 'date' not in weather_clean.columns and 'ds' in weather_clean.columns:
weather_clean = weather_clean.rename(columns={'ds': 'date'})
weather_clean['date'] = pd.to_datetime(weather_clean['date'])
# Select relevant weather features
weather_features = ['date']
# Add available weather columns with default names
weather_mapping = {
'temperature': ['temperature', 'temp', 'temperatura'],
'precipitation': ['precipitation', 'rain', 'lluvia', 'precipitacion'],
'humidity': ['humidity', 'humedad'],
'wind_speed': ['wind_speed', 'viento', 'wind']
}
for standard_name, possible_names in weather_mapping.items():
for possible_name in possible_names:
if possible_name in weather_clean.columns:
weather_clean[standard_name] = weather_clean[possible_name]
weather_features.append(standard_name)
break
# Keep only the features we found
weather_clean = weather_clean[weather_features].copy()
# Merge with sales data
merged = daily_sales.merge(weather_clean, on='date', how='left')
# Fill missing weather values with reasonable defaults
if 'temperature' in merged.columns:
merged['temperature'] = merged['temperature'].fillna(15.0)
if 'precipitation' in merged.columns:
merged['precipitation'] = merged['precipitation'].fillna(0.0)
if 'humidity' in merged.columns:
merged['humidity'] = merged['humidity'].fillna(60.0)
if 'wind_speed' in merged.columns:
merged['wind_speed'] = merged['wind_speed'].fillna(5.0)
return merged
except Exception as e:
logger.warning(f"Error merging weather data: {e}")
# Add default weather columns if merge fails
daily_sales['temperature'] = 15.0
daily_sales['precipitation'] = 0.0
daily_sales['humidity'] = 60.0
daily_sales['wind_speed'] = 5.0
return daily_sales
def _merge_traffic_features(self,
daily_sales: pd.DataFrame,
traffic_data: pd.DataFrame) -> pd.DataFrame:
"""Merge traffic features with sales data"""
if traffic_data.empty:
# Add default traffic column
daily_sales['traffic_volume'] = 100.0 # Neutral traffic level
return daily_sales
try:
traffic_clean = traffic_data.copy()
# Ensure traffic data has date column
if 'date' not in traffic_clean.columns and 'ds' in traffic_clean.columns:
traffic_clean = traffic_clean.rename(columns={'ds': 'date'})
traffic_clean['date'] = pd.to_datetime(traffic_clean['date'])
# Select relevant traffic features
traffic_features = ['date']
# Map traffic column names
traffic_mapping = {
'traffic_volume': ['traffic_volume', 'traffic_intensity', 'trafico', 'intensidad'],
'pedestrian_count': ['pedestrian_count', 'peatones'],
'occupancy_rate': ['occupancy_rate', 'ocupacion']
}
for standard_name, possible_names in traffic_mapping.items():
for possible_name in possible_names:
if possible_name in traffic_clean.columns:
traffic_clean[standard_name] = traffic_clean[possible_name]
traffic_features.append(standard_name)
break
# Keep only the features we found
traffic_clean = traffic_clean[traffic_features].copy()
# Merge with sales data
merged = daily_sales.merge(traffic_clean, on='date', how='left')
# Fill missing traffic values
if 'traffic_volume' in merged.columns:
merged['traffic_volume'] = merged['traffic_volume'].fillna(100.0)
if 'pedestrian_count' in merged.columns:
merged['pedestrian_count'] = merged['pedestrian_count'].fillna(50.0)
if 'occupancy_rate' in merged.columns:
merged['occupancy_rate'] = merged['occupancy_rate'].fillna(0.5)
return merged
except Exception as e:
logger.warning(f"Error merging traffic data: {e}")
# Add default traffic column if merge fails
daily_sales['traffic_volume'] = 100.0
return daily_sales
def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Engineer additional features from existing data"""
df = df.copy()
# Weather-based features
if 'temperature' in df.columns:
df['temp_squared'] = df['temperature'] ** 2
df['is_hot_day'] = (df['temperature'] > 25).astype(int)
df['is_cold_day'] = (df['temperature'] < 10).astype(int)
if 'precipitation' in df.columns:
df['is_rainy_day'] = (df['precipitation'] > 0).astype(int)
df['heavy_rain'] = (df['precipitation'] > 10).astype(int)
# Traffic-based features
if 'traffic_volume' in df.columns:
df['high_traffic'] = (df['traffic_volume'] > df['traffic_volume'].quantile(0.75)).astype(int)
df['low_traffic'] = (df['traffic_volume'] < df['traffic_volume'].quantile(0.25)).astype(int)
# Interaction features
if 'is_weekend' in df.columns and 'temperature' in df.columns:
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
if 'is_rainy_day' in df.columns and 'traffic_volume' in df.columns:
df['rain_traffic_interaction'] = df['is_rainy_day'] * df['traffic_volume']
return df
def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
"""Handle missing values in the dataset"""
df = df.copy()
# For numeric columns, use median imputation
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
if col != 'quantity' and df[col].isna().any():
median_value = df[col].median()
df[col] = df[col].fillna(median_value)
return df
def _prepare_prophet_format(self, df: pd.DataFrame) -> pd.DataFrame:
"""Prepare data in Prophet format with 'ds' and 'y' columns"""
prophet_df = df.copy()
# Rename columns for Prophet
if 'date' in prophet_df.columns:
prophet_df = prophet_df.rename(columns={'date': 'ds'})
if 'quantity' in prophet_df.columns:
prophet_df = prophet_df.rename(columns={'quantity': 'y'})
# Ensure ds is datetime
if 'ds' in prophet_df.columns:
prophet_df['ds'] = pd.to_datetime(prophet_df['ds'])
# Validate required columns
if 'ds' not in prophet_df.columns or 'y' not in prophet_df.columns:
raise ValueError("Prophet data must have 'ds' and 'y' columns")
# Remove any rows with missing target values
prophet_df = prophet_df.dropna(subset=['y'])
# Sort by date
prophet_df = prophet_df.sort_values('ds').reset_index(drop=True)
return prophet_df
def _get_season(self, month: int) -> int:
"""Get season from month (1-4 for Winter, Spring, Summer, Autumn)"""
if month in [12, 1, 2]:
return 1 # Winter
elif month in [3, 4, 5]:
return 2 # Spring
elif month in [6, 7, 8]:
return 3 # Summer
else:
return 4 # Autumn
def _is_spanish_holiday(self, date: datetime) -> bool:
"""Check if a date is a major Spanish holiday"""
month_day = (date.month, date.day)
# Major Spanish holidays that affect bakery sales
spanish_holidays = [
(1, 1), # New Year
(1, 6), # Epiphany
(5, 1), # Labour Day
(8, 15), # Assumption
(10, 12), # National Day
(11, 1), # All Saints
(12, 6), # Constitution
(12, 8), # Immaculate Conception
(12, 25), # Christmas
(5, 15), # San Isidro (Madrid)
(5, 2), # Madrid Community Day
]
return month_day in spanish_holidays
def _is_school_holiday(self, date: datetime) -> bool:
"""Check if a date is during school holidays (approximate)"""
month = date.month
# Approximate Spanish school holiday periods
# Summer holidays (July-August)
if month in [7, 8]:
return True
# Christmas holidays (mid December to early January)
if month == 12 and date.day >= 20:
return True
if month == 1 and date.day <= 10:
return True
# Easter holidays (approximate - first two weeks of April)
if month == 4 and date.day <= 14:
return True
return False
def calculate_feature_importance(self,
model_data: pd.DataFrame,
target_column: str = 'y') -> Dict[str, float]:
"""
Calculate feature importance for the model.
"""
try:
# Simple correlation-based importance
numeric_features = model_data.select_dtypes(include=[np.number]).columns
numeric_features = [col for col in numeric_features if col != target_column]
importance_scores = {}
for feature in numeric_features:
if feature in model_data.columns:
correlation = model_data[feature].corr(model_data[target_column])
importance_scores[feature] = abs(correlation) if not pd.isna(correlation) else 0.0
# Sort by importance
importance_scores = dict(sorted(importance_scores.items(),
key=lambda x: x[1], reverse=True))
return importance_scores
except Exception as e:
logger.error(f"Error calculating feature importance: {e}")
return {}
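
For orientation, a minimal usage sketch of the processor above. The toy sales and weather frames, the product name, and the asyncio.run entry point are illustrative only, not part of this commit:

import asyncio
import pandas as pd
from app.ml.data_processor import BakeryDataProcessor

async def demo():
    days = pd.date_range('2025-01-01', periods=90, freq='D')
    sales = pd.DataFrame({
        'date': days,
        'product_name': 'croissant',
        'quantity': 20 + days.dayofweek,  # mild weekly pattern
    })
    weather = pd.DataFrame({'date': days, 'temperature': 12.0, 'precipitation': 0.0})
    processor = BakeryDataProcessor()
    # An empty traffic frame triggers the neutral traffic defaults
    prophet_ready = await processor.prepare_training_data(
        sales_data=sales,
        weather_data=weather,
        traffic_data=pd.DataFrame(),
        product_name='croissant',
    )
    print(prophet_ready[['ds', 'y', 'temperature', 'traffic_volume']].head())

asyncio.run(demo())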


@@ -0,0 +1,408 @@
# services/training/app/ml/prophet_manager.py
"""
Enhanced Prophet Manager for Training Service
Migrated from the monolithic backend to the microservices architecture
"""
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from prophet import Prophet
import pickle
import logging
from datetime import datetime, timedelta
import uuid
import asyncio
import os
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import json
from pathlib import Path
from app.core.config import settings
logger = logging.getLogger(__name__)
class BakeryProphetManager:
"""
Enhanced Prophet model manager for the training service.
Handles training, validation, and model persistence for bakery forecasting.
"""
def __init__(self):
self.models = {} # In-memory model storage
self.model_metadata = {} # Store model metadata
self.feature_scalers = {} # Store feature scalers per model
# Ensure model storage directory exists
os.makedirs(settings.MODEL_STORAGE_PATH, exist_ok=True)
async def train_bakery_model(self,
tenant_id: str,
product_name: str,
df: pd.DataFrame,
job_id: str) -> Dict[str, Any]:
"""
Train a Prophet model for bakery forecasting with enhanced features.
Args:
tenant_id: Tenant identifier
product_name: Product name
df: Training data with 'ds' and 'y' columns plus regressors
job_id: Training job identifier
Returns:
Dictionary with model information and metrics
"""
try:
logger.info(f"Training bakery model for tenant {tenant_id}, product {product_name}")
# Validate input data
await self._validate_training_data(df, product_name)
# Prepare data for Prophet
prophet_data = await self._prepare_prophet_data(df)
# Get regressor columns
regressor_columns = self._extract_regressor_columns(prophet_data)
# Initialize Prophet model with bakery-specific settings
model = self._create_prophet_model(regressor_columns)
# Add regressors to model
for regressor in regressor_columns:
if regressor in prophet_data.columns:
model.add_regressor(regressor)
# Fit the model
model.fit(prophet_data)
# Generate model ID and store model
model_id = f"{job_id}_{product_name}_{uuid.uuid4().hex[:8]}"
model_path = await self._store_model(
tenant_id, product_name, model, model_id, prophet_data, regressor_columns
)
# Calculate training metrics
training_metrics = await self._calculate_training_metrics(model, prophet_data)
# Prepare model information
model_info = {
"model_id": model_id,
"model_path": model_path,
"type": "prophet",
"training_samples": len(prophet_data),
"features": regressor_columns,
"hyperparameters": {
"seasonality_mode": settings.PROPHET_SEASONALITY_MODE,
"daily_seasonality": settings.PROPHET_DAILY_SEASONALITY,
"weekly_seasonality": settings.PROPHET_WEEKLY_SEASONALITY,
"yearly_seasonality": settings.PROPHET_YEARLY_SEASONALITY
},
"training_metrics": training_metrics,
"trained_at": datetime.now().isoformat(),
"data_period": {
"start_date": prophet_data['ds'].min().isoformat(),
"end_date": prophet_data['ds'].max().isoformat(),
"total_days": len(prophet_data)
}
}
logger.info(f"Model trained successfully for {product_name}")
return model_info
except Exception as e:
logger.error(f"Failed to train bakery model for {product_name}: {str(e)}")
raise
async def generate_forecast(self,
model_path: str,
future_dates: pd.DataFrame,
regressor_columns: List[str]) -> pd.DataFrame:
"""
Generate forecast using a stored Prophet model.
Args:
model_path: Path to the stored model
future_dates: DataFrame with future dates and regressors
regressor_columns: List of regressor column names
Returns:
DataFrame with forecast results
"""
try:
# Load the model
model = joblib.load(model_path)
# Validate future data has required regressors
for regressor in regressor_columns:
if regressor not in future_dates.columns:
logger.warning(f"Missing regressor {regressor}, filling with median")
future_dates[regressor] = 0 # Default value
# Generate forecast
forecast = model.predict(future_dates)
return forecast
except Exception as e:
logger.error(f"Failed to generate forecast: {str(e)}")
raise
async def _validate_training_data(self, df: pd.DataFrame, product_name: str):
"""Validate training data quality"""
if df.empty:
raise ValueError(f"No training data available for {product_name}")
if len(df) < settings.MIN_TRAINING_DATA_DAYS:
raise ValueError(
f"Insufficient training data for {product_name}: "
f"{len(df)} days, minimum required: {settings.MIN_TRAINING_DATA_DAYS}"
)
required_columns = ['ds', 'y']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
raise ValueError(f"Missing required columns: {missing_columns}")
# Check for valid date range
if df['ds'].isna().any():
raise ValueError("Invalid dates found in training data")
# Check for valid target values
if df['y'].isna().all():
raise ValueError("No valid target values found")
async def _prepare_prophet_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Prepare data for Prophet training"""
prophet_data = df.copy()
# Ensure ds column is datetime
prophet_data['ds'] = pd.to_datetime(prophet_data['ds'])
# Handle missing values in target
if prophet_data['y'].isna().any():
logger.warning("Filling missing target values with interpolation")
prophet_data['y'] = prophet_data['y'].interpolate(method='linear')
# Remove extreme outliers (values > 3 standard deviations)
mean_val = prophet_data['y'].mean()
std_val = prophet_data['y'].std()
if std_val > 0: # Avoid division by zero
lower_bound = mean_val - 3 * std_val
upper_bound = mean_val + 3 * std_val
before_count = len(prophet_data)
prophet_data = prophet_data[
(prophet_data['y'] >= lower_bound) &
(prophet_data['y'] <= upper_bound)
]
after_count = len(prophet_data)
if before_count != after_count:
logger.info(f"Removed {before_count - after_count} outliers")
# Ensure chronological order
prophet_data = prophet_data.sort_values('ds').reset_index(drop=True)
# Fill missing values in regressors
numeric_columns = prophet_data.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
if col != 'y' and prophet_data[col].isna().any():
prophet_data[col] = prophet_data[col].fillna(prophet_data[col].median())
return prophet_data
def _extract_regressor_columns(self, df: pd.DataFrame) -> List[str]:
"""Extract regressor columns from the dataframe"""
excluded_columns = ['ds', 'y']
regressor_columns = []
for col in df.columns:
if col not in excluded_columns and df[col].dtype in ['int64', 'float64']:
regressor_columns.append(col)
logger.info(f"Identified regressor columns: {regressor_columns}")
return regressor_columns
def _create_prophet_model(self, regressor_columns: List[str]) -> Prophet:
"""Create Prophet model with bakery-specific settings"""
# Get Spanish holidays
holidays = self._get_spanish_holidays()
# Bakery-specific Prophet configuration
model = Prophet(
holidays=holidays if not holidays.empty else None,
daily_seasonality=settings.PROPHET_DAILY_SEASONALITY,
weekly_seasonality=settings.PROPHET_WEEKLY_SEASONALITY,
yearly_seasonality=settings.PROPHET_YEARLY_SEASONALITY,
seasonality_mode=settings.PROPHET_SEASONALITY_MODE,
changepoint_prior_scale=0.05, # Conservative changepoint detection
seasonality_prior_scale=10, # Strong seasonality for bakeries
holidays_prior_scale=10, # Strong holiday effects
interval_width=0.8, # 80% confidence intervals
mcmc_samples=0, # Use MAP estimation (faster)
uncertainty_samples=1000 # For uncertainty estimation
)
return model
def _get_spanish_holidays(self) -> pd.DataFrame:
"""Get Spanish holidays for Prophet model"""
try:
# Define major Spanish holidays that affect bakery sales
holidays_list = []
years = range(2020, 2030) # Cover training and prediction period
for year in years:
holidays_list.extend([
{'holiday': 'new_year', 'ds': f'{year}-01-01'},
{'holiday': 'epiphany', 'ds': f'{year}-01-06'},
{'holiday': 'may_day', 'ds': f'{year}-05-01'},
{'holiday': 'assumption', 'ds': f'{year}-08-15'},
{'holiday': 'national_day', 'ds': f'{year}-10-12'},
{'holiday': 'all_saints', 'ds': f'{year}-11-01'},
{'holiday': 'constitution', 'ds': f'{year}-12-06'},
{'holiday': 'immaculate', 'ds': f'{year}-12-08'},
{'holiday': 'christmas', 'ds': f'{year}-12-25'},
# Madrid specific holidays
{'holiday': 'madrid_patron', 'ds': f'{year}-05-15'}, # San Isidro
{'holiday': 'madrid_community', 'ds': f'{year}-05-02'},
])
holidays_df = pd.DataFrame(holidays_list)
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
return holidays_df
except Exception as e:
logger.warning(f"Error creating holidays dataframe: {e}")
return pd.DataFrame()
async def _store_model(self,
tenant_id: str,
product_name: str,
model: Prophet,
model_id: str,
training_data: pd.DataFrame,
regressor_columns: List[str]) -> str:
"""Store model and metadata to filesystem"""
# Create model filename
model_filename = f"{model_id}_prophet_model.pkl"
model_path = os.path.join(settings.MODEL_STORAGE_PATH, model_filename)
# Store the model
joblib.dump(model, model_path)
# Store metadata
metadata = {
"tenant_id": tenant_id,
"product_name": product_name,
"model_id": model_id,
"regressor_columns": regressor_columns,
"training_samples": len(training_data),
"training_period": {
"start": training_data['ds'].min().isoformat(),
"end": training_data['ds'].max().isoformat()
},
"created_at": datetime.now().isoformat(),
"model_type": "prophet",
"file_path": model_path
}
metadata_path = model_path.replace('.pkl', '_metadata.json')
with open(metadata_path, 'w') as f:
json.dump(metadata, f, indent=2)
# Store in memory for quick access
model_key = f"{tenant_id}:{product_name}"
self.models[model_key] = model
self.model_metadata[model_key] = metadata
logger.info(f"Model stored at: {model_path}")
return model_path
async def _calculate_training_metrics(self,
model: Prophet,
training_data: pd.DataFrame) -> Dict[str, float]:
"""Calculate training metrics for the model"""
try:
# Generate in-sample predictions
forecast = model.predict(training_data[['ds'] + [col for col in training_data.columns if col not in ['ds', 'y']]])
# Calculate metrics
y_true = training_data['y'].values
y_pred = forecast['yhat'].values
# Basic metrics
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
# MAPE (Mean Absolute Percentage Error), computed over non-zero actuals to avoid division by zero on zero-sales days
nonzero = y_true != 0
mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100 if nonzero.any() else 0.0
# R-squared
r2 = r2_score(y_true, y_pred)
return {
"mae": round(mae, 2),
"mse": round(mse, 2),
"rmse": round(rmse, 2),
"mape": round(mape, 2),
"r2_score": round(r2, 4),
"mean_actual": round(np.mean(y_true), 2),
"mean_predicted": round(np.mean(y_pred), 2)
}
except Exception as e:
logger.error(f"Error calculating training metrics: {e}")
return {
"mae": 0.0,
"mse": 0.0,
"rmse": 0.0,
"mape": 0.0,
"r2_score": 0.0,
"mean_actual": 0.0,
"mean_predicted": 0.0
}
def get_model_info(self, tenant_id: str, product_name: str) -> Optional[Dict[str, Any]]:
"""Get model information for a specific tenant and product"""
model_key = f"{tenant_id}:{product_name}"
return self.model_metadata.get(model_key)
def list_models(self, tenant_id: str) -> List[Dict[str, Any]]:
"""List all models for a tenant"""
tenant_models = []
for model_key, metadata in self.model_metadata.items():
if metadata['tenant_id'] == tenant_id:
tenant_models.append(metadata)
return tenant_models
async def cleanup_old_models(self, days_old: int = 30):
"""Clean up old model files"""
try:
cutoff_date = datetime.now() - timedelta(days=days_old)
for model_path in Path(settings.MODEL_STORAGE_PATH).glob("*.pkl"):
# Check file modification time
if model_path.stat().st_mtime < cutoff_date.timestamp():
# Remove model and metadata files
model_path.unlink()
metadata_path = model_path.with_name(model_path.name.replace('.pkl', '_metadata.json')) # matches the *_metadata.json naming used in _store_model
if metadata_path.exists():
metadata_path.unlink()
logger.info(f"Cleaned up old model: {model_path}")
except Exception as e:
logger.error(f"Error during model cleanup: {e}")


@@ -1,174 +1,372 @@
# services/training/app/ml/trainer.py
-"""
-ML Training implementation
-"""
-import asyncio
-import structlog
-from typing import Dict, Any, List
-import pandas as pd
-from datetime import datetime
-import joblib
-import os
-from prophet import Prophet
-import numpy as np
-from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
-from app.core.config import settings
-
-logger = structlog.get_logger()
-
-class MLTrainer:
-    """ML training implementation"""
-
-    def __init__(self):
-        self.model_storage_path = settings.MODEL_STORAGE_PATH
-        os.makedirs(self.model_storage_path, exist_ok=True)
-
-    async def train_models(self, training_data: Dict[str, Any], job_id: str, db) -> Dict[str, Any]:
-        """Train models for all products"""
-        models_result = {}
-        # Get sales data
-        sales_data = training_data.get("sales_data", [])
-        external_data = training_data.get("external_data", {})
-        # Group by product
-        products_data = self._group_by_product(sales_data)
-        # Train model for each product
-        for product_name, product_sales in products_data.items():
-            try:
-                model_result = await self._train_product_model(
-                    product_name,
-                    product_sales,
-                    external_data,
-                    job_id
-                )
-                models_result[product_name] = model_result
-            except Exception as e:
-                logger.error(f"Failed to train model for {product_name}: {e}")
-        return models_result
-
-    def _group_by_product(self, sales_data: List[Dict]) -> Dict[str, List[Dict]]:
-        """Group sales data by product"""
-        products = {}
-        for sale in sales_data:
-            product_name = sale.get("product_name")
-            if product_name not in products:
-                products[product_name] = []
-            products[product_name].append(sale)
-        return products
-
-    async def _train_product_model(self, product_name: str, sales_data: List[Dict], external_data: Dict, job_id: str) -> Dict[str, Any]:
-        """Train Prophet model for a single product"""
-        # Convert to DataFrame
-        df = pd.DataFrame(sales_data)
-        df['date'] = pd.to_datetime(df['date'])
-        # Aggregate daily sales
-        daily_sales = df.groupby('date')['quantity_sold'].sum().reset_index()
-        daily_sales.columns = ['ds', 'y']
-        # Add external features
-        daily_sales = self._add_external_features(daily_sales, external_data)
-        # Train Prophet model
-        model = Prophet(
-            seasonality_mode=settings.PROPHET_SEASONALITY_MODE,
-            daily_seasonality=settings.PROPHET_DAILY_SEASONALITY,
-            weekly_seasonality=settings.PROPHET_WEEKLY_SEASONALITY,
-            yearly_seasonality=settings.PROPHET_YEARLY_SEASONALITY
-        )
-        # Add regressors
-        model.add_regressor('temperature')
-        model.add_regressor('humidity')
-        model.add_regressor('precipitation')
-        model.add_regressor('traffic_volume')
-        # Fit model
-        model.fit(daily_sales)
-        # Save model
-        model_path = os.path.join(
-            self.model_storage_path,
-            f"{job_id}_{product_name}_prophet_model.pkl"
-        )
-        joblib.dump(model, model_path)
-        return {
-            "type": "prophet",
-            "path": model_path,
-            "training_samples": len(daily_sales),
-            "features": ["temperature", "humidity", "precipitation", "traffic_volume"],
-            "hyperparameters": {
-                "seasonality_mode": settings.PROPHET_SEASONALITY_MODE,
-                "daily_seasonality": settings.PROPHET_DAILY_SEASONALITY,
-                "weekly_seasonality": settings.PROPHET_WEEKLY_SEASONALITY,
-                "yearly_seasonality": settings.PROPHET_YEARLY_SEASONALITY
-            }
-        }
-
-    def _add_external_features(self, daily_sales: pd.DataFrame, external_data: Dict) -> pd.DataFrame:
-        """Add external features to sales data"""
-        # Add weather data
-        weather_data = external_data.get("weather", [])
-        if weather_data:
-            weather_df = pd.DataFrame(weather_data)
-            weather_df['ds'] = pd.to_datetime(weather_df['date'])
-            daily_sales = daily_sales.merge(weather_df[['ds', 'temperature', 'humidity', 'precipitation']], on='ds', how='left')
-        # Add traffic data
-        traffic_data = external_data.get("traffic", [])
-        if traffic_data:
-            traffic_df = pd.DataFrame(traffic_data)
-            traffic_df['ds'] = pd.to_datetime(traffic_df['date'])
-            daily_sales = daily_sales.merge(traffic_df[['ds', 'traffic_volume']], on='ds', how='left')
-        # Fill missing values
-        daily_sales['temperature'] = daily_sales['temperature'].fillna(daily_sales['temperature'].mean())
-        daily_sales['humidity'] = daily_sales['humidity'].fillna(daily_sales['humidity'].mean())
-        daily_sales['precipitation'] = daily_sales['precipitation'].fillna(0)
-        daily_sales['traffic_volume'] = daily_sales['traffic_volume'].fillna(daily_sales['traffic_volume'].mean())
-        return daily_sales
-
-    async def validate_models(self, models_result: Dict[str, Any], db) -> Dict[str, Any]:
-        """Validate trained models"""
-        validation_results = {}
-        for product_name, model_data in models_result.items():
-            try:
-                # Load model
-                model_path = model_data.get("path")
-                model = joblib.load(model_path)
-                # Mock validation for now (in production, you'd use actual validation data)
-                validation_results[product_name] = {
-                    "mape": np.random.uniform(10, 25),  # Mock MAPE between 10-25%
-                    "rmse": np.random.uniform(8, 15),   # Mock RMSE
-                    "mae": np.random.uniform(5, 12),    # Mock MAE
-                    "r2_score": np.random.uniform(0.7, 0.9)  # Mock R2 score
-                }
-            except Exception as e:
-                logger.error(f"Validation failed for {product_name}: {e}")
-                validation_results[product_name] = {
-                    "mape": None,
-                    "rmse": None,
-                    "mae": None,
-                    "r2_score": None,
-                    "error": str(e)
-                }
-        return validation_results
+"""
+ML Trainer for Training Service
+Orchestrates the complete training process
+"""
+from typing import Dict, List, Any, Optional, Tuple
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+import logging
+import asyncio
+import uuid
+from pathlib import Path
+from app.ml.prophet_manager import BakeryProphetManager
+from app.ml.data_processor import BakeryDataProcessor
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+class BakeryMLTrainer:
+    """
+    Main ML trainer that orchestrates the complete training process.
+    Replaces the old Celery-based training system with a clean async implementation.
+    """
+
+    def __init__(self):
+        self.prophet_manager = BakeryProphetManager()
+        self.data_processor = BakeryDataProcessor()
+
+    async def train_tenant_models(self,
+                                  tenant_id: str,
+                                  sales_data: List[Dict],
+                                  weather_data: List[Dict] = None,
+                                  traffic_data: List[Dict] = None,
+                                  job_id: str = None) -> Dict[str, Any]:
+        """
+        Train models for all products of a tenant.
+
+        Args:
+            tenant_id: Tenant identifier
+            sales_data: Historical sales data
+            weather_data: Weather data (optional)
+            traffic_data: Traffic data (optional)
+            job_id: Training job identifier
+
+        Returns:
+            Dictionary with training results for each product
+        """
+        if not job_id:
+            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"
+        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")
+        try:
+            # Convert input data to DataFrames
+            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
+            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
+            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()
+            # Validate input data
+            await self._validate_input_data(sales_df, tenant_id)
+            # Get unique products
+            products = sales_df['product_name'].unique().tolist()
+            logger.info(f"Training models for {len(products)} products: {products}")
+            # Process data for each product
+            processed_data = await self._process_all_products(
+                sales_df, weather_df, traffic_df, products
+            )
+            # Train models for each product
+            training_results = await self._train_all_models(
+                tenant_id, processed_data, job_id
+            )
+            # Calculate overall training summary
+            summary = self._calculate_training_summary(training_results)
+            result = {
+                "job_id": job_id,
+                "tenant_id": tenant_id,
+                "status": "completed",
+                "products_trained": len([r for r in training_results.values() if r.get('status') == 'success']),
+                "products_failed": len([r for r in training_results.values() if r.get('status') == 'error']),
+                "total_products": len(products),
+                "training_results": training_results,
+                "summary": summary,
+                "completed_at": datetime.now().isoformat()
+            }
+            logger.info(f"Training job {job_id} completed successfully")
+            return result
+        except Exception as e:
+            logger.error(f"Training job {job_id} failed: {str(e)}")
+            raise
+
+    async def train_single_product(self,
+                                   tenant_id: str,
+                                   product_name: str,
+                                   sales_data: List[Dict],
+                                   weather_data: List[Dict] = None,
+                                   traffic_data: List[Dict] = None,
+                                   job_id: str = None) -> Dict[str, Any]:
+        """
+        Train model for a single product.
+
+        Args:
+            tenant_id: Tenant identifier
+            product_name: Product name
+            sales_data: Historical sales data
+            weather_data: Weather data (optional)
+            traffic_data: Traffic data (optional)
+            job_id: Training job identifier
+
+        Returns:
+            Training result for the product
+        """
+        if not job_id:
+            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"
+        logger.info(f"Starting single product training {job_id} for {product_name}")
+        try:
+            # Convert input data to DataFrames
+            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
+            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
+            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()
+            # Filter sales data for the specific product
+            product_sales = sales_df[sales_df['product_name'] == product_name].copy()
+            # Validate product data
+            if product_sales.empty:
+                raise ValueError(f"No sales data found for product: {product_name}")
+            # Prepare training data
+            processed_data = await self.data_processor.prepare_training_data(
+                sales_data=product_sales,
+                weather_data=weather_df,
+                traffic_data=traffic_df,
+                product_name=product_name
+            )
+            # Train the model
+            model_info = await self.prophet_manager.train_bakery_model(
+                tenant_id=tenant_id,
+                product_name=product_name,
+                df=processed_data,
+                job_id=job_id
+            )
+            result = {
+                "job_id": job_id,
+                "tenant_id": tenant_id,
+                "product_name": product_name,
+                "status": "success",
+                "model_info": model_info,
+                "data_points": len(processed_data),
+                "completed_at": datetime.now().isoformat()
+            }
+            logger.info(f"Single product training {job_id} completed successfully")
+            return result
+        except Exception as e:
+            logger.error(f"Single product training {job_id} failed: {str(e)}")
+            raise
+
+    async def evaluate_model_performance(self,
+                                         tenant_id: str,
+                                         product_name: str,
+                                         model_path: str,
+                                         test_data: List[Dict]) -> Dict[str, Any]:
+        """
+        Evaluate model performance on test data.
+
+        Args:
+            tenant_id: Tenant identifier
+            product_name: Product name
+            model_path: Path to the trained model
+            test_data: Test data for evaluation
+
+        Returns:
+            Performance metrics
+        """
+        try:
+            logger.info(f"Evaluating model performance for {product_name}")
+            # Convert test data to DataFrame
+            test_df = pd.DataFrame(test_data)
+            # Prepare test data
+            test_prepared = await self.data_processor.prepare_prediction_features(
+                future_dates=test_df['ds'],
+                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
+                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
+            )
+            # Get regressor columns
+            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]
+            # Generate predictions
+            forecast = await self.prophet_manager.generate_forecast(
+                model_path=model_path,
+                future_dates=test_prepared,
+                regressor_columns=regressor_columns
+            )
+            # Calculate performance metrics if we have actual values
+            metrics = {}
+            if 'y' in test_df.columns:
+                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+                y_true = test_df['y'].values
+                y_pred = forecast['yhat'].values
+                # MAPE is computed over non-zero actuals to avoid division by zero on zero-sales days
+                nonzero = y_true != 0
+                metrics = {
+                    "mae": float(mean_absolute_error(y_true, y_pred)),
+                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
+                    "mape": float(np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100) if nonzero.any() else 0.0,
+                    "r2_score": float(r2_score(y_true, y_pred))
+                }
+            result = {
+                "tenant_id": tenant_id,
+                "product_name": product_name,
+                "evaluation_metrics": metrics,
+                "forecast_samples": len(forecast),
+                "evaluated_at": datetime.now().isoformat()
+            }
+            return result
+        except Exception as e:
+            logger.error(f"Model evaluation failed: {str(e)}")
+            raise
+
+    async def _validate_input_data(self, sales_df: pd.DataFrame, tenant_id: str):
+        """Validate input sales data"""
+        if sales_df.empty:
+            raise ValueError(f"No sales data provided for tenant {tenant_id}")
+        required_columns = ['date', 'product_name', 'quantity']
+        missing_columns = [col for col in required_columns if col not in sales_df.columns]
+        if missing_columns:
+            raise ValueError(f"Missing required columns: {missing_columns}")
+        # Check for valid dates
+        try:
+            sales_df['date'] = pd.to_datetime(sales_df['date'])
+        except Exception:
+            raise ValueError("Invalid date format in sales data")
+        # Check for valid quantities (accept any numeric dtype, including int32/float32)
+        if not pd.api.types.is_numeric_dtype(sales_df['quantity']):
+            raise ValueError("Quantity column must be numeric")
+
+    async def _process_all_products(self,
+                                    sales_df: pd.DataFrame,
+                                    weather_df: pd.DataFrame,
+                                    traffic_df: pd.DataFrame,
+                                    products: List[str]) -> Dict[str, pd.DataFrame]:
+        """Process data for all products"""
+        processed_data = {}
+        for product_name in products:
+            try:
+                logger.info(f"Processing data for product: {product_name}")
+                # Filter sales data for this product
+                product_sales = sales_df[sales_df['product_name'] == product_name].copy()
+                # Process the product data
+                processed_product_data = await self.data_processor.prepare_training_data(
+                    sales_data=product_sales,
+                    weather_data=weather_df,
+                    traffic_data=traffic_df,
+                    product_name=product_name
+                )
+                processed_data[product_name] = processed_product_data
+                logger.info(f"Processed {len(processed_product_data)} data points for {product_name}")
+            except Exception as e:
+                logger.error(f"Failed to process data for {product_name}: {str(e)}")
+                # Continue with other products
+                continue
+        return processed_data
+
+    async def _train_all_models(self,
+                                tenant_id: str,
+                                processed_data: Dict[str, pd.DataFrame],
+                                job_id: str) -> Dict[str, Any]:
+        """Train models for all processed products"""
+        training_results = {}
+        for product_name, product_data in processed_data.items():
+            try:
+                logger.info(f"Training model for product: {product_name}")
+                # Check if we have enough data
+                if len(product_data) < settings.MIN_TRAINING_DATA_DAYS:
+                    training_results[product_name] = {
+                        'status': 'skipped',
+                        'reason': 'insufficient_data',
+                        'data_points': len(product_data),
+                        'min_required': settings.MIN_TRAINING_DATA_DAYS
+                    }
+                    continue
+                # Train the model
+                model_info = await self.prophet_manager.train_bakery_model(
+                    tenant_id=tenant_id,
+                    product_name=product_name,
+                    df=product_data,
+                    job_id=job_id
+                )
+                training_results[product_name] = {
+                    'status': 'success',
+                    'model_info': model_info,
+                    'data_points': len(product_data),
+                    'trained_at': datetime.now().isoformat()
+                }
+                logger.info(f"Successfully trained model for {product_name}")
+            except Exception as e:
+                logger.error(f"Failed to train model for {product_name}: {str(e)}")
+                training_results[product_name] = {
+                    'status': 'error',
+                    'error_message': str(e),
+                    'data_points': len(product_data) if product_data is not None else 0
+                }
+        return training_results
+
+    def _calculate_training_summary(self, training_results: Dict[str, Any]) -> Dict[str, Any]:
+        """Calculate summary statistics from training results"""
+        total_products = len(training_results)
+        successful_products = len([r for r in training_results.values() if r.get('status') == 'success'])
+        failed_products = len([r for r in training_results.values() if r.get('status') == 'error'])
+        skipped_products = len([r for r in training_results.values() if r.get('status') == 'skipped'])
+        # Calculate average training metrics for successful models
+        successful_results = [r for r in training_results.values() if r.get('status') == 'success']
+        avg_metrics = {}
+        if successful_results:
+            metrics_list = [r['model_info'].get('training_metrics', {}) for r in successful_results]
+            if metrics_list and all(metrics_list):
+                avg_metrics = {
+                    'avg_mae': np.mean([m.get('mae', 0) for m in metrics_list]),
+                    'avg_rmse': np.mean([m.get('rmse', 0) for m in metrics_list]),
+                    'avg_mape': np.mean([m.get('mape', 0) for m in metrics_list]),
+                    'avg_r2': np.mean([m.get('r2_score', 0) for m in metrics_list])
+                }
+        return {
+            'total_products': total_products,
+            'successful_products': successful_products,
+            'failed_products': failed_products,
+            'skipped_products': skipped_products,
+            'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
+            'average_metrics': avg_metrics
+        }
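
Finally, a sketch of how a caller might drive the new trainer end to end. The row payloads mirror the required date/product_name/quantity columns; the tenant name, job id, and the assumption that 90 days clears settings.MIN_TRAINING_DATA_DAYS are all illustrative:

import asyncio
import pandas as pd
from app.ml.trainer import BakeryMLTrainer

async def demo():
    days = pd.date_range('2025-01-01', periods=90, freq='D')
    sales_rows = [
        {'date': d.strftime('%Y-%m-%d'), 'product_name': 'croissant', 'quantity': 20 + d.dayofweek}
        for d in days
    ]
    trainer = BakeryMLTrainer()
    result = await trainer.train_tenant_models(
        tenant_id='tenant-1',
        sales_data=sales_rows,  # weather/traffic omitted, so neutral defaults are filled in
        job_id='local-smoke-test',
    )
    print(result['summary'])

asyncio.run(demo())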