# services/training/app/ml/trainer.py
"""
ML Trainer for Training Service
Orchestrates the complete training process
"""

from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import asyncio
import uuid
from pathlib import Path

from app.ml.prophet_manager import BakeryProphetManager
from app.ml.data_processor import BakeryDataProcessor
from app.core.config import settings

logger = logging.getLogger(__name__)


class BakeryMLTrainer:
    """
    Main ML trainer that orchestrates the complete training process.
    Replaces the old Celery-based training system with clean async implementation.
    """

    def __init__(self):
        self.prophet_manager = BakeryProphetManager()
        self.data_processor = BakeryDataProcessor()

    @staticmethod
    def _to_dataframe(records: Optional[List[Dict]]) -> pd.DataFrame:
        """Build a DataFrame from a list of records; empty/None gives an empty frame."""
        return pd.DataFrame(records) if records else pd.DataFrame()

    @staticmethod
    def _safe_mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """
        Mean absolute percentage error computed over non-zero actuals only.

        Rows where the actual value is zero are excluded because the
        percentage error is undefined there (division by zero). Returns NaN
        when no non-zero actual value is available.
        """
        nonzero = y_true != 0
        if not np.any(nonzero):
            return float("nan")
        relative_error = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        return float(np.mean(relative_error) * 100)

    async def train_tenant_models(self,
                                  tenant_id: str,
                                  sales_data: List[Dict],
                                  weather_data: Optional[List[Dict]] = None,
                                  traffic_data: Optional[List[Dict]] = None,
                                  job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Train models for all products of a tenant.

        Args:
            tenant_id: Tenant identifier
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier; generated when not supplied

        Returns:
            Dictionary with the job metadata, per-product training results
            (including products whose data processing failed, reported with
            status 'error') and an aggregated summary.

        Raises:
            ValueError: If the sales data fails validation.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")

        try:
            # Convert input data to DataFrames
            sales_df = self._to_dataframe(sales_data)
            weather_df = self._to_dataframe(weather_data)
            traffic_df = self._to_dataframe(traffic_data)

            # Validate input data
            await self._validate_input_data(sales_df, tenant_id)

            # Get unique products
            products = sales_df['product_name'].unique().tolist()
            logger.info(f"Training models for {len(products)} products: {products}")

            # Process data for each product
            processed_data = await self._process_all_products(
                sales_df, weather_df, traffic_df, products
            )

            # Train models for each product
            training_results = await self._train_all_models(
                tenant_id, processed_data, job_id
            )

            # Products dropped during preprocessing would otherwise vanish from
            # the results, leaving the counts inconsistent with total_products.
            for product_name in products:
                if product_name not in processed_data:
                    training_results.setdefault(product_name, {
                        'status': 'error',
                        'error_message': 'Data processing failed',
                        'data_points': 0
                    })

            # Calculate overall training summary
            summary = self._calculate_training_summary(training_results)

            statuses = [r.get('status') for r in training_results.values()]
            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "status": "completed",
                "products_trained": statuses.count('success'),
                "products_failed": statuses.count('error'),
                "total_products": len(products),
                "training_results": training_results,
                "summary": summary,
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Training job {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Training job {job_id} failed: {str(e)}")
            raise

    async def train_single_product(self,
                                   tenant_id: str,
                                   product_name: str,
                                   sales_data: List[Dict],
                                   weather_data: Optional[List[Dict]] = None,
                                   traffic_data: Optional[List[Dict]] = None,
                                   job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Train model for a single product.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier; generated when not supplied

        Returns:
            Training result for the product

        Raises:
            ValueError: If no sales rows exist for the product (including
                empty input or input without a 'product_name' column).
            Exception: Any other failure is logged and re-raised unchanged.
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting single product training {job_id} for {product_name}")

        try:
            # Convert input data to DataFrames
            sales_df = self._to_dataframe(sales_data)
            weather_df = self._to_dataframe(weather_data)
            traffic_df = self._to_dataframe(traffic_data)

            # An empty frame has no 'product_name' column; report that as
            # missing data rather than letting a KeyError escape.
            if 'product_name' not in sales_df.columns:
                raise ValueError(f"No sales data found for product: {product_name}")

            # Filter sales data for the specific product
            product_sales = sales_df[sales_df['product_name'] == product_name].copy()

            # Validate product data
            if product_sales.empty:
                raise ValueError(f"No sales data found for product: {product_name}")

            # Prepare training data
            processed_data = await self.data_processor.prepare_training_data(
                sales_data=product_sales,
                weather_data=weather_df,
                traffic_data=traffic_df,
                product_name=product_name
            )

            # Train the model
            model_info = await self.prophet_manager.train_bakery_model(
                tenant_id=tenant_id,
                product_name=product_name,
                df=processed_data,
                job_id=job_id
            )

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "product_name": product_name,
                "status": "success",
                "model_info": model_info,
                "data_points": len(processed_data),
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Single product training {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Single product training {job_id} failed: {str(e)}")
            raise

    async def evaluate_model_performance(self,
                                         tenant_id: str,
                                         product_name: str,
                                         model_path: str,
                                         test_data: List[Dict]) -> Dict[str, Any]:
        """
        Evaluate model performance on test data.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            model_path: Path to the trained model
            test_data: Test data for evaluation; must contain a 'ds' column
                and, for metrics to be computed, a 'y' column

        Returns:
            Performance metrics. 'evaluation_metrics' is empty when the test
            data has no 'y' column. 'mape' is computed over non-zero actuals
            only and is NaN when every actual value is zero.

        Raises:
            ValueError: If the test data has no 'ds' column.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Evaluating model performance for {product_name}")

            # Convert test data to DataFrame
            test_df = pd.DataFrame(test_data)

            if 'ds' not in test_df.columns:
                raise ValueError("Test data must contain a 'ds' column")

            # Prepare test data
            test_prepared = await self.data_processor.prepare_prediction_features(
                future_dates=test_df['ds'],
                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
            )

            # Get regressor columns
            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]

            # Generate predictions
            forecast = await self.prophet_manager.generate_forecast(
                model_path=model_path,
                future_dates=test_prepared,
                regressor_columns=regressor_columns
            )

            # Calculate performance metrics if we have actual values
            metrics = {}
            if 'y' in test_df.columns:
                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

                y_true = np.asarray(test_df['y'].values, dtype=float)
                y_pred = np.asarray(forecast['yhat'].values, dtype=float)

                metrics = {
                    "mae": float(mean_absolute_error(y_true, y_pred)),
                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
                    "mape": self._safe_mape(y_true, y_pred),
                    "r2_score": float(r2_score(y_true, y_pred))
                }

            result = {
                "tenant_id": tenant_id,
                "product_name": product_name,
                "evaluation_metrics": metrics,
                "forecast_samples": len(forecast),
                "evaluated_at": datetime.now().isoformat()
            }

            return result

        except Exception as e:
            logger.error(f"Model evaluation failed: {str(e)}")
            raise

    async def _validate_input_data(self, sales_df: pd.DataFrame, tenant_id: str):
        """
        Validate input sales data.

        Converts the 'date' column of ``sales_df`` to datetimes in place.

        Raises:
            ValueError: If the frame is empty, a required column is missing,
                the dates cannot be parsed, or 'quantity' is not numeric.
        """
        if sales_df.empty:
            raise ValueError(f"No sales data provided for tenant {tenant_id}")

        required_columns = ['date', 'product_name', 'quantity']
        missing_columns = [col for col in required_columns if col not in sales_df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check for valid dates
        try:
            sales_df['date'] = pd.to_datetime(sales_df['date'])
        except Exception as exc:
            raise ValueError("Invalid date format in sales data") from exc

        # Check for valid quantities. Accept every numeric dtype (int32,
        # float32, nullable integers, ...) instead of only int64/float64;
        # booleans are excluded because they are not quantities.
        quantity_dtype = sales_df['quantity'].dtype
        if (not pd.api.types.is_numeric_dtype(quantity_dtype)
                or pd.api.types.is_bool_dtype(quantity_dtype)):
            raise ValueError("Quantity column must be numeric")

    async def _process_all_products(self,
                                    sales_df: pd.DataFrame,
                                    weather_df: pd.DataFrame,
                                    traffic_df: pd.DataFrame,
                                    products: List[str]) -> Dict[str, pd.DataFrame]:
        """
        Process data for all products.

        Products whose processing raises are logged and omitted from the
        returned mapping so the remaining products can still be processed.
        """
        processed_data = {}

        for product_name in products:
            try:
                logger.info(f"Processing data for product: {product_name}")

                # Filter sales data for this product
                product_sales = sales_df[sales_df['product_name'] == product_name].copy()

                # Process the product data
                processed_product_data = await self.data_processor.prepare_training_data(
                    sales_data=product_sales,
                    weather_data=weather_df,
                    traffic_data=traffic_df,
                    product_name=product_name
                )

                processed_data[product_name] = processed_product_data
                logger.info(f"Processed {len(processed_product_data)} data points for {product_name}")

            except Exception as e:
                logger.error(f"Failed to process data for {product_name}: {str(e)}")
                # Continue with other products
                continue

        return processed_data

    async def _train_all_models(self,
                                tenant_id: str,
                                processed_data: Dict[str, pd.DataFrame],
                                job_id: str) -> Dict[str, Any]:
        """
        Train models for all processed products.

        Each product ends up with a result whose 'status' is 'success',
        'skipped' (fewer rows than settings.MIN_TRAINING_DATA_DAYS) or
        'error' (training raised).
        """
        training_results = {}

        for product_name, product_data in processed_data.items():
            try:
                logger.info(f"Training model for product: {product_name}")

                # Check if we have enough data
                if len(product_data) < settings.MIN_TRAINING_DATA_DAYS:
                    training_results[product_name] = {
                        'status': 'skipped',
                        'reason': 'insufficient_data',
                        'data_points': len(product_data),
                        'min_required': settings.MIN_TRAINING_DATA_DAYS
                    }
                    continue

                # Train the model
                model_info = await self.prophet_manager.train_bakery_model(
                    tenant_id=tenant_id,
                    product_name=product_name,
                    df=product_data,
                    job_id=job_id
                )

                training_results[product_name] = {
                    'status': 'success',
                    'model_info': model_info,
                    'data_points': len(product_data),
                    'trained_at': datetime.now().isoformat()
                }

                logger.info(f"Successfully trained model for {product_name}")

            except Exception as e:
                logger.error(f"Failed to train model for {product_name}: {str(e)}")
                training_results[product_name] = {
                    'status': 'error',
                    'error_message': str(e),
                    'data_points': len(product_data) if product_data is not None else 0
                }

        return training_results

    def _calculate_training_summary(self, training_results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Calculate summary statistics from training results.

        Average metrics are only reported when every successful model exposes
        a non-empty 'training_metrics' mapping; otherwise 'average_metrics'
        is an empty dict.
        """
        total_products = len(training_results)
        statuses = [r.get('status') for r in training_results.values()]
        successful_products = statuses.count('success')
        failed_products = statuses.count('error')
        skipped_products = statuses.count('skipped')

        # Calculate average training metrics for successful models
        successful_results = [r for r in training_results.values() if r.get('status') == 'success']

        avg_metrics = {}
        if successful_results:
            # Tolerate a missing/None model_info or training_metrics entry.
            metrics_list = [
                (r.get('model_info') or {}).get('training_metrics') or {}
                for r in successful_results
            ]

            if metrics_list and all(metrics_list):
                # Cast to plain floats so the summary holds no NumPy scalars.
                avg_metrics = {
                    'avg_mae': float(np.mean([m.get('mae', 0) for m in metrics_list])),
                    'avg_rmse': float(np.mean([m.get('rmse', 0) for m in metrics_list])),
                    'avg_mape': float(np.mean([m.get('mape', 0) for m in metrics_list])),
                    'avg_r2': float(np.mean([m.get('r2_score', 0) for m in metrics_list]))
                }

        return {
            'total_products': total_products,
            'successful_products': successful_products,
            'failed_products': failed_products,
            'skipped_products': skipped_products,
            'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
            'average_metrics': avg_metrics
        }