Add all the code for training service

Urtzi Alfaro
2025-07-19 16:59:37 +02:00
parent 42097202d2
commit f3071c00bd
21 changed files with 7504 additions and 764 deletions


@@ -1,174 +1,372 @@
# services/training/app/ml/trainer.py
"""
ML Trainer for Training Service
Orchestrates the complete training process
"""
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import asyncio
import uuid
from pathlib import Path

from app.ml.prophet_manager import BakeryProphetManager
from app.ml.data_processor import BakeryDataProcessor
from app.core.config import settings

logger = logging.getLogger(__name__)

class BakeryMLTrainer:
    """
    Main ML trainer that orchestrates the complete training process.
    Replaces the old Celery-based training system with a clean async implementation.
    """

    def __init__(self):
        self.prophet_manager = BakeryProphetManager()
        self.data_processor = BakeryDataProcessor()

    async def train_tenant_models(self,
                                  tenant_id: str,
                                  sales_data: List[Dict],
                                  weather_data: List[Dict] = None,
                                  traffic_data: List[Dict] = None,
                                  job_id: str = None) -> Dict[str, Any]:
        """
        Train models for all products of a tenant.

        Args:
            tenant_id: Tenant identifier
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier

        Returns:
            Dictionary with training results for each product
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")

        try:
            # Convert input data to DataFrames
            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()

            # Validate input data
            await self._validate_input_data(sales_df, tenant_id)

            # Get unique products
            products = sales_df['product_name'].unique().tolist()
            logger.info(f"Training models for {len(products)} products: {products}")

            # Process data for each product
            processed_data = await self._process_all_products(
                sales_df, weather_df, traffic_df, products
            )

            # Train models for each product
            training_results = await self._train_all_models(
                tenant_id, processed_data, job_id
            )

            # Calculate overall training summary
            summary = self._calculate_training_summary(training_results)

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "status": "completed",
                "products_trained": len([r for r in training_results.values() if r.get('status') == 'success']),
                "products_failed": len([r for r in training_results.values() if r.get('status') == 'error']),
                "total_products": len(products),
                "training_results": training_results,
                "summary": summary,
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Training job {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Training job {job_id} failed: {str(e)}")
            raise
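
    # Usage sketch (illustrative only, not part of this commit): a FastAPI route or
    # background worker could drive a full tenant run roughly like this. The record
    # shape is an assumption inferred from _validate_input_data below.
    #
    #   trainer = BakeryMLTrainer()
    #   result = await trainer.train_tenant_models(
    #       tenant_id="tenant_123",
    #       sales_data=[{"date": "2025-01-01", "product_name": "baguette", "quantity": 42}],
    #   )
    #   print(result["summary"]["success_rate"])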

    async def train_single_product(self,
                                   tenant_id: str,
                                   product_name: str,
                                   sales_data: List[Dict],
                                   weather_data: List[Dict] = None,
                                   traffic_data: List[Dict] = None,
                                   job_id: str = None) -> Dict[str, Any]:
        """
        Train model for a single product.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier

        Returns:
            Training result for the product
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting single product training {job_id} for {product_name}")

        try:
            # Convert input data to DataFrames
            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()

            # Filter sales data for the specific product
            product_sales = sales_df[sales_df['product_name'] == product_name].copy()

            # Validate product data
            if product_sales.empty:
                raise ValueError(f"No sales data found for product: {product_name}")

            # Prepare training data
            processed_data = await self.data_processor.prepare_training_data(
                sales_data=product_sales,
                weather_data=weather_df,
                traffic_data=traffic_df,
                product_name=product_name
            )

            # Train the model
            model_info = await self.prophet_manager.train_bakery_model(
                tenant_id=tenant_id,
                product_name=product_name,
                df=processed_data,
                job_id=job_id
            )

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "product_name": product_name,
                "status": "success",
                "model_info": model_info,
                "data_points": len(processed_data),
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Single product training {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Single product training {job_id} failed: {str(e)}")
            raise

    async def evaluate_model_performance(self,
                                         tenant_id: str,
                                         product_name: str,
                                         model_path: str,
                                         test_data: List[Dict]) -> Dict[str, Any]:
        """
        Evaluate model performance on test data.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            model_path: Path to the trained model
            test_data: Test data for evaluation

        Returns:
            Performance metrics
        """
        try:
            logger.info(f"Evaluating model performance for {product_name}")

            # Convert test data to DataFrame
            test_df = pd.DataFrame(test_data)

            # Prepare test data
            test_prepared = await self.data_processor.prepare_prediction_features(
                future_dates=test_df['ds'],
                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
            )

            # Get regressor columns
            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]

            # Generate predictions
            forecast = await self.prophet_manager.generate_forecast(
                model_path=model_path,
                future_dates=test_prepared,
                regressor_columns=regressor_columns
            )

            # Calculate performance metrics if we have actual values
            metrics = {}
            if 'y' in test_df.columns:
                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

                y_true = test_df['y'].values
                y_pred = forecast['yhat'].values

                metrics = {
                    "mae": float(mean_absolute_error(y_true, y_pred)),
                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
                    "mape": float(np.mean(np.abs((y_true - y_pred) / y_true)) * 100),
                    "r2_score": float(r2_score(y_true, y_pred))
                }

            result = {
                "tenant_id": tenant_id,
                "product_name": product_name,
                "evaluation_metrics": metrics,
                "forecast_samples": len(forecast),
                "evaluated_at": datetime.now().isoformat()
            }

            return result

        except Exception as e:
            logger.error(f"Model evaluation failed: {str(e)}")
            raise
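
    # Note on the metrics above: MAPE divides by y_true, so it is undefined when
    # any actual value is zero. A guarded variant (an assumption, not what this
    # method currently does) would mask zero actuals first:
    #
    #   nonzero = y_true != 0
    #   mape = float(np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100)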

    async def _validate_input_data(self, sales_df: pd.DataFrame, tenant_id: str):
        """Validate input sales data"""
        if sales_df.empty:
            raise ValueError(f"No sales data provided for tenant {tenant_id}")

        required_columns = ['date', 'product_name', 'quantity']
        missing_columns = [col for col in required_columns if col not in sales_df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check for valid dates
        try:
            sales_df['date'] = pd.to_datetime(sales_df['date'])
        except Exception:
            raise ValueError("Invalid date format in sales data")

        # Check for valid quantities
        if sales_df['quantity'].dtype not in ['int64', 'float64']:
            raise ValueError("Quantity column must be numeric")

    async def _process_all_products(self,
                                    sales_df: pd.DataFrame,
                                    weather_df: pd.DataFrame,
                                    traffic_df: pd.DataFrame,
                                    products: List[str]) -> Dict[str, pd.DataFrame]:
        """Process data for all products"""
        processed_data = {}

        for product_name in products:
            try:
                logger.info(f"Processing data for product: {product_name}")

                # Filter sales data for this product
                product_sales = sales_df[sales_df['product_name'] == product_name].copy()

                # Process the product data
                processed_product_data = await self.data_processor.prepare_training_data(
                    sales_data=product_sales,
                    weather_data=weather_df,
                    traffic_data=traffic_df,
                    product_name=product_name
                )

                processed_data[product_name] = processed_product_data
                logger.info(f"Processed {len(processed_product_data)} data points for {product_name}")

            except Exception as e:
                logger.error(f"Failed to process data for {product_name}: {str(e)}")
                # Continue with other products
                continue

        return processed_data
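
    # Design note: products are processed sequentially and failures are skipped, so
    # one bad product cannot abort the whole job. If prepare_training_data were
    # I/O-bound, a bounded asyncio.gather fan-out would be a possible variation
    # (an assumption, not current behavior).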

    async def _train_all_models(self,
                                tenant_id: str,
                                processed_data: Dict[str, pd.DataFrame],
                                job_id: str) -> Dict[str, Any]:
        """Train models for all processed products"""
        training_results = {}

        for product_name, product_data in processed_data.items():
            try:
                logger.info(f"Training model for product: {product_name}")

                # Check if we have enough data
                if len(product_data) < settings.MIN_TRAINING_DATA_DAYS:
                    training_results[product_name] = {
                        'status': 'skipped',
                        'reason': 'insufficient_data',
                        'data_points': len(product_data),
                        'min_required': settings.MIN_TRAINING_DATA_DAYS
                    }
                    continue

                # Train the model
                model_info = await self.prophet_manager.train_bakery_model(
                    tenant_id=tenant_id,
                    product_name=product_name,
                    df=product_data,
                    job_id=job_id
                )

                training_results[product_name] = {
                    'status': 'success',
                    'model_info': model_info,
                    'data_points': len(product_data),
                    'trained_at': datetime.now().isoformat()
                }

                logger.info(f"Successfully trained model for {product_name}")

            except Exception as e:
                logger.error(f"Failed to train model for {product_name}: {str(e)}")
                training_results[product_name] = {
                    'status': 'error',
                    'error_message': str(e),
                    'data_points': len(product_data) if product_data is not None else 0
                }

        return training_results
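
    # Shape of one training_results entry (illustrative values):
    #
    #   "baguette": {"status": "success", "model_info": {...},
    #                "data_points": 180, "trained_at": "2025-07-19T16:59:37"}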

    def _calculate_training_summary(self, training_results: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate summary statistics from training results"""
        total_products = len(training_results)
        successful_products = len([r for r in training_results.values() if r.get('status') == 'success'])
        failed_products = len([r for r in training_results.values() if r.get('status') == 'error'])
        skipped_products = len([r for r in training_results.values() if r.get('status') == 'skipped'])

        # Calculate average training metrics for successful models
        successful_results = [r for r in training_results.values() if r.get('status') == 'success']
        avg_metrics = {}

        if successful_results:
            metrics_list = [r['model_info'].get('training_metrics', {}) for r in successful_results]
            if metrics_list and all(metrics_list):
                avg_metrics = {
                    'avg_mae': np.mean([m.get('mae', 0) for m in metrics_list]),
                    'avg_rmse': np.mean([m.get('rmse', 0) for m in metrics_list]),
                    'avg_mape': np.mean([m.get('mape', 0) for m in metrics_list]),
                    'avg_r2': np.mean([m.get('r2_score', 0) for m in metrics_list])
                }

        return {
            'total_products': total_products,
            'successful_products': successful_products,
            'failed_products': failed_products,
            'skipped_products': skipped_products,
            'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
            'average_metrics': avg_metrics
        }
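
    # Worked example for the summary: 8 successes out of 10 products yields
    # round(8 / 10 * 100, 2) == 80.0 for 'success_rate'.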