372 lines
15 KiB
Python
372 lines
15 KiB
Python
# services/training/app/ml/trainer.py
"""
ML Trainer for Training Service
Orchestrates the complete training process
"""
|
|
|
|
from typing import Dict, List, Any, Optional, Tuple
|
|
import pandas as pd
|
|
import numpy as np
|
|
from datetime import datetime, timedelta
|
|
import logging
|
|
import asyncio
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
from app.ml.prophet_manager import BakeryProphetManager
|
|
from app.ml.data_processor import BakeryDataProcessor
|
|
from app.core.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class BakeryMLTrainer:
    """
    Main ML trainer that orchestrates the complete training process.

    Replaces the old Celery-based training system with clean async implementation.

    The trainer converts raw record lists into DataFrames, validates them,
    delegates feature preparation to ``BakeryDataProcessor`` and model
    fitting / forecasting to ``BakeryProphetManager``, and assembles
    per-product results plus an aggregate summary.
    """

    # Metric names read from each model's ``training_metrics`` paired with the
    # key under which their average is reported in the training summary.
    _SUMMARY_METRICS = (
        ('mae', 'avg_mae'),
        ('rmse', 'avg_rmse'),
        ('mape', 'avg_mape'),
        ('r2_score', 'avg_r2'),
    )

    def __init__(self):
        self.prophet_manager = BakeryProphetManager()
        self.data_processor = BakeryDataProcessor()

    @staticmethod
    def _to_frame(records: Optional[List[Dict]]) -> pd.DataFrame:
        """Return ``records`` as a DataFrame; ``None`` or empty input gives an empty frame."""
        return pd.DataFrame(records) if records else pd.DataFrame()

    @staticmethod
    def _mean_absolute_percentage_error(y_true, y_pred) -> Optional[float]:
        """
        Compute MAPE (in percent) over the observations whose actual value is non-zero.

        Rows with a zero actual are excluded because the percentage error is
        undefined there and would otherwise turn the whole metric into
        inf/NaN.

        Returns:
            The MAPE as a float, or ``None`` when every actual value is zero.
        """
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        nonzero = actual != 0
        if not nonzero.any():
            return None
        relative_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
        return float(np.mean(relative_error) * 100)

    async def train_tenant_models(self,
                                  tenant_id: str,
                                  sales_data: List[Dict],
                                  weather_data: Optional[List[Dict]] = None,
                                  traffic_data: Optional[List[Dict]] = None,
                                  job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Train models for all products of a tenant.

        Args:
            tenant_id: Tenant identifier
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier; generated from the tenant id and
                a random suffix when omitted

        Returns:
            Dictionary with ``job_id``, ``tenant_id``, ``status``,
            ``products_trained``, ``products_failed``, ``total_products``,
            the per-product ``training_results``, a ``summary`` and a
            ``completed_at`` timestamp.

        Raises:
            ValueError: If the sales data is empty or fails validation.
            Exception: Any other error is logged and re-raised unchanged.
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")

        try:
            # Convert input data to DataFrames
            sales_df = self._to_frame(sales_data)
            weather_df = self._to_frame(weather_data)
            traffic_df = self._to_frame(traffic_data)

            # Validate input data (also converts the 'date' column in place)
            await self._validate_input_data(sales_df, tenant_id)

            # Get unique products
            products = sales_df['product_name'].unique().tolist()
            logger.info(f"Training models for {len(products)} products: {products}")

            # Process data for each product
            processed_data = await self._process_all_products(
                sales_df, weather_df, traffic_df, products
            )

            # Train models for each product
            training_results = await self._train_all_models(
                tenant_id, processed_data, job_id
            )

            # Products whose preprocessing raised never reach the training
            # step. Record them as failures so the outcome counts add up to
            # total_products instead of silently dropping them.
            for product_name in products:
                if product_name not in training_results:
                    training_results[product_name] = {
                        'status': 'error',
                        'error_message': 'Data preparation failed before training; see service logs for details',
                        'data_points': 0
                    }

            # Calculate overall training summary
            summary = self._calculate_training_summary(training_results)

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "status": "completed",
                "products_trained": summary['successful_products'],
                "products_failed": summary['failed_products'],
                "total_products": len(products),
                "training_results": training_results,
                "summary": summary,
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Training job {job_id} completed successfully")
            return result

        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception(f"Training job {job_id} failed: {str(e)}")
            raise

    async def train_single_product(self,
                                   tenant_id: str,
                                   product_name: str,
                                   sales_data: List[Dict],
                                   weather_data: Optional[List[Dict]] = None,
                                   traffic_data: Optional[List[Dict]] = None,
                                   job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Train model for a single product.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            job_id: Training job identifier; generated when omitted

        Returns:
            Training result for the product: ``job_id``, ``tenant_id``,
            ``product_name``, ``status``, ``model_info``, ``data_points`` and
            ``completed_at``.

        Raises:
            ValueError: If the sales data is empty, fails validation, or
                contains no rows for ``product_name``.
            Exception: Any other error is logged and re-raised unchanged.
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting single product training {job_id} for {product_name}")

        try:
            # Convert input data to DataFrames
            sales_df = self._to_frame(sales_data)
            weather_df = self._to_frame(weather_data)
            traffic_df = self._to_frame(traffic_data)

            # Apply the same validation as the tenant-wide path. Without it,
            # empty input surfaced as a bare KeyError on 'product_name'.
            await self._validate_input_data(sales_df, tenant_id)

            # Filter sales data for the specific product
            product_sales = sales_df[sales_df['product_name'] == product_name].copy()

            # Validate product data
            if product_sales.empty:
                raise ValueError(f"No sales data found for product: {product_name}")

            # Prepare training data
            processed_data = await self.data_processor.prepare_training_data(
                sales_data=product_sales,
                weather_data=weather_df,
                traffic_data=traffic_df,
                product_name=product_name
            )

            # Train the model
            model_info = await self.prophet_manager.train_bakery_model(
                tenant_id=tenant_id,
                product_name=product_name,
                df=processed_data,
                job_id=job_id
            )

            result = {
                "job_id": job_id,
                "tenant_id": tenant_id,
                "product_name": product_name,
                "status": "success",
                "model_info": model_info,
                "data_points": len(processed_data),
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Single product training {job_id} completed successfully")
            return result

        except Exception as e:
            logger.exception(f"Single product training {job_id} failed: {str(e)}")
            raise

    async def evaluate_model_performance(self,
                                         tenant_id: str,
                                         product_name: str,
                                         model_path: str,
                                         test_data: List[Dict]) -> Dict[str, Any]:
        """
        Evaluate model performance on test data.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            model_path: Path to the trained model
            test_data: Test data for evaluation; each record needs a ``ds``
                entry, and a ``y`` entry if metrics are wanted

        Returns:
            Dictionary with ``tenant_id``, ``product_name``,
            ``evaluation_metrics`` (empty when the test data has no ``y``
            column), ``forecast_samples`` and ``evaluated_at``. The ``mape``
            metric is ``None`` when every actual value is zero.

        Raises:
            ValueError: If the test data is empty or has no ``ds`` column.
            Exception: Any other error is logged and re-raised unchanged.
        """
        try:
            logger.info(f"Evaluating model performance for {product_name}")

            # Convert test data to DataFrame
            test_df = self._to_frame(test_data)
            if 'ds' not in test_df.columns:
                raise ValueError(
                    f"Test data for {product_name} is empty or has no 'ds' column"
                )

            # Prepare test data; the same frame doubles as weather/traffic
            # input when it carries the corresponding column.
            test_prepared = await self.data_processor.prepare_prediction_features(
                future_dates=test_df['ds'],
                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
            )

            # Get regressor columns
            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]

            # Generate predictions
            forecast = await self.prophet_manager.generate_forecast(
                model_path=model_path,
                future_dates=test_prepared,
                regressor_columns=regressor_columns
            )

            # Calculate performance metrics if we have actual values
            metrics = {}
            if 'y' in test_df.columns:
                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

                y_true = test_df['y'].values
                y_pred = forecast['yhat'].values

                metrics = {
                    "mae": float(mean_absolute_error(y_true, y_pred)),
                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
                    # Zero-sales rows would make a naive MAPE divide by zero.
                    "mape": self._mean_absolute_percentage_error(y_true, y_pred),
                    "r2_score": float(r2_score(y_true, y_pred))
                }

            result = {
                "tenant_id": tenant_id,
                "product_name": product_name,
                "evaluation_metrics": metrics,
                "forecast_samples": len(forecast),
                "evaluated_at": datetime.now().isoformat()
            }

            return result

        except Exception as e:
            logger.exception(f"Model evaluation failed: {str(e)}")
            raise

    async def _validate_input_data(self, sales_df: pd.DataFrame, tenant_id: str) -> None:
        """
        Validate input sales data.

        Checks that the frame is non-empty, has the ``date``,
        ``product_name`` and ``quantity`` columns, that ``date`` parses as
        datetimes and that ``quantity`` has a numeric dtype.

        Note: the ``date`` column of ``sales_df`` is converted to datetime
        in place.

        Raises:
            ValueError: If any of the checks fails.
        """
        if sales_df.empty:
            raise ValueError(f"No sales data provided for tenant {tenant_id}")

        required_columns = ['date', 'product_name', 'quantity']
        missing_columns = [col for col in required_columns if col not in sales_df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check for valid dates
        try:
            sales_df['date'] = pd.to_datetime(sales_df['date'])
        except Exception as exc:
            raise ValueError("Invalid date format in sales data") from exc

        # Check for valid quantities. Any numeric dtype is accepted (int32,
        # float32, nullable Int64, ...), not just int64/float64. Booleans are
        # still rejected even though pandas classes them as numeric.
        quantity = sales_df['quantity']
        if pd.api.types.is_bool_dtype(quantity) or not pd.api.types.is_numeric_dtype(quantity):
            raise ValueError("Quantity column must be numeric")

    async def _process_all_products(self,
                                    sales_df: pd.DataFrame,
                                    weather_df: pd.DataFrame,
                                    traffic_df: pd.DataFrame,
                                    products: List[str]) -> Dict[str, pd.DataFrame]:
        """
        Process data for all products.

        Each product's sales rows are passed, together with the shared
        weather and traffic frames, to the data processor. A product whose
        processing raises is logged and left out of the result so the
        remaining products are still handled.

        Returns:
            Mapping of product name to its prepared training DataFrame.
        """
        processed_data = {}

        for product_name in products:
            try:
                logger.info(f"Processing data for product: {product_name}")

                # Filter sales data for this product
                product_sales = sales_df[sales_df['product_name'] == product_name].copy()

                # Process the product data
                processed_product_data = await self.data_processor.prepare_training_data(
                    sales_data=product_sales,
                    weather_data=weather_df,
                    traffic_data=traffic_df,
                    product_name=product_name
                )

                processed_data[product_name] = processed_product_data
                logger.info(f"Processed {len(processed_product_data)} data points for {product_name}")

            except Exception as e:
                # Deliberate best effort: one bad product must not abort the job.
                logger.exception(f"Failed to process data for {product_name}: {str(e)}")

        return processed_data

    async def _train_all_models(self,
                                tenant_id: str,
                                processed_data: Dict[str, pd.DataFrame],
                                job_id: str) -> Dict[str, Any]:
        """
        Train models for all processed products.

        Products with fewer rows than ``settings.MIN_TRAINING_DATA_DAYS`` are
        recorded as ``skipped``; a training error is recorded as ``error``
        and does not stop the remaining products.

        Returns:
            Mapping of product name to a result dict whose ``status`` is
            ``success``, ``skipped`` or ``error``.
        """
        training_results = {}

        for product_name, product_data in processed_data.items():
            try:
                logger.info(f"Training model for product: {product_name}")

                # Check if we have enough data
                if len(product_data) < settings.MIN_TRAINING_DATA_DAYS:
                    training_results[product_name] = {
                        'status': 'skipped',
                        'reason': 'insufficient_data',
                        'data_points': len(product_data),
                        'min_required': settings.MIN_TRAINING_DATA_DAYS
                    }
                    continue

                # Train the model
                model_info = await self.prophet_manager.train_bakery_model(
                    tenant_id=tenant_id,
                    product_name=product_name,
                    df=product_data,
                    job_id=job_id
                )

                training_results[product_name] = {
                    'status': 'success',
                    'model_info': model_info,
                    'data_points': len(product_data),
                    'trained_at': datetime.now().isoformat()
                }

                logger.info(f"Successfully trained model for {product_name}")

            except Exception as e:
                logger.exception(f"Failed to train model for {product_name}: {str(e)}")
                training_results[product_name] = {
                    'status': 'error',
                    'error_message': str(e),
                    'data_points': len(product_data) if product_data is not None else 0
                }

        return training_results

    def _calculate_training_summary(self, training_results: Dict[str, Any]) -> Dict[str, Any]:
        """
        Calculate summary statistics from training results.

        Counts the results by ``status`` and averages the training metrics of
        the successful models. Each metric is averaged only over the models
        that actually report it, so one model without metrics neither
        suppresses the averages nor drags them toward zero. A metric no
        model reports is omitted from ``average_metrics``.

        Returns:
            Dictionary with ``total_products``, ``successful_products``,
            ``failed_products``, ``skipped_products``, ``success_rate``
            (percentage rounded to two decimals, 0 when there are no results)
            and ``average_metrics``.
        """
        outcomes = list(training_results.values())
        statuses = [outcome.get('status') for outcome in outcomes]

        total_products = len(outcomes)
        successful_products = statuses.count('success')
        failed_products = statuses.count('error')
        skipped_products = statuses.count('skipped')

        # Training metrics of successful models; a missing or empty
        # model_info / training_metrics is treated as "no metrics reported".
        metrics_list = [
            (outcome.get('model_info') or {}).get('training_metrics') or {}
            for outcome in outcomes
            if outcome.get('status') == 'success'
        ]

        avg_metrics = {}
        for metric_name, summary_key in self._SUMMARY_METRICS:
            values = [
                metrics[metric_name]
                for metrics in metrics_list
                if metrics.get(metric_name) is not None
            ]
            if values:
                avg_metrics[summary_key] = float(np.mean(values))

        return {
            'total_products': total_products,
            'successful_products': successful_products,
            'failed_products': failed_products,
            'skipped_products': skipped_products,
            'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
            'average_metrics': avg_metrics
        }