Improve training code
@@ -1,77 +1,76 @@
 # services/training/app/ml/trainer.py
 """
-ML Trainer for Training Service
-Orchestrates the complete training process
+ML Trainer - Main ML pipeline coordinator
+Receives prepared data and orchestrates the complete ML training process
 """
 
-from typing import Dict, List, Any, Optional, Tuple
+from typing import Dict, List, Any, Optional
 import pandas as pd
 import numpy as np
-from datetime import datetime, timedelta
+from datetime import datetime
 import logging
 import asyncio
 import uuid
 from pathlib import Path
 
-from app.ml.prophet_manager import BakeryProphetManager
 from app.ml.data_processor import BakeryDataProcessor
+from app.ml.prophet_manager import BakeryProphetManager
+from app.services.training_orchestrator import TrainingDataSet
 from app.core.config import settings
 
+from sqlalchemy.ext.asyncio import AsyncSession
 
 logger = logging.getLogger(__name__)
 
 
 class BakeryMLTrainer:
     """
-    Main ML trainer that orchestrates the complete training process.
-    Replaces the old Celery-based training system with clean async implementation.
+    Main ML trainer that orchestrates the complete ML training pipeline.
+    Receives prepared TrainingDataSet and coordinates data processing and model training.
    """
 
-    def __init__(self):
-        self.prophet_manager = BakeryProphetManager()
+    def __init__(self, db_session: AsyncSession = None):
         self.data_processor = BakeryDataProcessor()
+        self.prophet_manager = BakeryProphetManager(db_session=db_session)
 
     async def train_tenant_models(self,
                                   tenant_id: str,
-                                  sales_data: List[Dict],
-                                  weather_data: List[Dict] = None,
-                                  traffic_data: List[Dict] = None,
-                                  job_id: str = None) -> Dict[str, Any]:
+                                  training_dataset: TrainingDataSet,
+                                  job_id: Optional[str] = None) -> Dict[str, Any]:
         """
-        Train models for all products of a tenant.
+        Train models for all products using prepared training dataset.
 
         Args:
             tenant_id: Tenant identifier
-            sales_data: Historical sales data
-            weather_data: Weather data (optional)
-            traffic_data: Traffic data (optional)
+            training_dataset: Prepared training dataset with aligned dates
             job_id: Training job identifier
 
         Returns:
             Dictionary with training results for each product
         """
         if not job_id:
-            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"
+            job_id = f"ml_training_{tenant_id}_{uuid.uuid4().hex[:8]}"
 
-        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")
+        logger.info(f"Starting ML training pipeline {job_id} for tenant {tenant_id}")
 
         try:
-            # Convert input data to DataFrames
-            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
-            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
-            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()
+            # Convert sales data to DataFrame
+            sales_df = pd.DataFrame(training_dataset.sales_data)
+            weather_df = pd.DataFrame(training_dataset.weather_data)
+            traffic_df = pd.DataFrame(training_dataset.traffic_data)
 
             # Validate input data
             await self._validate_input_data(sales_df, tenant_id)
 
-            # Get unique products
+            # Get unique products from the sales data
             products = sales_df['product_name'].unique().tolist()
             logger.info(f"Training models for {len(products)} products: {products}")
 
             # Process data for each product
             logger.info("Processing data for all products...")
             processed_data = await self._process_all_products(
                 sales_df, weather_df, traffic_df, products
             )
 
-            # Train models for each product
+            # Train models for each processed product
             logger.info("Training models for all products...")
             training_results = await self._train_all_models(
                 tenant_id, processed_data, job_id
             )
@@ -85,50 +84,56 @@ class BakeryMLTrainer:
                 "status": "completed",
                 "products_trained": len([r for r in training_results.values() if r.get('status') == 'success']),
                 "products_failed": len([r for r in training_results.values() if r.get('status') == 'error']),
                 "products_skipped": len([r for r in training_results.values() if r.get('status') == 'skipped']),
                 "total_products": len(products),
                 "training_results": training_results,
                 "summary": summary,
+                "data_info": {
+                    "date_range": {
+                        "start": training_dataset.date_range.start.isoformat(),
+                        "end": training_dataset.date_range.end.isoformat(),
+                        "duration_days": (training_dataset.date_range.end - training_dataset.date_range.start).days
+                    },
+                    "data_sources": [source.value for source in training_dataset.date_range.available_sources],
+                    "constraints_applied": training_dataset.date_range.constraints
+                },
                 "completed_at": datetime.now().isoformat()
             }
 
-            logger.info(f"Training job {job_id} completed successfully")
+            logger.info(f"ML training pipeline {job_id} completed successfully")
             return result
 
         except Exception as e:
-            logger.error(f"Training job {job_id} failed: {str(e)}")
+            logger.error(f"ML training pipeline {job_id} failed: {str(e)}")
             raise
 
-    async def train_single_product(self,
-                                   tenant_id: str,
-                                   product_name: str,
-                                   sales_data: List[Dict],
-                                   weather_data: List[Dict] = None,
-                                   traffic_data: List[Dict] = None,
-                                   job_id: str = None) -> Dict[str, Any]:
+    async def train_single_product_model(self,
+                                         tenant_id: str,
+                                         product_name: str,
+                                         training_dataset: TrainingDataSet,
+                                         job_id: Optional[str] = None) -> Dict[str, Any]:
         """
-        Train model for a single product.
+        Train model for a single product using prepared training dataset.
 
         Args:
             tenant_id: Tenant identifier
             product_name: Product name
-            sales_data: Historical sales data
-            weather_data: Weather data (optional)
-            traffic_data: Traffic data (optional)
+            training_dataset: Prepared training dataset
             job_id: Training job identifier
 
         Returns:
             Training result for the product
         """
         if not job_id:
-            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"
+            job_id = f"single_ml_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"
 
-        logger.info(f"Starting single product training {job_id} for {product_name}")
+        logger.info(f"Starting single product ML training {job_id} for {product_name}")
 
         try:
-            # Convert input data to DataFrames
-            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
-            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
-            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()
+            # Convert training data to DataFrames
+            sales_df = pd.DataFrame(training_dataset.sales_data)
+            weather_df = pd.DataFrame(training_dataset.weather_data)
+            traffic_df = pd.DataFrame(training_dataset.traffic_data)
 
             # Filter sales data for the specific product
             product_sales = sales_df[sales_df['product_name'] == product_name].copy()
@@ -137,7 +142,7 @@ class BakeryMLTrainer:
             if product_sales.empty:
                 raise ValueError(f"No sales data found for product: {product_name}")
 
-            # Prepare training data
+            # Process data for this specific product
             processed_data = await self.data_processor.prepare_training_data(
                 sales_data=product_sales,
                 weather_data=weather_df,
@@ -160,29 +165,38 @@ class BakeryMLTrainer:
                 "status": "success",
                 "model_info": model_info,
                 "data_points": len(processed_data),
+                "data_info": {
+                    "date_range": {
+                        "start": training_dataset.date_range.start.isoformat(),
+                        "end": training_dataset.date_range.end.isoformat(),
+                        "duration_days": (training_dataset.date_range.end - training_dataset.date_range.start).days
+                    },
+                    "data_sources": [source.value for source in training_dataset.date_range.available_sources],
+                    "constraints_applied": training_dataset.date_range.constraints
+                },
                 "completed_at": datetime.now().isoformat()
             }
 
-            logger.info(f"Single product training {job_id} completed successfully")
+            logger.info(f"Single product ML training {job_id} completed successfully")
             return result
 
         except Exception as e:
-            logger.error(f"Single product training {job_id} failed: {str(e)}")
+            logger.error(f"Single product ML training {job_id} failed: {str(e)}")
             raise
 
     async def evaluate_model_performance(self,
                                          tenant_id: str,
                                          product_name: str,
                                          model_path: str,
-                                         test_data: List[Dict]) -> Dict[str, Any]:
+                                         test_dataset: TrainingDataSet) -> Dict[str, Any]:
         """
-        Evaluate model performance on test data.
+        Evaluate model performance using test dataset.
 
         Args:
             tenant_id: Tenant identifier
             product_name: Product name
             model_path: Path to the trained model
-            test_data: Test data for evaluation
+            test_dataset: Test dataset for evaluation
 
         Returns:
             Performance metrics
@@ -190,46 +204,75 @@ class BakeryMLTrainer:
         try:
             logger.info(f"Evaluating model performance for {product_name}")
 
-            # Convert test data to DataFrame
-            test_df = pd.DataFrame(test_data)
+            # Convert test data to DataFrames
+            test_sales_df = pd.DataFrame(test_dataset.sales_data)
+            test_weather_df = pd.DataFrame(test_dataset.weather_data)
+            test_traffic_df = pd.DataFrame(test_dataset.traffic_data)
 
-            # Prepare test data
-            test_prepared = await self.data_processor.prepare_prediction_features(
-                future_dates=test_df['ds'],
-                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
-                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
+            # Filter for specific product
+            product_test_sales = test_sales_df[test_sales_df['product_name'] == product_name].copy()
+
+            if product_test_sales.empty:
+                raise ValueError(f"No test data found for product: {product_name}")
+
+            # Process test data
+            processed_test_data = await self.data_processor.prepare_training_data(
+                sales_data=product_test_sales,
+                weather_data=test_weather_df,
+                traffic_data=test_traffic_df,
+                product_name=product_name
             )
 
-            # Get regressor columns
-            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]
+            # Create future dataframe for prediction
+            future_dates = processed_test_data[['ds']].copy()
+
+            # Add regressor columns
+            regressor_columns = [col for col in processed_test_data.columns if col not in ['ds', 'y']]
+            for col in regressor_columns:
+                future_dates[col] = processed_test_data[col]
 
             # Generate predictions
             forecast = await self.prophet_manager.generate_forecast(
                 model_path=model_path,
-                future_dates=test_prepared,
+                future_dates=future_dates,
                 regressor_columns=regressor_columns
             )
 
-            # Calculate performance metrics if we have actual values
-            metrics = {}
-            if 'y' in test_df.columns:
-                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
-
-                y_true = test_df['y'].values
-                y_pred = forecast['yhat'].values
-
-                metrics = {
-                    "mae": float(mean_absolute_error(y_true, y_pred)),
-                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
-                    "mape": float(np.mean(np.abs((y_true - y_pred) / y_true)) * 100),
-                    "r2_score": float(r2_score(y_true, y_pred))
-                }
+            # Calculate performance metrics
+            from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+
+            y_true = processed_test_data['y'].values
+            y_pred = forecast['yhat'].values
+
+            # Ensure arrays are the same length
+            min_len = min(len(y_true), len(y_pred))
+            y_true = y_true[:min_len]
+            y_pred = y_pred[:min_len]
+
+            metrics = {
+                "mae": float(mean_absolute_error(y_true, y_pred)),
+                "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
+                "r2_score": float(r2_score(y_true, y_pred))
+            }
+
+            # Calculate MAPE safely
+            non_zero_mask = y_true > 0.1
+            if np.sum(non_zero_mask) > 0:
+                mape = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
+                metrics["mape"] = float(min(mape, 200))  # Cap at 200%
+            else:
+                metrics["mape"] = 100.0
 
             result = {
                 "tenant_id": tenant_id,
                 "product_name": product_name,
                 "evaluation_metrics": metrics,
-                "forecast_samples": len(forecast),
+                "test_samples": len(processed_test_data),
+                "prediction_samples": len(forecast),
+                "test_period": {
+                    "start": test_dataset.date_range.start.isoformat(),
+                    "end": test_dataset.date_range.end.isoformat()
+                },
                 "evaluated_at": datetime.now().isoformat()
             }
 
@@ -244,6 +287,7 @@ class BakeryMLTrainer:
         if sales_df.empty:
             raise ValueError(f"No sales data provided for tenant {tenant_id}")
 
+        # Handle quantity column mapping
         if 'quantity_sold' in sales_df.columns and 'quantity' not in sales_df.columns:
             sales_df['quantity'] = sales_df['quantity_sold']
             logger.info("Mapped 'quantity_sold' to 'quantity' column")
@@ -261,14 +305,17 @@ class BakeryMLTrainer:
 
         # Check for valid quantities
-        if not sales_df['quantity'].dtype in ['int64', 'float64']:
-            raise ValueError("Quantity column must be numeric")
+        try:
+            sales_df['quantity'] = pd.to_numeric(sales_df['quantity'], errors='coerce')
+        except Exception:
+            raise ValueError("Quantity column must be numeric")
 
     async def _process_all_products(self,
                                     sales_df: pd.DataFrame,
                                     weather_df: pd.DataFrame,
                                     traffic_df: pd.DataFrame,
                                     products: List[str]) -> Dict[str, pd.DataFrame]:
-        """Process data for all products"""
+        """Process data for all products using the data processor"""
         processed_data = {}
 
         for product_name in products:
@@ -278,7 +325,11 @@ class BakeryMLTrainer:
             # Filter sales data for this product
             product_sales = sales_df[sales_df['product_name'] == product_name].copy()
 
-            # Process the product data
+            if product_sales.empty:
+                logger.warning(f"No sales data found for product: {product_name}")
+                continue
+
+            # Use data processor to prepare training data
             processed_product_data = await self.data_processor.prepare_training_data(
                 sales_data=product_sales,
                 weather_data=weather_df,
@@ -300,7 +351,7 @@ class BakeryMLTrainer:
                                 tenant_id: str,
                                 processed_data: Dict[str, pd.DataFrame],
                                 job_id: str) -> Dict[str, Any]:
-        """Train models for all processed products"""
+        """Train models for all processed products using Prophet manager"""
         training_results = {}
 
         for product_name, product_data in processed_data.items():
@@ -313,11 +364,13 @@ class BakeryMLTrainer:
                     'status': 'skipped',
                     'reason': 'insufficient_data',
                     'data_points': len(product_data),
-                    'min_required': settings.MIN_TRAINING_DATA_DAYS
+                    'min_required': settings.MIN_TRAINING_DATA_DAYS,
+                    'message': f'Need at least {settings.MIN_TRAINING_DATA_DAYS} data points, got {len(product_data)}'
                 }
                 logger.warning(f"Skipping {product_name}: insufficient data ({len(product_data)} < {settings.MIN_TRAINING_DATA_DAYS})")
                 continue
 
-            # Train the model
+            # Train the model using Prophet manager
             model_info = await self.prophet_manager.train_bakery_model(
                 tenant_id=tenant_id,
                 product_name=product_name,
@@ -339,7 +392,8 @@ class BakeryMLTrainer:
             training_results[product_name] = {
                 'status': 'error',
                 'error_message': str(e),
-                'data_points': len(product_data) if product_data is not None else 0
+                'data_points': len(product_data) if product_data is not None else 0,
+                'failed_at': datetime.now().isoformat()
             }
 
         return training_results
@@ -360,17 +414,27 @@ class BakeryMLTrainer:
 
         if metrics_list and all(metrics_list):
             avg_metrics = {
-                'avg_mae': np.mean([m.get('mae', 0) for m in metrics_list]),
-                'avg_rmse': np.mean([m.get('rmse', 0) for m in metrics_list]),
-                'avg_mape': np.mean([m.get('mape', 0) for m in metrics_list]),
-                'avg_r2': np.mean([m.get('r2_score', 0) for m in metrics_list])
+                'avg_mae': round(np.mean([m.get('mae', 0) for m in metrics_list]), 2),
+                'avg_rmse': round(np.mean([m.get('rmse', 0) for m in metrics_list]), 2),
+                'avg_mape': round(np.mean([m.get('mape', 0) for m in metrics_list]), 2),
+                'avg_r2': round(np.mean([m.get('r2', 0) for m in metrics_list]), 3),
+                'avg_improvement': round(np.mean([m.get('improvement_estimated', 0) for m in metrics_list]), 1)
             }
 
+        # Calculate data quality insights
+        data_points_list = [r.get('data_points', 0) for r in training_results.values()]
+
         return {
             'total_products': total_products,
             'successful_products': successful_products,
             'failed_products': failed_products,
             'skipped_products': skipped_products,
             'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
-            'average_metrics': avg_metrics
+            'average_metrics': avg_metrics,
+            'data_summary': {
+                'total_data_points': sum(data_points_list),
+                'avg_data_points_per_product': round(np.mean(data_points_list), 1) if data_points_list else 0,
+                'min_data_points': min(data_points_list) if data_points_list else 0,
+                'max_data_points': max(data_points_list) if data_points_list else 0
+            }
         }
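For context, a minimal usage sketch of the reworked API. Only BakeryMLTrainer, TrainingDataSet, and the train_tenant_models signature come from this commit; the session object, dataset construction, and the tenant identifier shown here are assumptions.

# Usage sketch (assumptions noted above), not part of the commit.
from app.ml.trainer import BakeryMLTrainer
from app.services.training_orchestrator import TrainingDataSet

async def run_training_job(db_session, dataset: TrainingDataSet) -> dict:
    # The trainer now takes the async DB session and a prepared dataset
    # instead of raw sales/weather/traffic record lists.
    trainer = BakeryMLTrainer(db_session=db_session)
    return await trainer.train_tenant_models(
        tenant_id="tenant-123",        # hypothetical tenant identifier
        training_dataset=dataset,      # prepared by the training orchestrator
    )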