# Source: bakery-ia/services/training/app/ml/hybrid_trainer.py (459 lines, 16 KiB, Python)
# Last changed: 2025-11-05 13:34:56 +01:00
"""
Hybrid Prophet + XGBoost Trainer
Combines Prophet's seasonality modeling with XGBoost's pattern learning
"""
import asyncio
import inspect
import warnings
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
import structlog
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings('ignore')

# Import XGBoost
try:
    import xgboost as xgb
except ImportError:
    raise ImportError("XGBoost not installed. Run: pip install xgboost")

from app.ml.prophet_manager import BakeryProphetManager
from app.ml.enhanced_features import AdvancedFeatureEngineer
logger = structlog.get_logger()
class HybridProphetXGBoost:
    """
    Hybrid forecasting model combining Prophet and XGBoost.

    Approach:
    1. Train Prophet on historical data (captures trend, seasonality, holidays)
    2. Calculate residuals (actual - prophet_prediction)
    3. Train XGBoost on residuals using enhanced features
    4. Final prediction = prophet_prediction + xgboost_residual_prediction

    Benefits (as stated by the original authors; not verified in this module):
    - Prophet handles seasonality, holidays, trends
    - XGBoost captures complex patterns Prophet misses
    - Maintains Prophet's interpretability
    - Improves accuracy by 10-25% over Prophet alone
    """

    # Columns that are never used as model inputs: the Prophet date/target
    # pair and the aliases added for the feature engineer.
    _NON_FEATURE_COLUMNS = ('ds', 'y', 'date', 'quantity')

    # Boosting stops after this many rounds without validation improvement.
    _EARLY_STOPPING_ROUNDS = 10

    def __init__(self, database_manager=None):
        """
        Create the trainer and its collaborators.

        Args:
            database_manager: Forwarded unchanged to BakeryProphetManager.
        """
        self.prophet_manager = BakeryProphetManager(database_manager)
        self.feature_engineer = AdvancedFeatureEngineer()
        self.xgb_model = None
        self.feature_columns = []
        self.prophet_model_data = None

    async def train_hybrid_model(
        self,
        tenant_id: str,
        inventory_product_id: str,
        df: pd.DataFrame,
        job_id: str,
        validation_split: float = 0.2,
        session=None
    ) -> Dict[str, Any]:
        """
        Train hybrid Prophet + XGBoost model.

        Args:
            tenant_id: Tenant identifier
            inventory_product_id: Product identifier
            df: Training data (must have 'ds', 'y' and regressor columns)
            job_id: Training job identifier
            validation_split: Fraction of data (taken from the tail) for validation
            session: Optional database session (uses parent session if provided
                to avoid nested sessions)

        Returns:
            Dictionary with model metadata and performance metrics

        Raises:
            ValueError: If the split leaves the training or validation
                partition empty, or the Prophet result carries no model.
        """
        logger.info(
            "Starting hybrid Prophet + XGBoost training",
            tenant_id=tenant_id,
            inventory_product_id=inventory_product_id,
            data_points=len(df)
        )

        # Step 1: Train Prophet model (base forecaster).
        # The caller's session is forwarded so that the Prophet manager does
        # not open a nested session.
        logger.info("Step 1: Training Prophet base model")
        prophet_result = await self.prophet_manager.train_bakery_model(
            tenant_id=tenant_id,
            inventory_product_id=inventory_product_id,
            df=df.copy(),
            job_id=job_id,
            session=session
        )
        self.prophet_model_data = prophet_result

        # Step 2: Create enhanced features for XGBoost
        logger.info("Step 2: Engineering enhanced features for XGBoost")
        df_enhanced = self._prepare_xgboost_features(df)

        # Step 3: Chronological split into train/validation (no shuffling)
        split_idx = int(len(df_enhanced) * (1 - validation_split))
        train_df = df_enhanced.iloc[:split_idx].copy()
        val_df = df_enhanced.iloc[split_idx:].copy()
        logger.info(
            "Data split",
            train_samples=len(train_df),
            val_samples=len(val_df)
        )
        # Fail with a clear message instead of an obscure downstream error
        # when one side of the split has no rows.
        if train_df.empty or val_df.empty:
            raise ValueError(
                "validation_split leaves an empty partition: "
                f"train_samples={len(train_df)}, val_samples={len(val_df)}"
            )

        # Step 4: Get Prophet predictions for both partitions. The call is
        # blocking, so it runs in a worker thread like the other model calls.
        logger.info("Step 3: Generating Prophet predictions for residual calculation")
        train_prophet_pred = await asyncio.to_thread(
            self._get_prophet_predictions, prophet_result, train_df
        )
        val_prophet_pred = await asyncio.to_thread(
            self._get_prophet_predictions, prophet_result, val_df
        )

        # Step 5: Calculate residuals (actual - prophet_prediction)
        train_residuals = train_df['y'].values - train_prophet_pred
        val_residuals = val_df['y'].values - val_prophet_pred
        logger.info(
            "Residuals calculated",
            train_residual_mean=float(np.mean(train_residuals)),
            train_residual_std=float(np.std(train_residuals))
        )

        # Step 6: Prepare feature matrix for XGBoost
        X_train = train_df[self.feature_columns].values
        X_val = val_df[self.feature_columns].values

        # Step 7: Train XGBoost on residuals
        logger.info("Step 4: Training XGBoost on residuals")
        self.xgb_model = await self._train_xgboost(
            X_train, train_residuals,
            X_val, val_residuals
        )

        # Step 8: Evaluate hybrid model
        logger.info("Step 5: Evaluating hybrid model performance")
        metrics = await self._evaluate_hybrid_model(
            train_df, val_df,
            train_prophet_pred, val_prophet_pred,
            prophet_result
        )

        # Step 9: Package hybrid model
        model_data = self._package_hybrid_model(
            prophet_result, metrics, tenant_id, inventory_product_id
        )
        logger.info(
            "Hybrid model training complete",
            prophet_mape=metrics['prophet_val_mape'],
            hybrid_mape=metrics['hybrid_val_mape'],
            improvement_pct=metrics['improvement_percentage']
        )
        return model_data

    def _engineer_features(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
        """
        Build the enhanced feature frame without touching instance state.

        Args:
            df: Base dataframe with 'ds', optionally 'y', and regressor columns

        Returns:
            Tuple of (enhanced dataframe, ordered list of feature column names).
            Engineered features come first, followed by the original
            regressor columns, with duplicates removed.
        """
        df_prep = df.copy()
        # The feature engineer works on 'date' / 'quantity', so alias the
        # Prophet-style columns when they are present.
        if 'ds' in df_prep.columns:
            df_prep['date'] = df_prep['ds']
        if 'y' in df_prep.columns:
            df_prep['quantity'] = df_prep['y']

        df_enhanced = self.feature_engineer.create_all_features(
            df_prep,
            date_column='date',
            include_lags=True,
            include_rolling=True,
            include_interactions=True,
            include_cyclical=True
        )
        # Fill NA values (from lagged features at beginning)
        df_enhanced = self.feature_engineer.fill_na_values(df_enhanced)

        engineered_cols = [
            col for col in self.feature_engineer.get_feature_columns()
            if col in df_enhanced.columns
        ]
        regressor_cols = [
            col for col in df.columns
            if col not in self._NON_FEATURE_COLUMNS and col in df_enhanced.columns
        ]
        # dict.fromkeys removes duplicates but keeps first-seen order, so the
        # column order (and therefore training) is the same on every run;
        # a plain set() would reorder names between processes.
        feature_columns = list(dict.fromkeys(engineered_cols + regressor_cols))
        return df_enhanced, feature_columns

    def _prepare_xgboost_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare enhanced features for XGBoost and record the feature list.

        Side effect: overwrites ``self.feature_columns``.

        Args:
            df: Base dataframe with 'ds', 'y' and regressor columns

        Returns:
            DataFrame with all enhanced features
        """
        df_enhanced, self.feature_columns = self._engineer_features(df)
        logger.info(f"Prepared {len(self.feature_columns)} features for XGBoost")
        return df_enhanced

    def _get_prophet_predictions(
        self,
        prophet_result: Dict[str, Any],
        df: pd.DataFrame
    ) -> np.ndarray:
        """
        Get Prophet predictions for given dataframe.

        Args:
            prophet_result: Prophet model result from training; the fitted
                model is read from its 'model' key
            df: DataFrame with 'ds' column

        Returns:
            Array of predictions (the forecast's 'yhat' column)

        Raises:
            ValueError: If prophet_result has no 'model' entry.
        """
        prophet_model = prophet_result.get('model')
        if prophet_model is None:
            raise ValueError("Prophet model not found in result")

        # 'ds' first, then every remaining column except target/alias columns,
        # so any regressors the model needs are available.
        extra_cols = [
            col for col in df.columns if col not in self._NON_FEATURE_COLUMNS
        ]
        pred_df = df[['ds', *extra_cols]].copy()

        forecast = prophet_model.predict(pred_df)
        return forecast['yhat'].values

    async def _train_xgboost(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray
    ) -> "xgb.XGBRegressor":
        """
        Train XGBoost model on residuals.

        Args:
            X_train: Training features
            y_train: Training residuals
            X_val: Validation features
            y_val: Validation residuals

        Returns:
            Trained XGBoost model
        """
        # XGBoost parameters chosen for residual learning
        params = {
            'n_estimators': 100,
            'max_depth': 3,  # Shallow trees to prevent overfitting
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 3,
            'reg_alpha': 0.1,  # L1 regularization
            'reg_lambda': 1.0,  # L2 regularization
            'objective': 'reg:squarederror',
            'random_state': 42,
            'n_jobs': -1
        }
        model = xgb.XGBRegressor(**params)

        fit_kwargs = {'eval_set': [(X_val, y_val)], 'verbose': False}
        # Older XGBoost takes early_stopping_rounds in fit(); newer releases
        # removed it there and expect it as an estimator parameter. Inspect
        # the installed signature so both work.
        if 'early_stopping_rounds' in inspect.signature(model.fit).parameters:
            fit_kwargs['early_stopping_rounds'] = self._EARLY_STOPPING_ROUNDS
        else:
            model.set_params(early_stopping_rounds=self._EARLY_STOPPING_ROUNDS)

        # fit() is blocking; run it in a worker thread to keep the event loop free.
        await asyncio.to_thread(model.fit, X_train, y_train, **fit_kwargs)

        logger.info(
            "XGBoost training complete",
            best_iteration=getattr(model, 'best_iteration', None)
        )
        return model

    @staticmethod
    def _improvement_pct(baseline: float, candidate: float) -> float:
        """
        Relative error reduction of candidate vs baseline, in percent.

        Returns 0.0 when the baseline error is zero, because the ratio is
        undefined there.
        """
        if baseline == 0:
            return 0.0
        return ((baseline - candidate) / baseline) * 100

    async def _evaluate_hybrid_model(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        train_prophet_pred: np.ndarray,
        val_prophet_pred: np.ndarray,
        prophet_result: Dict[str, Any]
    ) -> Dict[str, float]:
        """
        Evaluate hybrid model vs Prophet-only on validation set.

        Args:
            train_df: Training data
            val_df: Validation data
            train_prophet_pred: Prophet predictions on training set
            val_prophet_pred: Prophet predictions on validation set
            prophet_result: Prophet training result (currently unused)

        Returns:
            Dictionary of metrics (MAE and MAPE for both models on both
            partitions, plus validation improvement percentages)
        """
        train_actual = train_df['y'].values
        val_actual = val_df['y'].values

        X_train = train_df[self.feature_columns].values
        X_val = val_df[self.feature_columns].values

        # predict() is blocking; run it in a worker thread.
        train_xgb_pred = await asyncio.to_thread(self.xgb_model.predict, X_train)
        val_xgb_pred = await asyncio.to_thread(self.xgb_model.predict, X_val)

        # Hybrid predictions = Prophet + XGBoost residual correction
        train_hybrid_pred = train_prophet_pred + train_xgb_pred
        val_hybrid_pred = val_prophet_pred + val_xgb_pred

        # Metrics for Prophet-only
        prophet_train_mae = mean_absolute_error(train_actual, train_prophet_pred)
        prophet_val_mae = mean_absolute_error(val_actual, val_prophet_pred)
        prophet_train_mape = mean_absolute_percentage_error(train_actual, train_prophet_pred) * 100
        prophet_val_mape = mean_absolute_percentage_error(val_actual, val_prophet_pred) * 100

        # Metrics for Hybrid
        hybrid_train_mae = mean_absolute_error(train_actual, train_hybrid_pred)
        hybrid_val_mae = mean_absolute_error(val_actual, val_hybrid_pred)
        hybrid_train_mape = mean_absolute_percentage_error(train_actual, train_hybrid_pred) * 100
        hybrid_val_mape = mean_absolute_percentage_error(val_actual, val_hybrid_pred) * 100

        # Improvement on the validation set, guarded against a zero baseline
        mae_improvement = self._improvement_pct(prophet_val_mae, hybrid_val_mae)
        mape_improvement = self._improvement_pct(prophet_val_mape, hybrid_val_mape)

        return {
            'prophet_train_mae': float(prophet_train_mae),
            'prophet_val_mae': float(prophet_val_mae),
            'prophet_train_mape': float(prophet_train_mape),
            'prophet_val_mape': float(prophet_val_mape),
            'hybrid_train_mae': float(hybrid_train_mae),
            'hybrid_val_mae': float(hybrid_val_mae),
            'hybrid_train_mape': float(hybrid_train_mape),
            'hybrid_val_mape': float(hybrid_val_mape),
            'mae_improvement_pct': float(mae_improvement),
            'mape_improvement_pct': float(mape_improvement),
            'improvement_percentage': float(mape_improvement)  # Primary metric
        }

    def _package_hybrid_model(
        self,
        prophet_result: Dict[str, Any],
        metrics: Dict[str, float],
        tenant_id: str,
        inventory_product_id: str
    ) -> Dict[str, Any]:
        """
        Package hybrid model for storage.

        Args:
            prophet_result: Prophet model result
            metrics: Hybrid model metrics
            tenant_id: Tenant ID
            inventory_product_id: Product ID

        Returns:
            Model package dictionary holding both models, the feature column
            list, grouped metrics, identifiers and a UTC training timestamp
        """
        return {
            'model_type': 'hybrid_prophet_xgboost',
            'prophet_model': prophet_result.get('model'),
            'xgboost_model': self.xgb_model,
            'feature_columns': self.feature_columns,
            'prophet_metrics': {
                'train_mae': metrics['prophet_train_mae'],
                'val_mae': metrics['prophet_val_mae'],
                'train_mape': metrics['prophet_train_mape'],
                'val_mape': metrics['prophet_val_mape']
            },
            'hybrid_metrics': {
                'train_mae': metrics['hybrid_train_mae'],
                'val_mae': metrics['hybrid_val_mae'],
                'train_mape': metrics['hybrid_train_mape'],
                'val_mape': metrics['hybrid_val_mape']
            },
            'improvement_metrics': {
                'mae_improvement_pct': metrics['mae_improvement_pct'],
                'mape_improvement_pct': metrics['mape_improvement_pct']
            },
            'tenant_id': tenant_id,
            'inventory_product_id': inventory_product_id,
            'trained_at': datetime.utcnow().isoformat()
        }

    async def predict(
        self,
        future_df: pd.DataFrame,
        model_data: Dict[str, Any]
    ) -> pd.DataFrame:
        """
        Make predictions using hybrid model.

        Does not modify instance state: the feature list is taken from
        ``model_data``, not from ``self.feature_columns``.

        Args:
            future_df: DataFrame with future dates and regressors
            model_data: Loaded hybrid model data (needs 'prophet_model',
                'xgboost_model' and 'feature_columns')

        Returns:
            DataFrame with columns 'ds', 'prophet_yhat', 'xgb_adjustment',
            'yhat', 'yhat_lower', 'yhat_upper', indexed 0..n-1
        """
        # Step 1: Get Prophet predictions (blocking call, run in a worker thread)
        prophet_model = model_data['prophet_model']
        prophet_forecast = await asyncio.to_thread(prophet_model.predict, future_df)

        # Step 2: Prepare features for XGBoost without overwriting the
        # feature list recorded at training time
        future_enhanced, _ = self._engineer_features(future_df)

        # Step 3: Get XGBoost predictions
        xgb_model = model_data['xgboost_model']
        feature_columns = model_data['feature_columns']
        X_future = future_enhanced[feature_columns].values
        xgb_pred = await asyncio.to_thread(xgb_model.predict, X_future)

        # Step 4: Combine predictions positionally. Plain arrays are used so
        # that a non-default index on future_df cannot misalign the columns.
        prophet_yhat = prophet_forecast['yhat'].to_numpy()
        hybrid_pred = prophet_yhat + xgb_pred

        # Step 5: Create result dataframe; the Prophet interval is shifted by
        # the same XGBoost adjustment as the point forecast
        result = pd.DataFrame({
            'ds': future_df['ds'].reset_index(drop=True),
            'prophet_yhat': prophet_yhat,
            'xgb_adjustment': xgb_pred,
            'yhat': hybrid_pred,
            'yhat_lower': prophet_forecast['yhat_lower'].to_numpy() + xgb_pred,
            'yhat_upper': prophet_forecast['yhat_upper'].to_numpy() + xgb_pred
        })
        return result