Files
bakery-ia/services/training/app/ml/hybrid_trainer.py

459 lines
16 KiB
Python
Raw Normal View History

2025-11-05 13:34:56 +01:00
"""
Hybrid Prophet + XGBoost Trainer
Combines Prophet's seasonality modeling with XGBoost's pattern learning
"""
import asyncio
import warnings
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional, Tuple

import joblib
import numpy as np
import pandas as pd
import structlog
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings('ignore')

# Import XGBoost
try:
    import xgboost as xgb
except ImportError:
    raise ImportError("XGBoost not installed. Run: pip install xgboost")

from app.ml.prophet_manager import BakeryProphetManager
from app.ml.enhanced_features import AdvancedFeatureEngineer
logger = structlog.get_logger()
class HybridProphetXGBoost:
    """
    Hybrid forecasting model combining Prophet and XGBoost.

    Approach:
    1. Train Prophet on historical data (captures trend, seasonality, holidays)
    2. Calculate residuals (actual - prophet_prediction)
    3. Train XGBoost on residuals using enhanced features
    4. Final prediction = prophet_prediction + xgboost_residual_prediction

    Benefits:
    - Prophet handles seasonality, holidays, trends
    - XGBoost captures complex patterns Prophet misses
    - Maintains Prophet's interpretability
    - Intended to improve accuracy over Prophet alone (original author's
      estimate: 10-25%; not verified by this module)
    """

    # Bookkeeping columns that are never treated as regressors/features.
    _NON_REGRESSOR_COLUMNS = ('ds', 'y', 'date', 'quantity')

    def __init__(self, database_manager=None):
        """
        Create the trainer with its Prophet manager and feature engineer.

        Args:
            database_manager: Passed through unchanged to BakeryProphetManager.
        """
        self.prophet_manager = BakeryProphetManager(database_manager)
        self.feature_engineer = AdvancedFeatureEngineer()
        self.xgb_model = None
        self.feature_columns = []
        self.prophet_model_data = None

    async def train_hybrid_model(
        self,
        tenant_id: str,
        inventory_product_id: str,
        df: pd.DataFrame,
        job_id: str,
        validation_split: float = 0.2,
        session=None
    ) -> Dict[str, Any]:
        """
        Train hybrid Prophet + XGBoost model.

        Args:
            tenant_id: Tenant identifier
            inventory_product_id: Product identifier
            df: Training data (must have 'ds', 'y' and regressor columns)
            job_id: Training job identifier
            validation_split: Fraction of data for validation; must be
                strictly between 0 and 1
            session: Optional database session (uses parent session if
                provided to avoid nested sessions)

        Returns:
            Dictionary with model metadata and performance metrics

        Raises:
            ValueError: If validation_split is outside (0, 1), or if the
                chronological split leaves the train or validation part empty.
        """
        # Fail fast: an out-of-range split would otherwise only surface after
        # the expensive Prophet fit, as an error on an empty partition.
        if not 0.0 < validation_split < 1.0:
            raise ValueError(
                f"validation_split must be strictly between 0 and 1, got {validation_split!r}"
            )

        logger.info(
            "Starting hybrid Prophet + XGBoost training",
            tenant_id=tenant_id,
            inventory_product_id=inventory_product_id,
            data_points=len(df)
        )

        # 1. Train Prophet model (base forecaster).
        # The caller's session is forwarded so prophet_manager does not open
        # a nested session.
        # NOTE(review): Prophet receives the full df here; if it fits on every
        # row, the Prophet "validation" metrics below are in-sample - confirm.
        logger.info("Step 1: Training Prophet base model")
        prophet_result = await self.prophet_manager.train_bakery_model(
            tenant_id=tenant_id,
            inventory_product_id=inventory_product_id,
            df=df.copy(),
            job_id=job_id,
            session=session
        )
        self.prophet_model_data = prophet_result

        # 2. Create enhanced features for XGBoost (also sets self.feature_columns).
        logger.info("Step 2: Engineering enhanced features for XGBoost")
        df_enhanced = self._prepare_xgboost_features(df)

        # 3. Positional split: leading rows train, trailing rows validate.
        split_idx = int(len(df_enhanced) * (1 - validation_split))
        train_df = df_enhanced.iloc[:split_idx].copy()
        val_df = df_enhanced.iloc[split_idx:].copy()
        if len(train_df) == 0 or len(val_df) == 0:
            raise ValueError(
                "Not enough data to create non-empty train/validation splits "
                f"(rows={len(df_enhanced)}, validation_split={validation_split})"
            )
        logger.info(
            "Data split",
            train_samples=len(train_df),
            val_samples=len(val_df)
        )

        # 4. Prophet predictions for both partitions. Run in a worker thread,
        # like the other model calls, so the event loop is not blocked.
        logger.info("Step 3: Generating Prophet predictions for residual calculation")
        train_prophet_pred = await asyncio.to_thread(
            self._get_prophet_predictions, prophet_result, train_df
        )
        val_prophet_pred = await asyncio.to_thread(
            self._get_prophet_predictions, prophet_result, val_df
        )

        # 5. Residuals (actual - prophet_prediction) are the XGBoost target.
        train_residuals = train_df['y'].values - train_prophet_pred
        val_residuals = val_df['y'].values - val_prophet_pred
        logger.info(
            "Residuals calculated",
            train_residual_mean=float(np.mean(train_residuals)),
            train_residual_std=float(np.std(train_residuals))
        )

        # 6. Feature matrices for XGBoost.
        X_train = train_df[self.feature_columns].values
        X_val = val_df[self.feature_columns].values

        # 7. Train XGBoost on residuals.
        logger.info("Step 4: Training XGBoost on residuals")
        self.xgb_model = await self._train_xgboost(
            X_train, train_residuals,
            X_val, val_residuals
        )

        # 8. Evaluate hybrid model against Prophet alone.
        logger.info("Step 5: Evaluating hybrid model performance")
        metrics = await self._evaluate_hybrid_model(
            train_df, val_df,
            train_prophet_pred, val_prophet_pred,
            prophet_result
        )

        # 9. Package models and metrics for the caller.
        model_data = self._package_hybrid_model(
            prophet_result, metrics, tenant_id, inventory_product_id
        )
        logger.info(
            "Hybrid model training complete",
            prophet_mape=metrics['prophet_val_mape'],
            hybrid_mape=metrics['hybrid_val_mape'],
            improvement_pct=metrics['improvement_percentage']
        )
        return model_data

    def _prepare_xgboost_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare enhanced features for XGBoost.

        Side effect: reassigns ``self.feature_columns`` to the feature list
        derived from this call.

        Args:
            df: Base dataframe with 'ds', 'y' and regressor columns

        Returns:
            DataFrame with all enhanced features
        """
        # The feature engineer is called with date_column='date', and a
        # 'quantity' column is supplied alongside 'y'; mirror both on a copy
        # so the caller's frame is untouched.
        df_prep = df.copy()
        if 'ds' in df_prep.columns:
            df_prep['date'] = df_prep['ds']
        if 'y' in df_prep.columns:
            df_prep['quantity'] = df_prep['y']

        df_enhanced = self.feature_engineer.create_all_features(
            df_prep,
            date_column='date',
            include_lags=True,
            include_rolling=True,
            include_interactions=True,
            include_cyclical=True
        )

        # Fill NA values via the feature engineer's own routine.
        df_enhanced = self.feature_engineer.fill_na_values(df_enhanced)

        # Engineered features that actually exist in the output frame.
        engineered_cols = [
            col for col in self.feature_engineer.get_feature_columns()
            if col in df_enhanced.columns
        ]
        # Original regressor columns that survived feature engineering.
        regressor_cols = [
            col for col in df.columns
            if col not in self._NON_REGRESSOR_COLUMNS and col in df_enhanced.columns
        ]
        # De-duplicate while preserving order: a set would make the column
        # order (and hence the feature matrix layout) vary between processes.
        self.feature_columns = list(dict.fromkeys([*engineered_cols, *regressor_cols]))

        logger.info(f"Prepared {len(self.feature_columns)} features for XGBoost")
        return df_enhanced

    def _get_prophet_predictions(
        self,
        prophet_result: Dict[str, Any],
        df: pd.DataFrame
    ) -> np.ndarray:
        """
        Get Prophet predictions for given dataframe.

        Args:
            prophet_result: Prophet training result; the fitted model is read
                from its 'model' key
            df: DataFrame with 'ds' column

        Returns:
            Array of 'yhat' predictions

        Raises:
            ValueError: If prophet_result has no 'model' entry.
        """
        prophet_model = prophet_result.get('model')
        if prophet_model is None:
            raise ValueError("Prophet model not found in result")

        # Pass 'ds' plus every column that is not a bookkeeping column, so any
        # regressors present in df reach the model.
        regressor_cols = [
            col for col in df.columns if col not in self._NON_REGRESSOR_COLUMNS
        ]
        pred_df = df[['ds', *regressor_cols]].copy()

        forecast = prophet_model.predict(pred_df)
        return forecast['yhat'].values

    async def _train_xgboost(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray
    ) -> "xgb.XGBRegressor":
        """
        Train XGBoost model on residuals.

        Args:
            X_train: Training features
            y_train: Training residuals
            X_val: Validation features (used as the early-stopping eval set)
            y_val: Validation residuals

        Returns:
            Trained XGBoost model
        """
        params = {
            'n_estimators': 100,
            'max_depth': 3,  # Shallow trees to prevent overfitting
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 3,
            'reg_alpha': 0.1,  # L1 regularization
            'reg_lambda': 1.0,  # L2 regularization
            'objective': 'reg:squarederror',
            'random_state': 42,
            'n_jobs': -1,
            # Early stopping is configured on the estimator (XGBoost >= 1.6);
            # the fit() keyword of the same name was removed in newer releases.
            'early_stopping_rounds': 10
        }
        model = xgb.XGBRegressor(**params)

        # fit() is blocking; run it in a worker thread to keep the event loop free.
        await asyncio.to_thread(
            model.fit,
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        logger.info(
            "XGBoost training complete",
            best_iteration=getattr(model, 'best_iteration', None)
        )
        return model

    @staticmethod
    def _pct_improvement(baseline: float, candidate: float) -> float:
        """Return the percentage reduction from baseline to candidate (0.0 if baseline is 0)."""
        if baseline == 0:
            # A relative improvement over a zero error is undefined; report
            # "no improvement" instead of dividing by zero.
            return 0.0
        return float(((baseline - candidate) / baseline) * 100)

    async def _evaluate_hybrid_model(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        train_prophet_pred: np.ndarray,
        val_prophet_pred: np.ndarray,
        prophet_result: Dict[str, Any]
    ) -> Dict[str, float]:
        """
        Evaluate hybrid model vs Prophet-only on the train and validation sets.

        Args:
            train_df: Training data (needs 'y' and self.feature_columns)
            val_df: Validation data (needs 'y' and self.feature_columns)
            train_prophet_pred: Prophet predictions on training set
            val_prophet_pred: Prophet predictions on validation set
            prophet_result: Prophet training result (currently unused here)

        Returns:
            Dictionary of MAE/MAPE metrics for both models plus the relative
            improvements; MAPE values are expressed in percent.
        """
        train_actual = train_df['y'].values
        val_actual = val_df['y'].values

        X_train = train_df[self.feature_columns].values
        X_val = val_df[self.feature_columns].values

        # predict() is blocking; run it in a worker thread.
        train_xgb_pred = await asyncio.to_thread(self.xgb_model.predict, X_train)
        val_xgb_pred = await asyncio.to_thread(self.xgb_model.predict, X_val)

        # Hybrid predictions = Prophet + XGBoost residual correction
        train_hybrid_pred = train_prophet_pred + train_xgb_pred
        val_hybrid_pred = val_prophet_pred + val_xgb_pred

        # Prophet-only metrics
        prophet_train_mae = mean_absolute_error(train_actual, train_prophet_pred)
        prophet_val_mae = mean_absolute_error(val_actual, val_prophet_pred)
        prophet_train_mape = mean_absolute_percentage_error(train_actual, train_prophet_pred) * 100
        prophet_val_mape = mean_absolute_percentage_error(val_actual, val_prophet_pred) * 100

        # Hybrid metrics
        hybrid_train_mae = mean_absolute_error(train_actual, train_hybrid_pred)
        hybrid_val_mae = mean_absolute_error(val_actual, val_hybrid_pred)
        hybrid_train_mape = mean_absolute_percentage_error(train_actual, train_hybrid_pred) * 100
        hybrid_val_mape = mean_absolute_percentage_error(val_actual, val_hybrid_pred) * 100

        # Relative improvement on the validation set (guarded against a zero baseline).
        mae_improvement = self._pct_improvement(prophet_val_mae, hybrid_val_mae)
        mape_improvement = self._pct_improvement(prophet_val_mape, hybrid_val_mape)

        return {
            'prophet_train_mae': float(prophet_train_mae),
            'prophet_val_mae': float(prophet_val_mae),
            'prophet_train_mape': float(prophet_train_mape),
            'prophet_val_mape': float(prophet_val_mape),
            'hybrid_train_mae': float(hybrid_train_mae),
            'hybrid_val_mae': float(hybrid_val_mae),
            'hybrid_train_mape': float(hybrid_train_mape),
            'hybrid_val_mape': float(hybrid_val_mape),
            'mae_improvement_pct': float(mae_improvement),
            'mape_improvement_pct': float(mape_improvement),
            'improvement_percentage': float(mape_improvement)  # Primary metric
        }

    def _package_hybrid_model(
        self,
        prophet_result: Dict[str, Any],
        metrics: Dict[str, float],
        tenant_id: str,
        inventory_product_id: str
    ) -> Dict[str, Any]:
        """
        Package hybrid model for storage.

        Args:
            prophet_result: Prophet model result (its 'model' entry is stored)
            metrics: Hybrid model metrics as produced by _evaluate_hybrid_model
            tenant_id: Tenant ID
            inventory_product_id: Product ID

        Returns:
            Model package dictionary containing both models, the feature
            column list, grouped metrics, identifiers and a UTC ISO-8601
            'trained_at' timestamp.
        """
        return {
            'model_type': 'hybrid_prophet_xgboost',
            'prophet_model': prophet_result.get('model'),
            'xgboost_model': self.xgb_model,
            'feature_columns': self.feature_columns,
            'prophet_metrics': {
                'train_mae': metrics['prophet_train_mae'],
                'val_mae': metrics['prophet_val_mae'],
                'train_mape': metrics['prophet_train_mape'],
                'val_mape': metrics['prophet_val_mape']
            },
            'hybrid_metrics': {
                'train_mae': metrics['hybrid_train_mae'],
                'val_mae': metrics['hybrid_val_mae'],
                'train_mape': metrics['hybrid_train_mape'],
                'val_mape': metrics['hybrid_val_mape']
            },
            'improvement_metrics': {
                'mae_improvement_pct': metrics['mae_improvement_pct'],
                'mape_improvement_pct': metrics['mape_improvement_pct']
            },
            'tenant_id': tenant_id,
            'inventory_product_id': inventory_product_id,
            'trained_at': datetime.now(timezone.utc).isoformat()
        }

    async def predict(
        self,
        future_df: pd.DataFrame,
        model_data: Dict[str, Any]
    ) -> pd.DataFrame:
        """
        Make predictions using hybrid model.

        Args:
            future_df: DataFrame with future dates ('ds') and regressors
            model_data: Loaded hybrid model data; reads 'prophet_model',
                'xgboost_model' and 'feature_columns'

        Returns:
            DataFrame with columns 'ds', 'prophet_yhat', 'xgb_adjustment',
            'yhat', 'yhat_lower' and 'yhat_upper', one row per row of
            future_df, on a fresh default index.
        """
        # Step 1: Prophet predictions (blocking call, run in a worker thread).
        prophet_model = model_data['prophet_model']
        prophet_forecast = await asyncio.to_thread(prophet_model.predict, future_df)

        # Step 2: Features for XGBoost.
        # NOTE: this helper also reassigns self.feature_columns; the columns
        # used below come from model_data, not from the instance.
        future_enhanced = self._prepare_xgboost_features(future_df)

        # Step 3: XGBoost residual predictions (blocking, worker thread).
        xgb_model = model_data['xgboost_model']
        feature_columns = model_data['feature_columns']
        X_future = future_enhanced[feature_columns].values
        xgb_pred = await asyncio.to_thread(xgb_model.predict, X_future)

        # Step 4: Combine predictions positionally. Everything is converted to
        # plain arrays first: mixing index-bearing Series from future_df and
        # the forecast would make pandas align on index labels and misalign or
        # fail when future_df does not carry a default 0..n-1 index.
        prophet_yhat = prophet_forecast['yhat'].to_numpy()
        hybrid_pred = prophet_yhat + xgb_pred

        # Step 5: Result dataframe. The Prophet interval is shifted by the
        # same XGBoost adjustment as the point forecast.
        return pd.DataFrame({
            'ds': future_df['ds'].to_numpy(),
            'prophet_yhat': prophet_yhat,
            'xgb_adjustment': xgb_pred,
            'yhat': hybrid_pred,
            'yhat_lower': prophet_forecast['yhat_lower'].to_numpy() + xgb_pred,
            'yhat_upper': prophet_forecast['yhat_upper'].to_numpy() + xgb_pred
        })