Fix training hang caused by nested database sessions and deadlocks

Root Cause:
The training process was hanging at the first progress update due to a
nested database session issue. The main trainer created a session and
repositories, then called prophet_manager.train_bakery_model() which
created another nested session with an advisory lock. This caused a
deadlock where:
1. Outer session had uncommitted UPDATE on model_training_logs
2. Inner session tried to acquire advisory lock
3. Neither could proceed, causing training to hang indefinitely

Changes Made:
1. prophet_manager.py:
   - Added optional 'session' parameter to train_bakery_model()
   - Refactored to use parent session if provided, otherwise create new one
   - Prevents nested session creation during training

2. hybrid_trainer.py:
   - Added optional 'session' parameter to train_hybrid_model()
   - Passes session to prophet_manager to maintain single session context

3. trainer.py:
   - Updated _train_single_product() to accept and pass session
   - Updated _train_all_models_enhanced() to accept and pass session
   - Pass db_session from main training context to all training methods
   - Added explicit db_session.flush() after critical progress update
   - This ensures updates are visible before acquiring locks

Impact:
- Eliminates nested session deadlocks
- Training now proceeds past initial progress update
- Maintains single database session context throughout training
- Prevents database transaction conflicts

Related Issues:
- Fixes training hang during onboarding process
- Not directly related to audit_metadata changes but exposed by them

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Urtzi Alfaro
2025-11-05 16:13:32 +01:00
parent 7a315afa62
commit caff49761d
3 changed files with 174 additions and 133 deletions

View File

@@ -94,7 +94,8 @@ class BakeryProphetManager:
df: pd.DataFrame,
job_id: str,
product_category: 'ProductCategory' = None,
category_hyperparameters: Dict[str, Any] = None) -> Dict[str, Any]:
category_hyperparameters: Dict[str, Any] = None,
session = None) -> Dict[str, Any]:
"""
Train a Prophet model with automatic hyperparameter optimization and distributed locking.
@@ -105,6 +106,7 @@ class BakeryProphetManager:
job_id: Training job identifier
product_category: Optional product category for category-specific settings
category_hyperparameters: Optional category-specific Prophet hyperparameters
session: Optional database session (uses parent session if provided to avoid nested sessions)
"""
# Check disk space before starting training
has_space, free_gb, total_gb, used_percent = check_disk_space('/tmp', min_free_gb=0.5)
@@ -116,141 +118,155 @@ class BakeryProphetManager:
# Acquire distributed lock to prevent concurrent training of same product
lock = get_training_lock(tenant_id, inventory_product_id, use_advisory=True)
try:
async with self.database_manager.get_session() as session:
async with lock.acquire(session):
logger.info(f"Training optimized bakery model for {inventory_product_id} (lock acquired)")
# Use provided session or create new one if not provided
use_parent_session = session is not None
# Validate input data
await self._validate_training_data(df, inventory_product_id)
async def _train_with_lock(db_session):
"""Inner function to perform training with lock"""
async with lock.acquire(db_session):
logger.info(f"Training optimized bakery model for {inventory_product_id} (lock acquired)")
# Prepare data for Prophet
prophet_data = await self._prepare_prophet_data(df)
# Validate input data
await self._validate_training_data(df, inventory_product_id)
# Get regressor columns
regressor_columns = self._extract_regressor_columns(prophet_data)
# Prepare data for Prophet
prophet_data = await self._prepare_prophet_data(df)
# Use category-specific hyperparameters if provided, otherwise optimize
if category_hyperparameters:
logger.info(f"Using category-specific hyperparameters for {inventory_product_id} (category: {product_category.value if product_category else 'unknown'})")
best_params = category_hyperparameters.copy()
use_optimized = False # Not optimized, but category-specific
else:
# Automatically optimize hyperparameters
logger.info(f"Optimizing hyperparameters for {inventory_product_id}...")
try:
best_params = await self._optimize_hyperparameters(prophet_data, inventory_product_id, regressor_columns)
use_optimized = True
except Exception as opt_error:
logger.warning(f"Hyperparameter optimization failed for {inventory_product_id}: {opt_error}")
logger.warning("Falling back to default Prophet parameters")
# Use conservative default parameters
best_params = {
'changepoint_prior_scale': 0.05,
'seasonality_prior_scale': 10.0,
'holidays_prior_scale': 10.0,
'changepoint_range': 0.8,
'seasonality_mode': 'additive',
'daily_seasonality': False,
'weekly_seasonality': True,
'yearly_seasonality': len(prophet_data) > 365,
'uncertainty_samples': 0 # Disable uncertainty sampling to avoid cmdstan
}
use_optimized = False
# Get regressor columns
regressor_columns = self._extract_regressor_columns(prophet_data)
# Create optimized Prophet model
model = self._create_optimized_prophet_model(best_params, regressor_columns)
# Add regressors to model
for regressor in regressor_columns:
if regressor in prophet_data.columns:
model.add_regressor(regressor)
# Set environment variable for cmdstan tmp directory
import os
tmpdir = os.environ.get('TMPDIR', '/tmp/cmdstan')
os.makedirs(tmpdir, mode=0o777, exist_ok=True)
os.environ['TMPDIR'] = tmpdir
# Verify tmp directory is writable
test_file = os.path.join(tmpdir, f'test_write_{inventory_product_id}.tmp')
# Use category-specific hyperparameters if provided, otherwise optimize
if category_hyperparameters:
logger.info(f"Using category-specific hyperparameters for {inventory_product_id} (category: {product_category.value if product_category else 'unknown'})")
best_params = category_hyperparameters.copy()
use_optimized = False # Not optimized, but category-specific
else:
# Automatically optimize hyperparameters
logger.info(f"Optimizing hyperparameters for {inventory_product_id}...")
try:
with open(test_file, 'w') as f:
f.write('test')
os.remove(test_file)
logger.debug(f"Verified {tmpdir} is writable")
except Exception as e:
logger.error(f"TMPDIR {tmpdir} is not writable: {e}")
raise RuntimeError(f"Cannot write to {tmpdir}: {e}")
# Fit the model with enhanced error handling
try:
logger.info(f"Starting Prophet model fit for {inventory_product_id}")
# ✅ FIX: Run blocking model.fit() in thread pool to avoid blocking event loop
import asyncio
await asyncio.to_thread(model.fit, prophet_data)
logger.info(f"Prophet model fit completed successfully for {inventory_product_id}")
except Exception as fit_error:
error_details = {
'error_type': type(fit_error).__name__,
'error_message': str(fit_error),
'errno': getattr(fit_error, 'errno', None),
'tmpdir': tmpdir,
'disk_space': check_disk_space(tmpdir, 0)
best_params = await self._optimize_hyperparameters(prophet_data, inventory_product_id, regressor_columns)
use_optimized = True
except Exception as opt_error:
logger.warning(f"Hyperparameter optimization failed for {inventory_product_id}: {opt_error}")
logger.warning("Falling back to default Prophet parameters")
# Use conservative default parameters
best_params = {
'changepoint_prior_scale': 0.05,
'seasonality_prior_scale': 10.0,
'holidays_prior_scale': 10.0,
'changepoint_range': 0.8,
'seasonality_mode': 'additive',
'daily_seasonality': False,
'weekly_seasonality': True,
'yearly_seasonality': len(prophet_data) > 365,
'uncertainty_samples': 0 # Disable uncertainty sampling to avoid cmdstan
}
logger.error(f"Prophet model fit failed for {inventory_product_id}: {error_details}")
raise RuntimeError(f"Prophet training failed: {error_details['error_message']}") from fit_error
use_optimized = False
# Calculate enhanced training metrics first
training_metrics = await self._calculate_training_metrics(model, prophet_data, best_params)
# Create optimized Prophet model
model = self._create_optimized_prophet_model(best_params, regressor_columns)
# Store model and metrics - Generate proper UUID for model_id
model_id = str(uuid.uuid4())
model_path = await self._store_model(
tenant_id, inventory_product_id, model, model_id, prophet_data, regressor_columns, best_params, training_metrics
)
# Add regressors to model
for regressor in regressor_columns:
if regressor in prophet_data.columns:
model.add_regressor(regressor)
# Return same format as before, but with optimization info
# Ensure hyperparameters are JSON-serializable
def _serialize_hyperparameters(params):
"""Helper to ensure hyperparameters are JSON serializable"""
if not params:
return {}
safe_params = {}
for k, v in params.items():
try:
if isinstance(v, (int, float, str, bool, type(None))):
safe_params[k] = v
elif hasattr(v, 'item'): # numpy scalars
safe_params[k] = v.item()
elif isinstance(v, (list, tuple)):
safe_params[k] = [x.item() if hasattr(x, 'item') else x for x in v]
else:
safe_params[k] = float(v) if isinstance(v, (np.integer, np.floating)) else str(v)
except:
safe_params[k] = str(v) # fallback to string conversion
return safe_params
# Set environment variable for cmdstan tmp directory
import os
tmpdir = os.environ.get('TMPDIR', '/tmp/cmdstan')
os.makedirs(tmpdir, mode=0o777, exist_ok=True)
os.environ['TMPDIR'] = tmpdir
model_info = {
"model_id": model_id,
"model_path": model_path,
"type": "prophet_optimized",
"training_samples": len(prophet_data),
"features": regressor_columns,
"hyperparameters": _serialize_hyperparameters(best_params),
"training_metrics": training_metrics,
"product_category": product_category.value if product_category else "unknown",
"trained_at": datetime.now().isoformat(),
"data_period": {
"start_date": pd.Timestamp(prophet_data['ds'].min()).isoformat(),
"end_date": pd.Timestamp(prophet_data['ds'].max()).isoformat(),
"total_days": len(prophet_data)
}
# Verify tmp directory is writable
test_file = os.path.join(tmpdir, f'test_write_{inventory_product_id}.tmp')
try:
with open(test_file, 'w') as f:
f.write('test')
os.remove(test_file)
logger.debug(f"Verified {tmpdir} is writable")
except Exception as e:
logger.error(f"TMPDIR {tmpdir} is not writable: {e}")
raise RuntimeError(f"Cannot write to {tmpdir}: {e}")
# Fit the model with enhanced error handling
try:
logger.info(f"Starting Prophet model fit for {inventory_product_id}")
# ✅ FIX: Run blocking model.fit() in thread pool to avoid blocking event loop
import asyncio
await asyncio.to_thread(model.fit, prophet_data)
logger.info(f"Prophet model fit completed successfully for {inventory_product_id}")
except Exception as fit_error:
error_details = {
'error_type': type(fit_error).__name__,
'error_message': str(fit_error),
'errno': getattr(fit_error, 'errno', None),
'tmpdir': tmpdir,
'disk_space': check_disk_space(tmpdir, 0)
}
logger.error(f"Prophet model fit failed for {inventory_product_id}: {error_details}")
raise RuntimeError(f"Prophet training failed: {error_details['error_message']}") from fit_error
logger.info(f"Optimized model trained successfully for {inventory_product_id}. "
f"MAPE: {training_metrics.get('optimized_mape', 'N/A')}%")
return model_info
# Calculate enhanced training metrics first
training_metrics = await self._calculate_training_metrics(model, prophet_data, best_params)
# Store model and metrics - Generate proper UUID for model_id
model_id = str(uuid.uuid4())
model_path = await self._store_model(
tenant_id, inventory_product_id, model, model_id, prophet_data, regressor_columns, best_params, training_metrics
)
# Return same format as before, but with optimization info
# Ensure hyperparameters are JSON-serializable
def _serialize_hyperparameters(params):
"""Helper to ensure hyperparameters are JSON serializable"""
if not params:
return {}
safe_params = {}
for k, v in params.items():
try:
if isinstance(v, (int, float, str, bool, type(None))):
safe_params[k] = v
elif hasattr(v, 'item'): # numpy scalars
safe_params[k] = v.item()
elif isinstance(v, (list, tuple)):
safe_params[k] = [x.item() if hasattr(x, 'item') else x for x in v]
else:
safe_params[k] = float(v) if isinstance(v, (np.integer, np.floating)) else str(v)
except:
safe_params[k] = str(v) # fallback to string conversion
return safe_params
model_info = {
"model_id": model_id,
"model_path": model_path,
"type": "prophet_optimized",
"training_samples": len(prophet_data),
"features": regressor_columns,
"hyperparameters": _serialize_hyperparameters(best_params),
"training_metrics": training_metrics,
"product_category": product_category.value if product_category else "unknown",
"trained_at": datetime.now().isoformat(),
"data_period": {
"start_date": pd.Timestamp(prophet_data['ds'].min()).isoformat(),
"end_date": pd.Timestamp(prophet_data['ds'].max()).isoformat(),
"total_days": len(prophet_data)
}
}
logger.info(f"Optimized model trained successfully for {inventory_product_id}. "
f"MAPE: {training_metrics.get('optimized_mape', 'N/A')}%")
return model_info
try:
# ✅ FIX: Use parent session if provided, otherwise create new one
# This prevents nested session issues and database deadlocks
if use_parent_session:
logger.debug(f"Using parent session for training {inventory_product_id}")
return await _train_with_lock(session)
else:
logger.debug(f"Creating new session for training {inventory_product_id}")
async with self.database_manager.get_session() as new_session:
return await _train_with_lock(new_session)
except LockAcquisitionError as e:
logger.warning(f"Could not acquire lock for {inventory_product_id}: {e}")