Improve AI logic

This commit is contained in:
Urtzi Alfaro
2025-11-05 13:34:56 +01:00
parent 5c87fbcf48
commit 394ad3aea4
218 changed files with 30627 additions and 7658 deletions

View File

@@ -39,7 +39,8 @@ router = APIRouter()
training_service = EnhancedTrainingService()
@router.get(
route_builder.build_base_route("models") + "/{inventory_product_id}/active"
route_builder.build_base_route("models") + "/{inventory_product_id}/active",
response_model=TrainedModelResponse
)
async def get_active_model(
tenant_id: str = Path(..., description="Tenant ID"),
@@ -90,21 +91,25 @@ async def get_active_model(
await db.commit()
return {
"model_id": str(model_record.id), # ✅ This is the correct field name
"model_id": str(model_record.id),
"tenant_id": str(model_record.tenant_id),
"inventory_product_id": str(model_record.inventory_product_id),
"model_type": model_record.model_type,
"model_path": model_record.model_path,
"features_used": model_record.features_used,
"hyperparameters": model_record.hyperparameters,
"version": 1, # Default version
"training_samples": model_record.training_samples or 0,
"features": model_record.features_used or [],
"hyperparameters": model_record.hyperparameters or {},
"training_metrics": {
"mape": model_record.mape,
"mae": model_record.mae,
"rmse": model_record.rmse,
"r2_score": model_record.r2_score
"mape": model_record.mape or 0.0,
"mae": model_record.mae or 0.0,
"rmse": model_record.rmse or 0.0,
"r2_score": model_record.r2_score or 0.0
},
"created_at": model_record.created_at.isoformat() if model_record.created_at else None,
"training_period": {
"start_date": model_record.training_start_date.isoformat() if model_record.training_start_date else None,
"end_date": model_record.training_end_date.isoformat() if model_record.training_end_date else None
}
"is_active": model_record.is_active,
"created_at": model_record.created_at,
"data_period_start": model_record.training_start_date,
"data_period_end": model_record.training_end_date
}
except HTTPException:

View File

@@ -17,6 +17,8 @@ from shared.database.base import create_database_manager
from shared.database.transactions import transactional
from shared.database.exceptions import DatabaseError
from app.core.config import settings
from app.ml.enhanced_features import AdvancedFeatureEngineer
import holidays
logger = structlog.get_logger()
@@ -26,16 +28,67 @@ class EnhancedBakeryDataProcessor:
Integrates date alignment, data cleaning, feature engineering, and preparation for ML models.
"""
def __init__(self, database_manager=None):
def __init__(self, database_manager=None, region: str = 'MD'):
self.database_manager = database_manager or create_database_manager(settings.DATABASE_URL, "training-service")
self.scalers = {} # Store scalers for each feature
self.imputers = {} # Store imputers for missing value handling
self.date_alignment_service = DateAlignmentService()
self.feature_engineer = AdvancedFeatureEngineer()
self.region = region # Region for holidays (MD=Madrid, PV=Basque, etc.)
self.spain_holidays = holidays.Spain(prov=region) # Initialize holidays library
def get_scalers(self) -> Dict[str, Any]:
"""Return the scalers/normalization parameters for use during prediction"""
return self.scalers.copy()
@staticmethod
def _extract_numeric_from_dict(value: Any) -> Optional[float]:
"""
Robust extraction of numeric values from complex data structures.
Handles various dict structures that might come from external APIs.
Args:
value: Any value that might be a dict, numeric, or other type
Returns:
Numeric value as float, or None if extraction fails
"""
# If already numeric, return it
if isinstance(value, (int, float)) and not isinstance(value, bool):
return float(value)
# If it's a dict, try multiple extraction strategies
if isinstance(value, dict):
# Strategy 1: Try common keys
for key in ['value', 'data', 'result', 'amount', 'count', 'number', 'val']:
if key in value:
extracted = value[key]
# Recursively extract if nested
if isinstance(extracted, dict):
return EnhancedBakeryDataProcessor._extract_numeric_from_dict(extracted)
elif isinstance(extracted, (int, float)) and not isinstance(extracted, bool):
return float(extracted)
# Strategy 2: Try to find first numeric value in dict
for v in value.values():
if isinstance(v, (int, float)) and not isinstance(v, bool):
return float(v)
elif isinstance(v, dict):
# Recursively try nested dicts
result = EnhancedBakeryDataProcessor._extract_numeric_from_dict(v)
if result is not None:
return result
# Strategy 3: Try to convert string to numeric
if isinstance(value, str):
try:
return float(value)
except (ValueError, TypeError):
pass
# If all strategies fail, return None (will be converted to NaN)
return None
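# A quick usage sketch of the helper above (inputs are illustrative, not from
# this diff; runnable as doctests if moved into the docstring):
#
#     >>> f = EnhancedBakeryDataProcessor._extract_numeric_from_dict
#     >>> f(21.5)                   # already numeric
#     21.5
#     >>> f({'value': 12.3})        # common key
#     12.3
#     >>> f({'data': {'val': 7}})   # nested dict, recursive extraction
#     7.0
#     >>> f('8.4')                  # numeric string
#     8.4
#     >>> f({'unit': 'mm'})         # nothing numeric -> None (NaN downstream)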
async def _get_repositories(self, session):
"""Initialize repositories with session"""
return {
@@ -117,9 +170,12 @@ class EnhancedBakeryDataProcessor:
daily_sales = self._merge_weather_features(daily_sales, weather_data)
daily_sales = self._merge_traffic_features(daily_sales, traffic_data)
# Step 6: Engineer additional features
# Step 6: Engineer basic features
daily_sales = self._engineer_features(daily_sales)
# Step 6b: Add advanced features (lagged, rolling, cyclical, interactions, trends)
daily_sales = self._add_advanced_features(daily_sales)
# Step 7: Handle missing values
daily_sales = self._handle_missing_values(daily_sales)
@@ -177,52 +233,73 @@ class EnhancedBakeryDataProcessor:
async def prepare_prediction_features(self,
future_dates: pd.DatetimeIndex,
weather_forecast: pd.DataFrame = None,
traffic_forecast: pd.DataFrame = None) -> pd.DataFrame:
traffic_forecast: pd.DataFrame = None,
historical_data: pd.DataFrame = None) -> pd.DataFrame:
"""
Create features for future predictions with proper date handling.
Args:
future_dates: Future dates to predict
weather_forecast: Weather forecast data
traffic_forecast: Traffic forecast data
historical_data: Historical data for creating lagged and rolling features
Returns:
DataFrame with features for prediction
"""
try:
# Create base future dataframe
future_df = pd.DataFrame({'ds': future_dates})
# Add temporal features
future_df = self._add_temporal_features(
future_df.rename(columns={'ds': 'date'})
).rename(columns={'date': 'ds'})
# Add weather features
if weather_forecast is not None and not weather_forecast.empty:
weather_features = weather_forecast.copy()
if 'date' in weather_features.columns:
weather_features = weather_features.rename(columns={'date': 'ds'})
future_df = future_df.merge(weather_features, on='ds', how='left')
# Add traffic features
# Add traffic features
if traffic_forecast is not None and not traffic_forecast.empty:
traffic_features = traffic_forecast.copy()
if 'date' in traffic_features.columns:
traffic_features = traffic_features.rename(columns={'date': 'ds'})
future_df = future_df.merge(traffic_features, on='ds', how='left')
# Engineer additional features
# Engineer basic features
future_df = self._engineer_features(future_df.rename(columns={'ds': 'date'}))
# Add advanced features if historical data is provided
if historical_data is not None and not historical_data.empty:
# Combine historical and future data to calculate lagged/rolling features
combined_df = pd.concat([
historical_data.rename(columns={'ds': 'date'}),
future_df
], ignore_index=True).sort_values('date')
# Apply advanced features to combined data
combined_df = self._add_advanced_features(combined_df)
# Extract only the future rows
future_df = combined_df[combined_df['date'].isin(future_df['date'])].copy()
else:
# Without historical data, add advanced features with NaN for lags
logger.warning("No historical data provided, lagged features will be NaN")
future_df = self._add_advanced_features(future_df)
future_df = future_df.rename(columns={'date': 'ds'})
# Handle missing values in future data
future_df = self._handle_missing_values_future(future_df)
return future_df
except Exception as e:
logger.error("Error creating prediction features", error=str(e))
# Return minimal features if error
@@ -428,19 +505,40 @@ class EnhancedBakeryDataProcessor:
for standard_name, possible_names in weather_mapping.items():
for possible_name in possible_names:
if possible_name in weather_clean.columns:
weather_clean[standard_name] = pd.to_numeric(weather_clean[possible_name], errors='coerce')
# Extract numeric values using robust helper function
try:
# Check if column contains dict-like objects
has_dicts = weather_clean[possible_name].apply(lambda x: isinstance(x, dict)).any()
if has_dicts:
logger.warning(f"Weather column {possible_name} contains dict objects, extracting numeric values")
# Use robust extraction for all values
weather_clean[standard_name] = weather_clean[possible_name].apply(
self._extract_numeric_from_dict
)
else:
# Direct numeric conversion for simple values
weather_clean[standard_name] = pd.to_numeric(weather_clean[possible_name], errors='coerce')
except Exception as e:
logger.warning(f"Error converting weather column {possible_name}: {e}")
# Fallback: try to extract from each value
weather_clean[standard_name] = weather_clean[possible_name].apply(
self._extract_numeric_from_dict
)
weather_features.append(standard_name)
break
# Keep only the features we found
weather_clean = weather_clean[weather_features].copy()
# Merge with sales data
merged = daily_sales.merge(weather_clean, on='date', how='left')
# Fill missing weather values with Madrid-appropriate defaults
for feature, default_value in weather_defaults.items():
if feature in merged.columns:
# Ensure the column is numeric before filling
merged[feature] = pd.to_numeric(merged[feature], errors='coerce')
merged[feature] = merged[feature].fillna(default_value)
return merged
@@ -494,16 +592,35 @@ class EnhancedBakeryDataProcessor:
for standard_name, possible_names in traffic_mapping.items():
for possible_name in possible_names:
if possible_name in traffic_clean.columns:
traffic_clean[standard_name] = pd.to_numeric(traffic_clean[possible_name], errors='coerce')
# Extract numeric values using robust helper function
try:
# Check if column contains dict-like objects
has_dicts = traffic_clean[possible_name].apply(lambda x: isinstance(x, dict)).any()
if has_dicts:
logger.warning(f"Traffic column {possible_name} contains dict objects, extracting numeric values")
# Use robust extraction for all values
traffic_clean[standard_name] = traffic_clean[possible_name].apply(
self._extract_numeric_from_dict
)
else:
# Direct numeric conversion for simple values
traffic_clean[standard_name] = pd.to_numeric(traffic_clean[possible_name], errors='coerce')
except Exception as e:
logger.warning(f"Error converting traffic column {possible_name}: {e}")
# Fallback: try to extract from each value
traffic_clean[standard_name] = traffic_clean[possible_name].apply(
self._extract_numeric_from_dict
)
traffic_features.append(standard_name)
break
# Keep only the features we found
traffic_clean = traffic_clean[traffic_features].copy()
# Merge with sales data
merged = daily_sales.merge(traffic_clean, on='date', how='left')
# Fill missing traffic values with reasonable defaults
traffic_defaults = {
'traffic_volume': 100.0,
@@ -511,9 +628,11 @@ class EnhancedBakeryDataProcessor:
'congestion_level': 1.0, # Low congestion
'average_speed': 30.0 # km/h typical for Madrid
}
for feature, default_value in traffic_defaults.items():
if feature in merged.columns:
# Ensure the column is numeric before filling
merged[feature] = pd.to_numeric(merged[feature], errors='coerce')
merged[feature] = merged[feature].fillna(default_value)
return merged
@@ -530,17 +649,23 @@ class EnhancedBakeryDataProcessor:
# Weather-based features
if 'temperature' in df.columns:
# Ensure temperature is numeric (defensive check)
df['temperature'] = pd.to_numeric(df['temperature'], errors='coerce').fillna(15.0)
df['temp_squared'] = df['temperature'] ** 2
df['is_hot_day'] = (df['temperature'] > 25).astype(int) # Hot days in Madrid
df['is_cold_day'] = (df['temperature'] < 10).astype(int) # Cold days
df['is_pleasant_day'] = ((df['temperature'] >= 18) & (df['temperature'] <= 25)).astype(int)
# Temperature categories for bakery products
df['temp_category'] = pd.cut(df['temperature'],
bins=[-np.inf, 5, 15, 25, np.inf],
df['temp_category'] = pd.cut(df['temperature'],
bins=[-np.inf, 5, 15, 25, np.inf],
labels=[0, 1, 2, 3]).astype(int)
if 'precipitation' in df.columns:
# Ensure precipitation is numeric (defensive check)
df['precipitation'] = pd.to_numeric(df['precipitation'], errors='coerce').fillna(0.0)
df['is_rainy_day'] = (df['precipitation'] > 0.1).astype(int)
df['is_heavy_rain'] = (df['precipitation'] > 10).astype(int)
df['rain_intensity'] = pd.cut(df['precipitation'],
@@ -549,10 +674,13 @@ class EnhancedBakeryDataProcessor:
# Traffic-based features with NaN protection
if 'traffic_volume' in df.columns:
# Ensure traffic_volume is numeric (defensive check)
df['traffic_volume'] = pd.to_numeric(df['traffic_volume'], errors='coerce').fillna(100.0)
# Calculate traffic quantiles for relative measures
q75 = df['traffic_volume'].quantile(0.75)
q25 = df['traffic_volume'].quantile(0.25)
df['high_traffic'] = (df['traffic_volume'] > q75).astype(int)
df['low_traffic'] = (df['traffic_volume'] < q25).astype(int)
@@ -578,7 +706,15 @@ class EnhancedBakeryDataProcessor:
# Fill any remaining NaN values
df['traffic_normalized'] = df['traffic_normalized'].fillna(0.0)
# Ensure other weather features are numeric if they exist
for weather_col in ['humidity', 'wind_speed', 'pressure', 'pedestrian_count', 'congestion_level', 'average_speed']:
if weather_col in df.columns:
df[weather_col] = pd.to_numeric(df[weather_col], errors='coerce').fillna(
{'humidity': 60.0, 'wind_speed': 5.0, 'pressure': 1013.0,
'pedestrian_count': 50.0, 'congestion_level': 1.0, 'average_speed': 30.0}.get(weather_col, 0.0)
)
# Interaction features - bakery specific
if 'is_weekend' in df.columns and 'temperature' in df.columns:
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
@@ -619,7 +755,39 @@ class EnhancedBakeryDataProcessor:
column=col,
nan_count=nan_count)
df[col] = df[col].fillna(0.0)
return df
def _add_advanced_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Add advanced features using AdvancedFeatureEngineer.
Includes lagged features, rolling statistics, cyclical encoding, and trend features.
"""
df = df.copy()
logger.info("Adding advanced features (lagged, rolling, cyclical, trends)")
# Reset feature engineer to clear previous features
self.feature_engineer = AdvancedFeatureEngineer()
# Create all advanced features at once
df = self.feature_engineer.create_all_features(
df,
date_column='date',
include_lags=True,
include_rolling=True,
include_interactions=True,
include_cyclical=True
)
# Fill NA values from lagged and rolling features
df = self.feature_engineer.fill_na_values(df, strategy='forward_backward')
# Store created feature columns for later reference
created_features = self.feature_engineer.get_feature_columns()
logger.info(f"Added {len(created_features)} advanced features",
features=created_features[:10]) # Log first 10 for brevity
return df
def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -733,46 +901,83 @@ class EnhancedBakeryDataProcessor:
return 4 # Autumn
def _is_spanish_holiday(self, date: datetime) -> bool:
"""Check if a date is a major Spanish holiday"""
month_day = (date.month, date.day)
# Major Spanish holidays that affect bakery sales
spanish_holidays = [
(1, 1), # New Year
(1, 6), # Epiphany (Reyes)
(5, 1), # Labour Day
(8, 15), # Assumption
(10, 12), # National Day
(11, 1), # All Saints
(12, 6), # Constitution
(12, 8), # Immaculate Conception
(12, 25), # Christmas
(5, 15), # San Isidro (Madrid patron saint)
(5, 2), # Madrid Community Day
]
return month_day in spanish_holidays
"""
Check if a date is a Spanish holiday using holidays library.
Supports dynamic Easter calculation and regional holidays.
"""
try:
# Convert to date if datetime
if isinstance(date, datetime):
date = date.date()
elif isinstance(date, pd.Timestamp):
date = date.date()
# Check if date is in holidays
return date in self.spain_holidays
except Exception as e:
logger.warning(f"Error checking holiday status for {date}: {e}")
# Fallback to checking basic holidays
month_day = (date.month, date.day)
basic_holidays = [
(1, 1), (1, 6), (5, 1), (8, 15), (10, 12),
(11, 1), (12, 6), (12, 8), (12, 25)
]
return month_day in basic_holidays
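# Minimal sketch of the holidays-library lookups relied on above, assuming a
# holidays version that accepts prov= for Spanish regions (newer releases use
# subdiv= instead):
#
#     >>> import holidays
#     >>> madrid = holidays.Spain(prov='MD')
#     >>> datetime(2025, 1, 6).date() in madrid    # Epiphany (national)
#     True
#     >>> datetime(2025, 5, 2).date() in madrid    # Madrid Community Day (regional)
#     True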
def _is_school_holiday(self, date: datetime) -> bool:
"""Check if a date is during school holidays (approximate)"""
month = date.month
# Approximate Spanish school holiday periods
# Summer holidays (July-August)
if month in [7, 8]:
return True
# Christmas holidays (mid December to early January)
if month == 12 and date.day >= 20:
return True
if month == 1 and date.day <= 10:
return True
# Easter holidays (approximate - early April)
if month == 4 and date.day <= 15:
return True
return False
"""
Check if a date is during school holidays in Spain.
Uses dynamic Easter calculation and standard Spanish school calendar.
"""
try:
from datetime import timedelta
import holidays as hol
# Convert to date if datetime
if isinstance(date, datetime):
check_date = date.date()
elif isinstance(date, pd.Timestamp):
check_date = date.date()
else:
check_date = date
month = check_date.month
day = check_date.day
# Summer holidays (July 1 - August 31)
if month in [7, 8]:
return True
# Christmas holidays (December 23 - January 7)
if (month == 12 and day >= 23) or (month == 1 and day <= 7):
return True
# Easter/Spring break (Semana Santa)
# Calculate Easter for this year
year = check_date.year
spain_hol = hol.Spain(years=year, prov=self.region)
# Find Easter dates (Viernes Santo - Good Friday, and nearby days)
# Easter break typically spans 1 week before and after Easter Sunday
for holiday_date, holiday_name in spain_hol.items():
if 'viernes santo' in holiday_name.lower() or 'easter' in holiday_name.lower():
# Easter break: 7 days before and 7 days after
easter_start = holiday_date - timedelta(days=7)
easter_end = holiday_date + timedelta(days=7)
if easter_start <= check_date <= easter_end:
return True
return False
except Exception as e:
logger.warning(f"Error checking school holiday for {date}: {e}")
# Fallback to simple approximation
month = date.month
day = date.day
return (month in [7, 8] or
(month == 12 and day >= 23) or
(month == 1 and day <= 7) or
(month == 4 and 1 <= day <= 15)) # Approximate Easter
async def calculate_feature_importance(self,
model_data: pd.DataFrame,

View File

@@ -0,0 +1,347 @@
"""
Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models
Adds lagged features, rolling statistics, and advanced interactions
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
logger = structlog.get_logger()
class AdvancedFeatureEngineer:
"""
Advanced feature engineering for hybrid forecasting models.
Adds lagged features, rolling statistics, and complex interactions.
"""
def __init__(self):
self.feature_columns = []
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
"""
Add lagged demand features for capturing recent trends.
Args:
df: DataFrame with 'quantity' column
lag_days: List of lag periods (default: [1, 7, 14])
Returns:
DataFrame with added lagged features
"""
if lag_days is None:
lag_days = [1, 7, 14]
df = df.copy()
for lag in lag_days:
col_name = f'lag_{lag}_day'
df[col_name] = df['quantity'].shift(lag)
self.feature_columns.append(col_name)
logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
return df
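# Illustrative example on hypothetical data: lags simply shift the series, so
# the first `lag` rows are NaN until enough history exists.
#
#     >>> df = pd.DataFrame({'date': pd.date_range('2025-01-01', periods=10),
#     ...                    'quantity': [12, 15, 14, 18, 20, 22, 25, 24, 23, 26]})
#     >>> df = AdvancedFeatureEngineer().add_lagged_features(df, lag_days=[1, 7])
#     >>> float(df['lag_1_day'].iloc[1]), float(df['lag_7_day'].iloc[7])
#     (12.0, 12.0)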
def add_rolling_features(
self,
df: pd.DataFrame,
windows: List[int] = None,
features: List[str] = None
) -> pd.DataFrame:
"""
Add rolling statistics (mean, std, max, min).
Args:
df: DataFrame with 'quantity' column
windows: List of window sizes (default: [7, 14, 30])
features: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
Returns:
DataFrame with rolling features
"""
if windows is None:
windows = [7, 14, 30]
if features is None:
features = ['mean', 'std', 'max', 'min']
df = df.copy()
for window in windows:
for feature in features:
col_name = f'rolling_{feature}_{window}d'
if feature == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean()
elif feature == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std()
elif feature == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max()
elif feature == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min()
self.feature_columns.append(col_name)
logger.info(f"Added rolling features", windows=windows, features=features)
return df
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add enhanced day-of-week features.
Args:
df: DataFrame with date column
date_column: Name of date column
Returns:
DataFrame with day-of-week features
"""
df = df.copy()
# Day of week (0=Monday, 6=Sunday)
df['day_of_week'] = df[date_column].dt.dayofweek
# Is weekend
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
# Is Friday (often higher demand due to weekend prep)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)
# Is Monday (often lower demand after weekend)
df['is_monday'] = (df['day_of_week'] == 0).astype(int)
# Add to feature list
for col in ['day_of_week', 'is_weekend', 'is_friday', 'is_monday']:
if col not in self.feature_columns:
self.feature_columns.append(col)
return df
def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add enhanced calendar features beyond basic temporal features.
Args:
df: DataFrame with date column
date_column: Name of date column
Returns:
DataFrame with enhanced calendar features
"""
df = df.copy()
# Month and quarter (if not already present)
if 'month' not in df.columns:
df['month'] = df[date_column].dt.month
if 'quarter' not in df.columns:
df['quarter'] = df[date_column].dt.quarter
# Day of month
df['day_of_month'] = df[date_column].dt.day
# Is month start/end
df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
df['is_month_end'] = (df[date_column].dt.is_month_end).astype(int)
# Week of year
df['week_of_year'] = df[date_column].dt.isocalendar().week
# Payday indicators (15th and last day of month - high bakery traffic)
df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)
# Add to feature list
for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
'week_of_year', 'is_payday']:
if col not in self.feature_columns:
self.feature_columns.append(col)
return df
def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Add interaction features between variables.
Args:
df: DataFrame with base features
Returns:
DataFrame with interaction features
"""
df = df.copy()
# Weekend × Temperature (people buy more cold drinks in hot weekends)
if 'is_weekend' in df.columns and 'temperature' in df.columns:
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
self.feature_columns.append('weekend_temp_interaction')
# Rain × Weekend (bad weather reduces weekend traffic)
if 'is_weekend' in df.columns and 'precipitation' in df.columns:
df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int)
self.feature_columns.append('rain_weekend_interaction')
# Friday × Traffic (high Friday traffic means weekend prep buying)
if 'is_friday' in df.columns and 'traffic_volume' in df.columns:
df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume']
self.feature_columns.append('friday_traffic_interaction')
# Month × Temperature (seasonal temperature patterns)
if 'month' in df.columns and 'temperature' in df.columns:
df['month_temp_interaction'] = df['month'] * df['temperature']
self.feature_columns.append('month_temp_interaction')
# Payday × Weekend (big shopping days)
if 'is_payday' in df.columns and 'is_weekend' in df.columns:
df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend']
self.feature_columns.append('payday_weekend_interaction')
logger.info(f"Added {len([c for c in self.feature_columns if 'interaction' in c])} interaction features")
return df
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add trend-based features.
Args:
df: DataFrame with date and quantity
date_column: Name of date column
Returns:
DataFrame with trend features
"""
df = df.copy()
# Days since start (linear trend proxy)
df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
# Momentum indicators (recent change vs. older change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
self.feature_columns.append('momentum_1_7')
if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
self.feature_columns.append('trend_7_30')
# Velocity (rate of change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
self.feature_columns.append('velocity_week')
self.feature_columns.append('days_since_start')
return df
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Add cyclical encoding for periodic features (day_of_week, month).
Helps models understand that Monday follows Sunday and January follows December.
Args:
df: DataFrame with day_of_week and month columns
Returns:
DataFrame with cyclical features
"""
df = df.copy()
# Day of week cyclical encoding
if 'day_of_week' in df.columns:
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
self.feature_columns.extend(['day_of_week_sin', 'day_of_week_cos'])
# Month cyclical encoding
if 'month' in df.columns:
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
self.feature_columns.extend(['month_sin', 'month_cos'])
logger.info("Added cyclical encoding for temporal features")
return df
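# Quick numeric check of why this helps (a sketch): in (sin, cos) space,
# December and January land close together, unlike raw month numbers.
#
#     >>> enc = lambda m: np.array([np.sin(2 * np.pi * m / 12), np.cos(2 * np.pi * m / 12)])
#     >>> round(float(np.linalg.norm(enc(12) - enc(1))), 2)   # adjacent months
#     0.52
#     >>> round(float(np.linalg.norm(enc(12) - enc(6))), 2)   # half a year apart
#     2.0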
def create_all_features(
self,
df: pd.DataFrame,
date_column: str = 'date',
include_lags: bool = True,
include_rolling: bool = True,
include_interactions: bool = True,
include_cyclical: bool = True
) -> pd.DataFrame:
"""
Create all enhanced features in one go.
Args:
df: DataFrame with base data
date_column: Name of date column
include_lags: Whether to include lagged features
include_rolling: Whether to include rolling statistics
include_interactions: Whether to include interaction features
include_cyclical: Whether to include cyclical encoding
Returns:
DataFrame with all enhanced features
"""
logger.info("Creating comprehensive feature set for hybrid model")
# Reset feature list
self.feature_columns = []
# Day of week and calendar features (always needed)
df = self.add_day_of_week_features(df, date_column)
df = self.add_calendar_enhanced_features(df, date_column)
# Optional features
if include_lags:
df = self.add_lagged_features(df)
if include_rolling:
df = self.add_rolling_features(df)
if include_interactions:
df = self.add_interaction_features(df)
if include_cyclical:
df = self.add_cyclical_encoding(df)
# Trend features (depends on lags and rolling)
if include_lags or include_rolling:
df = self.add_trend_features(df, date_column)
logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model")
return df
def get_feature_columns(self) -> List[str]:
"""Get list of all created feature column names."""
return self.feature_columns.copy()
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
"""
Fill NA values in lagged and rolling features.
Args:
df: DataFrame with potential NA values
strategy: 'forward_backward', 'zero', 'mean'
Returns:
DataFrame with filled NA values
"""
df = df.copy()
if strategy == 'forward_backward':
# Forward fill first (use previous values); fillna(method=...) was removed in pandas 2.x
df = df.ffill()
# Backward fill remaining (beginning of series)
df = df.bfill()
elif strategy == 'zero':
df = df.fillna(0)
elif strategy == 'mean':
df = df.fillna(df.mean(numeric_only=True))
return df
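# End-to-end sketch on hypothetical daily sales (only 'date' and 'quantity'
# are required; weather/traffic interactions are added only when those
# columns exist):
#
#     >>> fe = AdvancedFeatureEngineer()
#     >>> df = pd.DataFrame({'date': pd.date_range('2025-01-01', periods=60),
#     ...                    'quantity': np.random.default_rng(0).poisson(20, 60).astype(float)})
#     >>> df = fe.fill_na_values(fe.create_all_features(df, date_column='date'))
#     >>> sorted(fe.get_feature_columns())[:3]
#     ['day_of_month', 'day_of_week', 'day_of_week_cos']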

View File

@@ -0,0 +1,253 @@
"""
Event Feature Generator
Converts calendar events into features for demand forecasting
"""
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
from datetime import date, timedelta
import structlog
logger = structlog.get_logger()
class EventFeatureGenerator:
"""
Generate event-related features for demand forecasting.
Features include:
- Binary flags for event presence
- Event impact multipliers
- Event type indicators
- Days until/since major events
"""
# Event type impact weights (default multipliers)
EVENT_IMPACT_WEIGHTS = {
'promotion': 1.3,
'festival': 1.8,
'holiday': 0.7, # Bakeries often close or have reduced demand
'weather_event': 0.8, # Bad weather reduces foot traffic
'school_break': 1.2,
'sport_event': 1.4,
'market': 1.5,
'concert': 1.3,
'local_event': 1.2
}
def __init__(self):
pass
def generate_event_features(
self,
dates: pd.DatetimeIndex,
events: List[Dict[str, Any]]
) -> pd.DataFrame:
"""
Generate event features for given dates.
Args:
dates: Dates to generate features for
events: List of event dictionaries with keys:
- event_date: date
- event_type: str
- impact_multiplier: float (optional)
- event_name: str
Returns:
DataFrame with event features
"""
df = pd.DataFrame({'date': dates})
# Initialize feature columns
df['has_event'] = 0
df['event_impact'] = 1.0 # Neutral impact
df['is_promotion'] = 0
df['is_festival'] = 0
df['is_local_event'] = 0
df['days_to_next_event'] = 365
df['days_since_last_event'] = 365
if not events:
logger.debug("No events provided, returning default features")
return df
# Convert events to DataFrame for easier processing
events_df = pd.DataFrame(events)
events_df['event_date'] = pd.to_datetime(events_df['event_date'])
for idx, row in df.iterrows():
current_date = pd.to_datetime(row['date'])
# Check if there's an event on this date
day_events = events_df[events_df['event_date'] == current_date]
if not day_events.empty:
df.at[idx, 'has_event'] = 1
# Use custom impact multiplier if provided, else use default
if 'impact_multiplier' in day_events.columns and not day_events['impact_multiplier'].isna().all():
impact = day_events['impact_multiplier'].max()
else:
# Use default impact based on event type
event_types = day_events['event_type'].tolist()
impacts = [self.EVENT_IMPACT_WEIGHTS.get(et, 1.0) for et in event_types]
impact = max(impacts)
df.at[idx, 'event_impact'] = impact
# Set event type flags
event_types = day_events['event_type'].tolist()
if 'promotion' in event_types:
df.at[idx, 'is_promotion'] = 1
if 'festival' in event_types:
df.at[idx, 'is_festival'] = 1
if 'local_event' in event_types or 'market' in event_types:
df.at[idx, 'is_local_event'] = 1
# Calculate days to/from nearest event
future_events = events_df[events_df['event_date'] > current_date]
if not future_events.empty:
next_event_date = future_events['event_date'].min()
df.at[idx, 'days_to_next_event'] = (next_event_date - current_date).days
past_events = events_df[events_df['event_date'] < current_date]
if not past_events.empty:
last_event_date = past_events['event_date'].max()
df.at[idx, 'days_since_last_event'] = (current_date - last_event_date).days
# Cap days values at 365
df['days_to_next_event'] = df['days_to_next_event'].clip(upper=365)
df['days_since_last_event'] = df['days_since_last_event'].clip(upper=365)
logger.debug("Generated event features",
total_days=len(df),
days_with_events=df['has_event'].sum())
return df
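# Usage sketch with hypothetical events (field names per the docstring above):
#
#     >>> gen = EventFeatureGenerator()
#     >>> dates = pd.date_range('2025-06-01', periods=3)
#     >>> events = [{'event_date': '2025-06-02', 'event_type': 'festival',
#     ...            'event_name': 'Fiesta local'}]
#     >>> feats = gen.generate_event_features(dates, events)
#     >>> feats.loc[1, ['has_event', 'event_impact', 'is_festival']].tolist()
#     [1, 1.8, 1]
#
# With no impact_multiplier supplied, 1.8 comes from the default festival
# weight in EVENT_IMPACT_WEIGHTS.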
def add_event_features_to_forecast_data(
self,
forecast_data: pd.DataFrame,
event_features: pd.DataFrame
) -> pd.DataFrame:
"""
Add event features to forecast input data.
Args:
forecast_data: Existing forecast data with 'date' column
event_features: Event features from generate_event_features()
Returns:
Enhanced forecast data with event features
"""
forecast_data = forecast_data.copy()
forecast_data['date'] = pd.to_datetime(forecast_data['date'])
event_features['date'] = pd.to_datetime(event_features['date'])
# Merge event features
enhanced_data = forecast_data.merge(
event_features[[
'date', 'has_event', 'event_impact', 'is_promotion',
'is_festival', 'is_local_event', 'days_to_next_event',
'days_since_last_event'
]],
on='date',
how='left'
)
# Fill missing with defaults (assignment instead of chained inplace fillna,
# which is deprecated in pandas 2.x)
event_defaults = {
'has_event': 0, 'event_impact': 1.0, 'is_promotion': 0,
'is_festival': 0, 'is_local_event': 0,
'days_to_next_event': 365, 'days_since_last_event': 365
}
enhanced_data = enhanced_data.fillna(event_defaults)
return enhanced_data
def get_event_summary(self, events: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Get summary statistics about events.
Args:
events: List of event dictionaries
Returns:
Summary dict with counts by type, avg impact, etc.
"""
if not events:
return {
'total_events': 0,
'events_by_type': {},
'avg_impact': 1.0
}
events_df = pd.DataFrame(events)
summary = {
'total_events': len(events),
'events_by_type': events_df['event_type'].value_counts().to_dict(),
'date_range': {
'start': events_df['event_date'].min().isoformat() if not events_df.empty else None,
'end': events_df['event_date'].max().isoformat() if not events_df.empty else None
}
}
if 'impact_multiplier' in events_df.columns:
summary['avg_impact'] = float(events_df['impact_multiplier'].mean())
return summary
def create_event_calendar_features(
dates: pd.DatetimeIndex,
tenant_id: str,
event_repository = None
) -> pd.DataFrame:
"""
Convenience function to fetch events from database and generate features.
Args:
dates: Dates to generate features for
tenant_id: Tenant UUID
event_repository: EventRepository instance (optional)
Returns:
DataFrame with event features
"""
if event_repository is None:
logger.warning("No event repository provided, using empty events")
events = []
else:
# Fetch events from database
from datetime import date
start_date = dates.min().date()
end_date = dates.max().date()
try:
import asyncio
from uuid import UUID
# asyncio.get_event_loop() without a running loop is deprecated since Python 3.10;
# asyncio.run() creates and closes a fresh loop (this helper must therefore be
# called from synchronous code, not from inside an async context)
events_objects = asyncio.run(
event_repository.get_events_by_date_range(
tenant_id=UUID(tenant_id),
start_date=start_date,
end_date=end_date,
confirmed_only=False
)
)
# Convert to dict format
events = [event.to_dict() for event in events_objects]
except Exception as e:
logger.error(f"Failed to fetch events from database: {e}")
events = []
# Generate features
generator = EventFeatureGenerator()
return generator.generate_event_features(dates, events)

View File

@@ -0,0 +1,447 @@
"""
Hybrid Prophet + XGBoost Trainer
Combines Prophet's seasonality modeling with XGBoost's pattern learning
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import structlog
from datetime import datetime
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
import warnings
warnings.filterwarnings('ignore')
# Import XGBoost
try:
import xgboost as xgb
except ImportError:
raise ImportError("XGBoost not installed. Run: pip install xgboost")
from app.ml.prophet_manager import BakeryProphetManager
from app.ml.enhanced_features import AdvancedFeatureEngineer
logger = structlog.get_logger()
class HybridProphetXGBoost:
"""
Hybrid forecasting model combining Prophet and XGBoost.
Approach:
1. Train Prophet on historical data (captures trend, seasonality, holidays)
2. Calculate residuals (actual - prophet_prediction)
3. Train XGBoost on residuals using enhanced features
4. Final prediction = prophet_prediction + xgboost_residual_prediction
Benefits:
- Prophet handles seasonality, holidays, trends
- XGBoost captures complex patterns Prophet misses
- Maintains Prophet's interpretability
- Can improve accuracy by roughly 10-25% over Prophet alone
"""
def __init__(self, database_manager=None):
self.prophet_manager = BakeryProphetManager(database_manager)
self.feature_engineer = AdvancedFeatureEngineer()
self.xgb_model = None
self.feature_columns = []
self.prophet_model_data = None
async def train_hybrid_model(
self,
tenant_id: str,
inventory_product_id: str,
df: pd.DataFrame,
job_id: str,
validation_split: float = 0.2
) -> Dict[str, Any]:
"""
Train hybrid Prophet + XGBoost model.
Args:
tenant_id: Tenant identifier
inventory_product_id: Product identifier
df: Training data (must have 'ds', 'y' and regressor columns)
job_id: Training job identifier
validation_split: Fraction of data for validation
Returns:
Dictionary with model metadata and performance metrics
"""
logger.info(
"Starting hybrid Prophet + XGBoost training",
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
data_points=len(df)
)
# Step 1: Train Prophet model (base forecaster)
logger.info("Step 1: Training Prophet base model")
prophet_result = await self.prophet_manager.train_bakery_model(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
df=df.copy(),
job_id=job_id
)
self.prophet_model_data = prophet_result
# Step 2: Create enhanced features for XGBoost
logger.info("Step 2: Engineering enhanced features for XGBoost")
df_enhanced = self._prepare_xgboost_features(df)
# Step 3: Split into train/validation
split_idx = int(len(df_enhanced) * (1 - validation_split))
train_df = df_enhanced.iloc[:split_idx].copy()
val_df = df_enhanced.iloc[split_idx:].copy()
logger.info(
"Data split",
train_samples=len(train_df),
val_samples=len(val_df)
)
# Step 4: Get Prophet predictions on training data
logger.info("Step 3: Generating Prophet predictions for residual calculation")
train_prophet_pred = self._get_prophet_predictions(prophet_result, train_df)
val_prophet_pred = self._get_prophet_predictions(prophet_result, val_df)
# Step 5: Calculate residuals (actual - prophet_prediction)
train_residuals = train_df['y'].values - train_prophet_pred
val_residuals = val_df['y'].values - val_prophet_pred
logger.info(
"Residuals calculated",
train_residual_mean=float(np.mean(train_residuals)),
train_residual_std=float(np.std(train_residuals))
)
# Step 6: Prepare feature matrix for XGBoost
X_train = train_df[self.feature_columns].values
X_val = val_df[self.feature_columns].values
# Step 7: Train XGBoost on residuals
logger.info("Step 4: Training XGBoost on residuals")
self.xgb_model = self._train_xgboost(
X_train, train_residuals,
X_val, val_residuals
)
# Step 8: Evaluate hybrid model
logger.info("Step 5: Evaluating hybrid model performance")
metrics = self._evaluate_hybrid_model(
train_df, val_df,
train_prophet_pred, val_prophet_pred,
prophet_result
)
# Step 9: Save hybrid model
model_data = self._package_hybrid_model(
prophet_result, metrics, tenant_id, inventory_product_id
)
logger.info(
"Hybrid model training complete",
prophet_mape=metrics['prophet_val_mape'],
hybrid_mape=metrics['hybrid_val_mape'],
improvement_pct=metrics['improvement_percentage']
)
return model_data
def _prepare_xgboost_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Prepare enhanced features for XGBoost.
Args:
df: Base dataframe with 'ds', 'y' and regressor columns
Returns:
DataFrame with all enhanced features
"""
# Rename 'ds' to 'date' for feature engineering
df_prep = df.copy()
if 'ds' in df_prep.columns:
df_prep['date'] = df_prep['ds']
# Ensure 'quantity' column for feature engineering
if 'y' in df_prep.columns:
df_prep['quantity'] = df_prep['y']
# Create all enhanced features
df_enhanced = self.feature_engineer.create_all_features(
df_prep,
date_column='date',
include_lags=True,
include_rolling=True,
include_interactions=True,
include_cyclical=True
)
# Fill NA values (from lagged features at beginning)
df_enhanced = self.feature_engineer.fill_na_values(df_enhanced)
# Get feature column list (excluding target and date columns)
self.feature_columns = [
col for col in self.feature_engineer.get_feature_columns()
if col in df_enhanced.columns
]
# Also include original regressor columns if present
regressor_cols = [
col for col in df.columns
if col not in ['ds', 'y', 'date', 'quantity'] and col in df_enhanced.columns
]
self.feature_columns.extend(regressor_cols)
self.feature_columns = list(dict.fromkeys(self.feature_columns)) # Remove duplicates, preserving order (stable order matters at predict time)
logger.info(f"Prepared {len(self.feature_columns)} features for XGBoost")
return df_enhanced
def _get_prophet_predictions(
self,
prophet_result: Dict[str, Any],
df: pd.DataFrame
) -> np.ndarray:
"""
Get Prophet predictions for given dataframe.
Args:
prophet_result: Prophet model result from training
df: DataFrame with 'ds' column
Returns:
Array of predictions
"""
# Get the Prophet model from result
prophet_model = prophet_result.get('model')
if prophet_model is None:
raise ValueError("Prophet model not found in result")
# Prepare dataframe for prediction
pred_df = df[['ds']].copy()
# Add regressors if present
regressor_cols = [col for col in df.columns if col not in ['ds', 'y', 'date', 'quantity']]
for col in regressor_cols:
if col in df.columns:
pred_df[col] = df[col]
# Get predictions
forecast = prophet_model.predict(pred_df)
return forecast['yhat'].values
def _train_xgboost(
self,
X_train: np.ndarray,
y_train: np.ndarray,
X_val: np.ndarray,
y_val: np.ndarray
) -> xgb.XGBRegressor:
"""
Train XGBoost model on residuals.
Args:
X_train: Training features
y_train: Training residuals
X_val: Validation features
y_val: Validation residuals
Returns:
Trained XGBoost model
"""
# XGBoost parameters optimized for residual learning
params = {
'n_estimators': 100,
'max_depth': 3, # Shallow trees to prevent overfitting
'learning_rate': 0.1,
'subsample': 0.8,
'colsample_bytree': 0.8,
'min_child_weight': 3,
'reg_alpha': 0.1, # L1 regularization
'reg_lambda': 1.0, # L2 regularization
'objective': 'reg:squarederror',
'random_state': 42,
'n_jobs': -1
}
# Initialize model; early_stopping_rounds is passed to the constructor
# (accepted since XGBoost 1.6, removed from fit() in 2.0)
model = xgb.XGBRegressor(early_stopping_rounds=10, **params)
# Train with early stopping on the validation set
model.fit(
X_train, y_train,
eval_set=[(X_val, y_val)],
verbose=False
)
logger.info(
"XGBoost training complete",
best_iteration=model.best_iteration if hasattr(model, 'best_iteration') else None
)
return model
def _evaluate_hybrid_model(
self,
train_df: pd.DataFrame,
val_df: pd.DataFrame,
train_prophet_pred: np.ndarray,
val_prophet_pred: np.ndarray,
prophet_result: Dict[str, Any]
) -> Dict[str, float]:
"""
Evaluate hybrid model vs Prophet-only on validation set.
Args:
train_df: Training data
val_df: Validation data
train_prophet_pred: Prophet predictions on training set
val_prophet_pred: Prophet predictions on validation set
prophet_result: Prophet training result
Returns:
Dictionary of metrics
"""
# Get actual values
train_actual = train_df['y'].values
val_actual = val_df['y'].values
# Get XGBoost predictions on residuals
X_train = train_df[self.feature_columns].values
X_val = val_df[self.feature_columns].values
train_xgb_pred = self.xgb_model.predict(X_train)
val_xgb_pred = self.xgb_model.predict(X_val)
# Hybrid predictions = Prophet + XGBoost residual correction
train_hybrid_pred = train_prophet_pred + train_xgb_pred
val_hybrid_pred = val_prophet_pred + val_xgb_pred
# Calculate metrics for Prophet-only
prophet_train_mae = mean_absolute_error(train_actual, train_prophet_pred)
prophet_val_mae = mean_absolute_error(val_actual, val_prophet_pred)
prophet_train_mape = mean_absolute_percentage_error(train_actual, train_prophet_pred) * 100
prophet_val_mape = mean_absolute_percentage_error(val_actual, val_prophet_pred) * 100
# Calculate metrics for Hybrid
hybrid_train_mae = mean_absolute_error(train_actual, train_hybrid_pred)
hybrid_val_mae = mean_absolute_error(val_actual, val_hybrid_pred)
hybrid_train_mape = mean_absolute_percentage_error(train_actual, train_hybrid_pred) * 100
hybrid_val_mape = mean_absolute_percentage_error(val_actual, val_hybrid_pred) * 100
# Calculate improvement
mae_improvement = ((prophet_val_mae - hybrid_val_mae) / prophet_val_mae) * 100
mape_improvement = ((prophet_val_mape - hybrid_val_mape) / prophet_val_mape) * 100
metrics = {
'prophet_train_mae': float(prophet_train_mae),
'prophet_val_mae': float(prophet_val_mae),
'prophet_train_mape': float(prophet_train_mape),
'prophet_val_mape': float(prophet_val_mape),
'hybrid_train_mae': float(hybrid_train_mae),
'hybrid_val_mae': float(hybrid_val_mae),
'hybrid_train_mape': float(hybrid_train_mape),
'hybrid_val_mape': float(hybrid_val_mape),
'mae_improvement_pct': float(mae_improvement),
'mape_improvement_pct': float(mape_improvement),
'improvement_percentage': float(mape_improvement) # Primary metric
}
return metrics
def _package_hybrid_model(
self,
prophet_result: Dict[str, Any],
metrics: Dict[str, float],
tenant_id: str,
inventory_product_id: str
) -> Dict[str, Any]:
"""
Package hybrid model for storage.
Args:
prophet_result: Prophet model result
metrics: Hybrid model metrics
tenant_id: Tenant ID
inventory_product_id: Product ID
Returns:
Model package dictionary
"""
return {
'model_type': 'hybrid_prophet_xgboost',
'prophet_model': prophet_result.get('model'),
'xgboost_model': self.xgb_model,
'feature_columns': self.feature_columns,
'prophet_metrics': {
'train_mae': metrics['prophet_train_mae'],
'val_mae': metrics['prophet_val_mae'],
'train_mape': metrics['prophet_train_mape'],
'val_mape': metrics['prophet_val_mape']
},
'hybrid_metrics': {
'train_mae': metrics['hybrid_train_mae'],
'val_mae': metrics['hybrid_val_mae'],
'train_mape': metrics['hybrid_train_mape'],
'val_mape': metrics['hybrid_val_mape']
},
'improvement_metrics': {
'mae_improvement_pct': metrics['mae_improvement_pct'],
'mape_improvement_pct': metrics['mape_improvement_pct']
},
'tenant_id': tenant_id,
'inventory_product_id': inventory_product_id,
'trained_at': datetime.utcnow().isoformat()
}
async def predict(
self,
future_df: pd.DataFrame,
model_data: Dict[str, Any]
) -> pd.DataFrame:
"""
Make predictions using hybrid model.
Args:
future_df: DataFrame with future dates and regressors
model_data: Loaded hybrid model data
Returns:
DataFrame with predictions
"""
# Step 1: Get Prophet predictions
prophet_model = model_data['prophet_model']
prophet_forecast = prophet_model.predict(future_df)
# Step 2: Prepare features for XGBoost
# Note: assumes future_df carries a 'y'/'quantity' column (e.g. seeded from
# recent history) so that lagged and rolling features can be computed
future_enhanced = self._prepare_xgboost_features(future_df)
# Step 3: Get XGBoost predictions
xgb_model = model_data['xgboost_model']
feature_columns = model_data['feature_columns']
X_future = future_enhanced[feature_columns].values
xgb_pred = xgb_model.predict(X_future)
# Step 4: Combine predictions
hybrid_pred = prophet_forecast['yhat'].values + xgb_pred
# Step 5: Create result dataframe
result = pd.DataFrame({
'ds': future_df['ds'],
'prophet_yhat': prophet_forecast['yhat'],
'xgb_adjustment': xgb_pred,
'yhat': hybrid_pred,
'yhat_lower': prophet_forecast['yhat_lower'] + xgb_pred,
'yhat_upper': prophet_forecast['yhat_upper'] + xgb_pred
})
return result

View File

@@ -0,0 +1,242 @@
"""
Model Selection System
Determines whether to use Prophet-only or Hybrid Prophet+XGBoost models
"""
import pandas as pd
import numpy as np
from typing import Dict, Any, Optional
import structlog
logger = structlog.get_logger()
class ModelSelector:
"""
Intelligent model selection based on data characteristics.
Decision Criteria:
- Data size: Hybrid needs more data (min 90 days)
- Complexity: High variance benefits from XGBoost
- Seasonality strength: Weak seasonality benefits from XGBoost
- Historical performance: Compare models on validation set
"""
# Thresholds for model selection
MIN_DATA_POINTS_HYBRID = 90 # Minimum data points for hybrid
HIGH_VARIANCE_THRESHOLD = 0.5 # CV > 0.5 suggests complex patterns
LOW_SEASONALITY_THRESHOLD = 0.3 # Weak seasonal patterns
HYBRID_IMPROVEMENT_THRESHOLD = 0.05 # 5% MAPE improvement to justify hybrid
def __init__(self):
pass
def select_model_type(
self,
df: pd.DataFrame,
product_category: str = "unknown",
force_prophet: bool = False,
force_hybrid: bool = False
) -> str:
"""
Select best model type based on data characteristics.
Args:
df: Training data with 'y' column
product_category: Product category (bread, pastries, etc.)
force_prophet: Force Prophet-only model
force_hybrid: Force hybrid model
Returns:
"prophet" or "hybrid"
"""
# Honor forced selections
if force_prophet:
logger.info("Prophet-only model forced by configuration")
return "prophet"
if force_hybrid:
logger.info("Hybrid model forced by configuration")
return "hybrid"
# Check minimum data requirements
if len(df) < self.MIN_DATA_POINTS_HYBRID:
logger.info(
"Insufficient data for hybrid model, using Prophet",
data_points=len(df),
min_required=self.MIN_DATA_POINTS_HYBRID
)
return "prophet"
# Calculate data characteristics
characteristics = self._analyze_data_characteristics(df)
# Decision logic
score_hybrid = 0
score_prophet = 0
# Factor 1: Data complexity (variance)
if characteristics['coefficient_of_variation'] > self.HIGH_VARIANCE_THRESHOLD:
score_hybrid += 2
logger.debug("High variance detected, favoring hybrid", cv=characteristics['coefficient_of_variation'])
else:
score_prophet += 1
# Factor 2: Seasonality strength
if characteristics['seasonality_strength'] < self.LOW_SEASONALITY_THRESHOLD:
score_hybrid += 2
logger.debug("Weak seasonality detected, favoring hybrid", strength=characteristics['seasonality_strength'])
else:
score_prophet += 1
# Factor 3: Data size (more data = better for hybrid)
if len(df) > 180:
score_hybrid += 1
elif len(df) < 120:
score_prophet += 1
# Factor 4: Product category considerations
if product_category in ['seasonal', 'cakes']:
# Event-driven products benefit from XGBoost pattern learning
score_hybrid += 1
elif product_category in ['bread', 'savory']:
# Stable products work well with Prophet
score_prophet += 1
# Factor 5: Zero ratio (sparse data)
if characteristics['zero_ratio'] > 0.3:
# High zero ratio suggests difficult forecasting, hybrid might help
score_hybrid += 1
# Make decision
selected_model = "hybrid" if score_hybrid > score_prophet else "prophet"
logger.info(
"Model selection complete",
selected_model=selected_model,
score_hybrid=score_hybrid,
score_prophet=score_prophet,
data_points=len(df),
cv=characteristics['coefficient_of_variation'],
seasonality=characteristics['seasonality_strength'],
category=product_category
)
return selected_model
def _analyze_data_characteristics(self, df: pd.DataFrame) -> Dict[str, float]:
"""
Analyze time series characteristics.
Args:
df: DataFrame with 'y' column (sales data)
Returns:
Dictionary with data characteristics
"""
y = df['y'].values
# Coefficient of variation
cv = np.std(y) / np.mean(y) if np.mean(y) > 0 else 0
# Zero ratio
zero_ratio = (y == 0).sum() / len(y)
# Seasonality strength (simple proxy using rolling std)
if len(df) >= 14:
rolling_mean = pd.Series(y).rolling(window=7, center=True).mean()
seasonality_strength = rolling_mean.std() / (np.std(y) + 1e-6) if np.std(y) > 0 else 0
else:
seasonality_strength = 0.5 # Default
# Trend strength
if len(df) >= 30:
from scipy import stats
x = np.arange(len(y))
slope, _, r_value, _, _ = stats.linregress(x, y)
trend_strength = abs(r_value)
else:
trend_strength = 0
return {
'coefficient_of_variation': float(cv),
'zero_ratio': float(zero_ratio),
'seasonality_strength': float(seasonality_strength),
'trend_strength': float(trend_strength),
'mean': float(np.mean(y)),
'std': float(np.std(y))
}
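# Worked example on a toy series (hypothetical values):
#
#     >>> y = np.array([10, 0, 12, 14, 0, 11, 13], dtype=float)
#     >>> round(float(np.std(y) / np.mean(y)), 2)    # CV ~0.65 > HIGH_VARIANCE_THRESHOLD
#     0.65
#     >>> round(float((y == 0).sum() / len(y)), 2)   # zero ratio, just under the 0.3 cutoff
#     0.29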
def compare_models(
self,
prophet_metrics: Dict[str, float],
hybrid_metrics: Dict[str, float]
) -> str:
"""
Compare Prophet and Hybrid model performance.
Args:
prophet_metrics: Prophet model metrics (with 'mape' key)
hybrid_metrics: Hybrid model metrics (with 'mape' key)
Returns:
"prophet" or "hybrid" based on better performance
"""
prophet_mape = prophet_metrics.get('mape', float('inf'))
hybrid_mape = hybrid_metrics.get('mape', float('inf'))
# Calculate improvement
if prophet_mape > 0:
improvement = (prophet_mape - hybrid_mape) / prophet_mape
else:
improvement = 0
# Hybrid must improve by at least threshold to justify complexity
if improvement >= self.HYBRID_IMPROVEMENT_THRESHOLD:
logger.info(
"Hybrid model selected based on performance",
prophet_mape=prophet_mape,
hybrid_mape=hybrid_mape,
improvement=f"{improvement*100:.1f}%"
)
return "hybrid"
else:
logger.info(
"Prophet model selected (hybrid improvement insufficient)",
prophet_mape=prophet_mape,
hybrid_mape=hybrid_mape,
improvement=f"{improvement*100:.1f}%"
)
return "prophet"
def should_use_hybrid_model(
df: pd.DataFrame,
product_category: str = "unknown",
tenant_settings: Dict[str, Any] = None
) -> bool:
"""
Convenience function to determine if hybrid model should be used.
Args:
df: Training data
product_category: Product category
tenant_settings: Optional tenant-specific settings
Returns:
True if hybrid model should be used, False otherwise
"""
selector = ModelSelector()
# Check tenant settings
force_prophet = tenant_settings.get('force_prophet_only', False) if tenant_settings else False
force_hybrid = tenant_settings.get('force_hybrid', False) if tenant_settings else False
selected = selector.select_model_type(
df=df,
product_category=product_category,
force_prophet=force_prophet,
force_hybrid=force_hybrid
)
return selected == "hybrid"
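# Example call (training_df is a hypothetical DataFrame; the tenant_settings
# keys are the flags read above):
#
#     use_hybrid = should_use_hybrid_model(
#         df=training_df,                 # >= 90 rows with a 'y' column
#         product_category='cakes',
#         tenant_settings={'force_hybrid': False}
#     )
#     model_type = 'hybrid' if use_hybrid else 'prophet'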

View File

@@ -0,0 +1,361 @@
"""
Product Categorization System
Classifies bakery products into categories for category-specific forecasting
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Tuple
from enum import Enum
import structlog
logger = structlog.get_logger()
class ProductCategory(str, Enum):
"""Product categories for bakery items"""
BREAD = "bread"
PASTRIES = "pastries"
CAKES = "cakes"
DRINKS = "drinks"
SEASONAL = "seasonal"
SAVORY = "savory"
UNKNOWN = "unknown"
class ProductCategorizer:
"""
Automatic product categorization based on product name and sales patterns.
Categories have different characteristics:
- BREAD: Daily staple, high volume, consistent demand, short shelf life (1 day)
- PASTRIES: Morning peak, weekend boost, medium shelf life (2-3 days)
- CAKES: Event-driven, weekends, advance orders, longer shelf life (3-5 days)
- DRINKS: Weather-dependent, hot/cold seasonal patterns
- SEASONAL: Holiday-specific (roscón, panettone, etc.)
- SAVORY: Lunch peak, weekday focus
"""
def __init__(self):
# Keywords for automatic classification
self.category_keywords = {
ProductCategory.BREAD: [
'pan', 'baguette', 'hogaza', 'chapata', 'integral', 'centeno',
'bread', 'loaf', 'barra', 'molde', 'candeal'
],
ProductCategory.PASTRIES: [
'croissant', 'napolitana', 'palmera', 'ensaimada', 'magdalena',
'bollo', 'brioche', 'suizo', 'caracola', 'donut', 'berlina'
],
ProductCategory.CAKES: [
'tarta', 'pastel', 'bizcocho', 'cake', 'torta', 'milhojas',
'saint honoré', 'selva negra', 'tres leches'
],
ProductCategory.DRINKS: [
'café', 'coffee', 'té', 'tea', 'zumo', 'juice', 'batido',
'smoothie', 'refresco', 'agua', 'water'
],
ProductCategory.SEASONAL: [
'roscón', 'panettone', 'turrón', 'polvorón', 'mona de pascua',
'huevo de pascua', 'buñuelo', 'torrija'
],
ProductCategory.SAVORY: [
'empanada', 'quiche', 'pizza', 'focaccia', 'salado', 'bocadillo',
'sandwich', 'croqueta', 'hojaldre salado'
]
}
def categorize_product(
self,
product_name: str,
product_id: str = None,
sales_data: pd.DataFrame = None
) -> ProductCategory:
"""
Categorize a product based on name and optional sales patterns.
Args:
product_name: Product name
product_id: Optional product ID
sales_data: Optional historical sales data for pattern analysis
Returns:
ProductCategory enum
"""
# First try keyword matching
category = self._categorize_by_keywords(product_name)
if category != ProductCategory.UNKNOWN:
logger.info(f"Product categorized by keywords",
product=product_name,
category=category.value)
return category
# If no keyword match and we have sales data, analyze patterns
if sales_data is not None and len(sales_data) > 30:
category = self._categorize_by_sales_pattern(product_name, sales_data)
logger.info(f"Product categorized by sales pattern",
product=product_name,
category=category.value)
return category
logger.warning(f"Could not categorize product, using UNKNOWN",
product=product_name)
return ProductCategory.UNKNOWN
def _categorize_by_keywords(self, product_name: str) -> ProductCategory:
"""Categorize by matching keywords in product name"""
product_name_lower = product_name.lower()
# Check each category's keywords
for category, keywords in self.category_keywords.items():
for keyword in keywords:
if keyword in product_name_lower:
return category
return ProductCategory.UNKNOWN
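# Keyword-matching sketch (product names are illustrative; keywords come from
# the lists above):
#
#     >>> c = ProductCategorizer()
#     >>> c.categorize_product('Barra de pan').value
#     'bread'
#     >>> c.categorize_product('Croissant de mantequilla').value
#     'pastries'
#     >>> c.categorize_product('Roscón de Reyes').value
#     'seasonal'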
def _categorize_by_sales_pattern(
self,
product_name: str,
sales_data: pd.DataFrame
) -> ProductCategory:
"""
Categorize by analyzing sales patterns.
Patterns:
- BREAD: Consistent daily sales, low variance
- PASTRIES: Weekend boost, morning peak
- CAKES: Weekend spike, event correlation
- DRINKS: Temperature correlation
- SEASONAL: Concentrated in specific months
- SAVORY: Weekday focus, lunch peak
"""
try:
# Ensure we have required columns
if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
return ProductCategory.UNKNOWN
sales_data = sales_data.copy()
sales_data['date'] = pd.to_datetime(sales_data['date'])
sales_data['day_of_week'] = sales_data['date'].dt.dayofweek
sales_data['month'] = sales_data['date'].dt.month
sales_data['is_weekend'] = sales_data['day_of_week'].isin([5, 6])
# Calculate pattern metrics
weekend_avg = sales_data[sales_data['is_weekend']]['quantity'].mean()
weekday_avg = sales_data[~sales_data['is_weekend']]['quantity'].mean()
overall_avg = sales_data['quantity'].mean()
cv = sales_data['quantity'].std() / overall_avg if overall_avg > 0 else 0
# Weekend ratio
weekend_ratio = weekend_avg / weekday_avg if weekday_avg > 0 else 1.0
# Seasonal concentration (Gini coefficient for months)
monthly_sales = sales_data.groupby('month')['quantity'].sum()
seasonal_concentration = self._gini_coefficient(monthly_sales.values)
# Decision rules based on patterns
if seasonal_concentration > 0.6:
# High concentration in specific months = seasonal
return ProductCategory.SEASONAL
elif cv < 0.3 and weekend_ratio < 1.2:
# Low variance, consistent daily = bread
return ProductCategory.BREAD
elif weekend_ratio > 1.5:
# Strong weekend boost = cakes
return ProductCategory.CAKES
elif weekend_ratio > 1.2:
# Moderate weekend boost = pastries
return ProductCategory.PASTRIES
elif weekend_ratio < 0.9:
# Weekday focus = savory
return ProductCategory.SAVORY
else:
return ProductCategory.UNKNOWN
except Exception as e:
logger.error(f"Error analyzing sales pattern: {e}")
return ProductCategory.UNKNOWN
def _gini_coefficient(self, values: np.ndarray) -> float:
"""Calculate Gini coefficient for concentration measurement"""
if len(values) == 0:
return 0.0
sorted_values = np.sort(values)
n = len(values)
cumsum = np.cumsum(sorted_values)
if cumsum[-1] == 0:
return 0.0  # All-zero sales: no concentration to measure, avoids division by zero
# Gini coefficient formula
return (2 * np.sum((np.arange(1, n + 1) * sorted_values))) / (n * cumsum[-1]) - (n + 1) / n
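# Worked example (illustrative numbers): monthly totals [0]*11 + [120], i.e. all
# sales concentrated in December as for a roscón, give a Gini of
# 2*(12*120)/(12*120) - 13/12 ≈ 0.92, well above the 0.6 SEASONAL threshold used
# above, while a perfectly flat [10]*12 profile yields exactly 0.0.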
def get_category_characteristics(self, category: ProductCategory) -> Dict[str, Any]:
"""
Get forecasting characteristics for a category.
Returns hyperparameters and settings specific to the category.
"""
characteristics = {
ProductCategory.BREAD: {
"shelf_life_days": 1,
"demand_stability": "high",
"seasonality_strength": "low",
"weekend_factor": 0.95, # Slightly lower on weekends
"holiday_factor": 0.7, # Much lower on holidays
"weather_sensitivity": "low",
"prophet_params": {
"seasonality_mode": "additive",
"yearly_seasonality": False,
"weekly_seasonality": True,
"daily_seasonality": False,
"changepoint_prior_scale": 0.01, # Very stable
"seasonality_prior_scale": 5.0
}
},
ProductCategory.PASTRIES: {
"shelf_life_days": 2,
"demand_stability": "medium",
"seasonality_strength": "medium",
"weekend_factor": 1.3, # Boost on weekends
"holiday_factor": 1.1, # Slight boost on holidays
"weather_sensitivity": "medium",
"prophet_params": {
"seasonality_mode": "multiplicative",
"yearly_seasonality": True,
"weekly_seasonality": True,
"daily_seasonality": False,
"changepoint_prior_scale": 0.05,
"seasonality_prior_scale": 10.0
}
},
ProductCategory.CAKES: {
"shelf_life_days": 4,
"demand_stability": "low",
"seasonality_strength": "high",
"weekend_factor": 2.0, # Large weekend boost
"holiday_factor": 1.5, # Holiday boost
"weather_sensitivity": "low",
"prophet_params": {
"seasonality_mode": "multiplicative",
"yearly_seasonality": True,
"weekly_seasonality": True,
"daily_seasonality": False,
"changepoint_prior_scale": 0.1, # More flexible
"seasonality_prior_scale": 15.0
}
},
ProductCategory.DRINKS: {
"shelf_life_days": 1,
"demand_stability": "medium",
"seasonality_strength": "high",
"weekend_factor": 1.1,
"holiday_factor": 1.2,
"weather_sensitivity": "very_high",
"prophet_params": {
"seasonality_mode": "multiplicative",
"yearly_seasonality": True,
"weekly_seasonality": True,
"daily_seasonality": False,
"changepoint_prior_scale": 0.08,
"seasonality_prior_scale": 12.0
}
},
ProductCategory.SEASONAL: {
"shelf_life_days": 7,
"demand_stability": "very_low",
"seasonality_strength": "very_high",
"weekend_factor": 1.2,
"holiday_factor": 3.0, # Massive holiday boost
"weather_sensitivity": "low",
"prophet_params": {
"seasonality_mode": "multiplicative",
"yearly_seasonality": True,
"weekly_seasonality": False,
"daily_seasonality": False,
"changepoint_prior_scale": 0.2, # Very flexible
"seasonality_prior_scale": 20.0
}
},
ProductCategory.SAVORY: {
"shelf_life_days": 1,
"demand_stability": "medium",
"seasonality_strength": "low",
"weekend_factor": 0.8, # Lower on weekends
"holiday_factor": 0.6, # Much lower on holidays
"weather_sensitivity": "medium",
"prophet_params": {
"seasonality_mode": "additive",
"yearly_seasonality": False,
"weekly_seasonality": True,
"daily_seasonality": False,
"changepoint_prior_scale": 0.03,
"seasonality_prior_scale": 7.0
}
},
ProductCategory.UNKNOWN: {
"shelf_life_days": 2,
"demand_stability": "medium",
"seasonality_strength": "medium",
"weekend_factor": 1.0,
"holiday_factor": 1.0,
"weather_sensitivity": "medium",
"prophet_params": {
"seasonality_mode": "multiplicative",
"yearly_seasonality": True,
"weekly_seasonality": True,
"daily_seasonality": False,
"changepoint_prior_scale": 0.05,
"seasonality_prior_scale": 10.0
}
}
}
return characteristics.get(category, characteristics[ProductCategory.UNKNOWN])
def batch_categorize(
self,
products: List[Dict[str, Any]],
sales_data: pd.DataFrame = None
) -> Dict[str, ProductCategory]:
"""
Categorize multiple products at once.
Args:
products: List of dicts with 'id' and 'name' keys
sales_data: Optional sales data with 'inventory_product_id' column
Returns:
Dict mapping product_id to category
"""
results = {}
for product in products:
product_id = product.get('id')
product_name = product.get('name', '')
# Filter sales data for this product if available
product_sales = None
if sales_data is not None and 'inventory_product_id' in sales_data.columns:
product_sales = sales_data[
sales_data['inventory_product_id'] == product_id
].copy()
category = self.categorize_product(
product_name=product_name,
product_id=product_id,
sales_data=product_sales
)
results[product_id] = category
logger.info(f"Batch categorization complete",
total_products=len(products),
categories=dict(pd.Series(list(results.values())).value_counts()))
return results
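# Illustrative usage sketch (product names are hypothetical; categories and
# thresholds as defined above):
if __name__ == "__main__":
    categorizer = ProductCategorizer()
    assert categorizer.categorize_product("Barra de pan candeal") == ProductCategory.BREAD
    assert categorizer.categorize_product("Croissant de mantequilla") == ProductCategory.PASTRIES
    # Names with no keyword match and no sales history fall through to UNKNOWN
    assert categorizer.categorize_product("Producto misterioso") == ProductCategory.UNKNOWN
    # Category characteristics feed the Prophet configuration downstream
    bread_params = categorizer.get_category_characteristics(ProductCategory.BREAD)["prophet_params"]
    assert bread_params["changepoint_prior_scale"] == 0.01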

View File

@@ -19,6 +19,8 @@ import json
from pathlib import Path
import math
import warnings
import shutil
import errno
warnings.filterwarnings('ignore')
from sqlalchemy.ext.asyncio import AsyncSession
@@ -39,6 +41,38 @@ from app.utils.distributed_lock import get_training_lock, LockAcquisitionError
logger = logging.getLogger(__name__)
def check_disk_space(path='/tmp', min_free_gb=1.0):
"""
Check if there's enough disk space available.
Args:
path: Path to check disk space for
min_free_gb: Minimum required free space in GB
Returns:
tuple: (bool: has_space, float: free_gb, float: total_gb, float: used_percent)
"""
try:
stat = shutil.disk_usage(path)
total_gb = stat.total / (1024**3)
free_gb = stat.free / (1024**3)
used_gb = stat.used / (1024**3)
used_percent = (stat.used / stat.total) * 100
has_space = free_gb >= min_free_gb
logger.info(f"Disk space check for {path}: "
f"total={total_gb:.2f}GB, free={free_gb:.2f}GB, "
f"used={used_gb:.2f}GB ({used_percent:.1f}%)")
if used_percent > 85:
logger.warning(f"Disk usage is high: {used_percent:.1f}% - this may cause issues")
return has_space, free_gb, total_gb, used_percent
except Exception as e:
logger.error(f"Failed to check disk space: {e}")
return True, 0, 0, 0 # Assume OK if we can't check
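# Example guard (mirrors the pre-flight check used in train_bakery_model below):
#   has_space, free_gb, _, _ = check_disk_space('/tmp', min_free_gb=0.5)
#   if not has_space:
#       raise RuntimeError(f"Only {free_gb:.2f}GB free in /tmp")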
class BakeryProphetManager:
"""
Simplified Prophet Manager with built-in hyperparameter optimization.
@@ -58,10 +92,27 @@ class BakeryProphetManager:
tenant_id: str,
inventory_product_id: str,
df: pd.DataFrame,
job_id: str) -> Dict[str, Any]:
job_id: str,
product_category: 'ProductCategory' = None,
category_hyperparameters: Dict[str, Any] = None) -> Dict[str, Any]:
"""
Train a Prophet model with automatic hyperparameter optimization and distributed locking.
Args:
tenant_id: Tenant identifier
inventory_product_id: Product identifier
df: Training data DataFrame
job_id: Training job identifier
product_category: Optional product category for category-specific settings
category_hyperparameters: Optional category-specific Prophet hyperparameters
"""
# Check disk space before starting training
has_space, free_gb, total_gb, used_percent = check_disk_space('/tmp', min_free_gb=0.5)
if not has_space:
error_msg = f"Insufficient disk space: {free_gb:.2f}GB free ({used_percent:.1f}% used). Need at least 0.5GB free."
logger.error(error_msg)
raise RuntimeError(error_msg)
# Acquire distributed lock to prevent concurrent training of same product
lock = get_training_lock(tenant_id, inventory_product_id, use_advisory=True)
@@ -79,9 +130,33 @@ class BakeryProphetManager:
# Get regressor columns
regressor_columns = self._extract_regressor_columns(prophet_data)
# Use category-specific hyperparameters if provided, otherwise optimize
if category_hyperparameters:
logger.info(f"Using category-specific hyperparameters for {inventory_product_id} (category: {product_category.value if product_category else 'unknown'})")
best_params = category_hyperparameters.copy()
use_optimized = False # Not optimized, but category-specific
else:
# Automatically optimize hyperparameters
logger.info(f"Optimizing hyperparameters for {inventory_product_id}...")
try:
best_params = await self._optimize_hyperparameters(prophet_data, inventory_product_id, regressor_columns)
use_optimized = True
except Exception as opt_error:
logger.warning(f"Hyperparameter optimization failed for {inventory_product_id}: {opt_error}")
logger.warning("Falling back to default Prophet parameters")
# Use conservative default parameters
best_params = {
'changepoint_prior_scale': 0.05,
'seasonality_prior_scale': 10.0,
'holidays_prior_scale': 10.0,
'changepoint_range': 0.8,
'seasonality_mode': 'additive',
'daily_seasonality': False,
'weekly_seasonality': True,
'yearly_seasonality': len(prophet_data) > 365,
'uncertainty_samples': 0 # Disable uncertainty sampling to avoid cmdstan
}
use_optimized = False
# Create optimized Prophet model
model = self._create_optimized_prophet_model(best_params, regressor_columns)
@@ -91,8 +166,38 @@ class BakeryProphetManager:
if regressor in prophet_data.columns:
model.add_regressor(regressor)
# Set environment variable for cmdstan tmp directory
import os
tmpdir = os.environ.get('TMPDIR', '/tmp/cmdstan')
os.makedirs(tmpdir, mode=0o777, exist_ok=True)
os.environ['TMPDIR'] = tmpdir
# Verify tmp directory is writable
test_file = os.path.join(tmpdir, f'test_write_{inventory_product_id}.tmp')
try:
with open(test_file, 'w') as f:
f.write('test')
os.remove(test_file)
logger.debug(f"Verified {tmpdir} is writable")
except Exception as e:
logger.error(f"TMPDIR {tmpdir} is not writable: {e}")
raise RuntimeError(f"Cannot write to {tmpdir}: {e}")
# Fit the model with enhanced error handling
try:
logger.info(f"Starting Prophet model fit for {inventory_product_id}")
model.fit(prophet_data)
logger.info(f"Prophet model fit completed successfully for {inventory_product_id}")
except Exception as fit_error:
error_details = {
'error_type': type(fit_error).__name__,
'error_message': str(fit_error),
'errno': getattr(fit_error, 'errno', None),
'tmpdir': tmpdir,
'disk_space': check_disk_space(tmpdir, 0)
}
logger.error(f"Prophet model fit failed for {inventory_product_id}: {error_details}")
raise RuntimeError(f"Prophet training failed: {error_details['error_message']}") from fit_error
# Calculate enhanced training metrics first
training_metrics = await self._calculate_training_metrics(model, prophet_data, best_params)
@@ -104,18 +209,39 @@ class BakeryProphetManager:
)
# Return same format as before, but with optimization info
# Ensure hyperparameters are JSON-serializable
def _serialize_hyperparameters(params):
"""Helper to ensure hyperparameters are JSON serializable"""
if not params:
return {}
safe_params = {}
for k, v in params.items():
try:
if isinstance(v, (int, float, str, bool, type(None))):
safe_params[k] = v
elif hasattr(v, 'item'): # numpy scalars
safe_params[k] = v.item()
elif isinstance(v, (list, tuple)):
safe_params[k] = [x.item() if hasattr(x, 'item') else x for x in v]
else:
safe_params[k] = float(v) if isinstance(v, (np.integer, np.floating)) else str(v)
except Exception:
safe_params[k] = str(v)  # fallback to string conversion
return safe_params
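# Example: {'changepoint_prior_scale': np.float64(0.05), 'yearly_seasonality': np.bool_(True)}
# serializes to {'changepoint_prior_scale': 0.05, 'yearly_seasonality': True} via .item(),
# which the JSON column type accepts without a custom encoder.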
model_info = {
"model_id": model_id,
"model_path": model_path,
"type": "prophet_optimized",
"training_samples": len(prophet_data),
"features": regressor_columns,
"hyperparameters": best_params,
"hyperparameters": _serialize_hyperparameters(best_params),
"training_metrics": training_metrics,
"product_category": product_category.value if product_category else "unknown",
"trained_at": datetime.now().isoformat(),
"data_period": {
"start_date": prophet_data['ds'].min().isoformat(),
"end_date": prophet_data['ds'].max().isoformat(),
"start_date": pd.Timestamp(prophet_data['ds'].min()).isoformat(),
"end_date": pd.Timestamp(prophet_data['ds'].max()).isoformat(),
"total_days": len(prophet_data)
}
}
@@ -238,7 +364,7 @@ class BakeryProphetManager:
'daily_seasonality': trial.suggest_categorical('daily_seasonality', [True, False]),
'weekly_seasonality': True, # Always keep weekly
'yearly_seasonality': trial.suggest_categorical('yearly_seasonality', [True, False]),
'uncertainty_samples': int(trial.suggest_int('uncertainty_samples', int(uncertainty_range[0]), int(uncertainty_range[1]))) # ✅ FIX: Explicit int casting for all values
}
# Simple 2-fold cross-validation for speed
@@ -254,17 +380,32 @@ class BakeryProphetManager:
try:
# Create and train model with adaptive uncertainty sampling
uncertainty_samples = int(params.get('uncertainty_samples', 200)) # ✅ FIX: Explicit int casting to prevent type errors
# Set environment variable for cmdstan tmp directory
import os
tmpdir = os.environ.get('TMPDIR', '/tmp/cmdstan')
os.makedirs(tmpdir, mode=0o777, exist_ok=True)
os.environ['TMPDIR'] = tmpdir
model = Prophet(**{k: v for k, v in params.items() if k != 'uncertainty_samples'},
interval_width=0.8, uncertainty_samples=uncertainty_samples)
for regressor in regressor_columns:
if regressor in train_data.columns:
model.add_regressor(regressor)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
try:
model.fit(train_data)
except OSError as e:
# Log errno for "Operation not permitted" errors
if e.errno == errno.EPERM:
logger.error(f"Permission denied during Prophet fit (errno={e.errno}): {e}")
logger.error(f"TMPDIR: {tmpdir}, exists: {os.path.exists(tmpdir)}, "
f"writable: {os.access(tmpdir, os.W_OK)}")
raise
# Predict on validation set
future_df = model.make_future_dataframe(periods=0)
@@ -317,9 +458,9 @@ class BakeryProphetManager:
logger.info(f"Optimization completed for {inventory_product_id}. Best score: {best_score:.2f}%. "
f"Parameters: {best_params}")
# ✅ FIX: Log uncertainty sampling configuration for debugging confidence intervals with explicit int casting
uncertainty_samples = int(best_params.get('uncertainty_samples', 500))
logger.info(f"Prophet model will use {uncertainty_samples} uncertainty samples for {inventory_product_id} "
f"(category: {product_category}, zero_ratio: {zero_ratio:.2f})")
@@ -363,25 +504,43 @@ class BakeryProphetManager:
def _create_optimized_prophet_model(self, optimized_params: Dict[str, Any], regressor_columns: List[str]) -> Prophet:
"""Create Prophet model with optimized parameters and adaptive uncertainty sampling"""
holidays = self._get_spanish_holidays()
# Determine uncertainty samples based on data characteristics with explicit int casting
uncertainty_samples = int(optimized_params.get('uncertainty_samples', 500)) if optimized_params.get('uncertainty_samples') is not None else 500
# If uncertainty_samples is 0, we're in fallback mode (no cmdstan); Prophet
# requires a positive value, so use the minimum and build the model once
if uncertainty_samples == 0:
logger.info("Creating Prophet model without uncertainty sampling (fallback mode)")
uncertainty_samples = 1  # Minimum value to avoid errors
model = Prophet(
holidays=holidays if not holidays.empty else None,
daily_seasonality=optimized_params.get('daily_seasonality', True),
weekly_seasonality=optimized_params.get('weekly_seasonality', True),
yearly_seasonality=optimized_params.get('yearly_seasonality', True),
seasonality_mode=optimized_params.get('seasonality_mode', 'additive'),
changepoint_prior_scale=float(optimized_params.get('changepoint_prior_scale', 0.05)),
seasonality_prior_scale=float(optimized_params.get('seasonality_prior_scale', 10.0)),
holidays_prior_scale=float(optimized_params.get('holidays_prior_scale', 10.0)),
changepoint_range=float(optimized_params.get('changepoint_range', 0.8)),
interval_width=0.8,
mcmc_samples=0,
uncertainty_samples=uncertainty_samples
)
return model
# All the existing methods remain the same, just with enhanced metrics
@@ -539,8 +698,8 @@ class BakeryProphetManager:
"regressor_columns": regressor_columns,
"training_samples": len(training_data),
"data_period": {
"start_date": training_data['ds'].min().isoformat(),
"end_date": training_data['ds'].max().isoformat()
"start_date": pd.Timestamp(training_data['ds'].min()).isoformat(),
"end_date": pd.Timestamp(training_data['ds'].max()).isoformat()
},
"optimized": True,
"optimized_parameters": optimized_params or {},
@@ -566,6 +725,25 @@ class BakeryProphetManager:
# Deactivate previous models for this product
await self._deactivate_previous_models_with_session(db_session, tenant_id, inventory_product_id)
# Helper to ensure hyperparameters are JSON serializable
def _serialize_hyperparameters(params):
if not params:
return {}
safe_params = {}
for k, v in params.items():
try:
if isinstance(v, (int, float, str, bool, type(None))):
safe_params[k] = v
elif hasattr(v, 'item'): # numpy scalars
safe_params[k] = v.item()
elif isinstance(v, (list, tuple)):
safe_params[k] = [x.item() if hasattr(x, 'item') else x for x in v]
else:
safe_params[k] = float(v) if isinstance(v, (np.integer, np.floating)) else str(v)
except Exception:
safe_params[k] = str(v)  # fallback to string conversion
return safe_params
# Create new database record
db_model = TrainedModel(
id=model_id,
@@ -575,22 +753,22 @@ class BakeryProphetManager:
job_id=model_id.split('_')[0], # Extract job_id from model_id
model_path=str(model_path),
metadata_path=str(metadata_path),
hyperparameters=_serialize_hyperparameters(optimized_params or {}),
features_used=[str(f) for f in regressor_columns] if regressor_columns else [],
is_active=True,
is_production=True, # New models are production-ready
training_start_date=pd.Timestamp(training_data['ds'].min()).to_pydatetime().replace(tzinfo=None),
training_end_date=pd.Timestamp(training_data['ds'].max()).to_pydatetime().replace(tzinfo=None),
training_samples=len(training_data)
)
# Add training metrics if available
if training_metrics:
db_model.mape = float(training_metrics.get('mape')) if training_metrics.get('mape') is not None else None
db_model.mae = float(training_metrics.get('mae')) if training_metrics.get('mae') is not None else None
db_model.rmse = float(training_metrics.get('rmse')) if training_metrics.get('rmse') is not None else None
db_model.r2_score = float(training_metrics.get('r2')) if training_metrics.get('r2') is not None else None
db_model.data_quality_score = float(training_metrics.get('data_quality_score')) if training_metrics.get('data_quality_score') is not None else None
db_session.add(db_model)
await db_session.commit()
@@ -698,7 +876,7 @@ class BakeryProphetManager:
# Ensure y values are non-negative
prophet_data['y'] = prophet_data['y'].clip(lower=0)
logger.info(f"Prepared Prophet data: {len(prophet_data)} rows, date range: {prophet_data['ds'].min()} to {prophet_data['ds'].max()}")
logger.info(f"Prepared Prophet data: {len(prophet_data)} rows, date range: {pd.Timestamp(prophet_data['ds'].min())} to {pd.Timestamp(prophet_data['ds'].max())}")
return prophet_data
@@ -714,12 +892,69 @@ class BakeryProphetManager:
logger.info(f"Identified regressor columns: {regressor_columns}")
return regressor_columns
def _get_spanish_holidays(self, region: str = None) -> pd.DataFrame:
"""
Get Spanish holidays dynamically using holidays library.
Supports national and regional holidays, including dynamic Easter calculation.
Args:
region: Region code (e.g., 'MD' for Madrid, 'PV' for Basque Country)
Returns:
DataFrame with holiday dates and names
"""
try:
import holidays
holidays_list = []
years = range(2020, 2035) # Extended range for better coverage
# Get Spanish holidays for each year
for year in years:
# National holidays
spain_holidays = holidays.Spain(years=year, prov=region)
for date, name in spain_holidays.items():
holidays_list.append({
'holiday': self._normalize_holiday_name(name),
'ds': pd.Timestamp(date),
'lower_window': 0,
'upper_window': 0 # Can be adjusted for multi-day holidays
})
if holidays_list:
holidays_df = pd.DataFrame(holidays_list)
# Remove duplicates (some holidays may repeat)
holidays_df = holidays_df.drop_duplicates(subset=['ds', 'holiday'])
holidays_df = holidays_df.sort_values('ds').reset_index(drop=True)
logger.info(f"Loaded {len(holidays_df)} Spanish holidays dynamically",
region=region or 'National',
years=f"{min(years)}-{max(years)}")
return holidays_df
else:
return pd.DataFrame()
except Exception as e:
logger.warning(f"Could not load Spanish holidays dynamically: {str(e)}")
# Fallback to minimal hardcoded holidays
return self._get_fallback_holidays()
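# Example rows for prov='MD' (names as returned by the holidays library, before
# the normalization below): 2024-01-01 "Año Nuevo", 2024-03-28 "Jueves Santo".
# Movable feasts tied to Easter are computed per year, which is the main
# advantage over the hardcoded fallback list.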
def _normalize_holiday_name(self, name: str) -> str:
"""Normalize holiday name to a consistent format for Prophet"""
# Convert to lowercase and replace spaces with underscores
normalized = name.lower().replace(' ', '_').replace("'", '')
# Remove special characters
normalized = ''.join(c for c in normalized if c.isalnum() or c == '_')
return normalized
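# Example: "Año Nuevo" -> "año_nuevo"; "Dia de l'Ascensió" -> "dia_de_lascensió"
# (accented letters count as alphanumeric in Python, so they are preserved).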
def _get_fallback_holidays(self) -> pd.DataFrame:
"""Fallback to basic hardcoded holidays if dynamic loading fails"""
try:
holidays_list = []
years = range(2020, 2035)
for year in years:
holidays_list.extend([
{'holiday': 'new_year', 'ds': f'{year}-01-01'},
@@ -732,14 +967,10 @@ class BakeryProphetManager:
{'holiday': 'immaculate_conception', 'ds': f'{year}-12-08'},
{'holiday': 'christmas', 'ds': f'{year}-12-25'}
])
holidays_df = pd.DataFrame(holidays_list)
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
return holidays_df
except Exception as e:
logger.warning(f"Could not load Spanish holidays: {str(e)}")
logger.error(f"Fallback holidays failed: {e}")
return pd.DataFrame()

View File

@@ -0,0 +1,284 @@
"""
Traffic Forecasting System
Predicts bakery foot traffic using weather and temporal features
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional
from prophet import Prophet
import structlog
from datetime import datetime, timedelta
logger = structlog.get_logger()
class TrafficForecaster:
"""
Forecast bakery foot traffic using Prophet with weather and temporal features.
Traffic patterns are influenced by:
- Weather: Temperature, precipitation, conditions
- Time: Day of week, holidays, season
- Special events: Local events, promotions
"""
def __init__(self):
self.model = None
self.is_trained = False
def train(
self,
historical_traffic: pd.DataFrame,
weather_data: pd.DataFrame = None
) -> Dict[str, Any]:
"""
Train traffic forecasting model.
Args:
historical_traffic: DataFrame with columns ['date', 'traffic_count']
weather_data: Optional weather data with columns ['date', 'temperature', 'precipitation', 'condition']
Returns:
Training metrics
"""
try:
logger.info("Training traffic forecasting model",
data_points=len(historical_traffic))
# Prepare Prophet format
df = historical_traffic.copy()
df = df.rename(columns={'date': 'ds', 'traffic_count': 'y'})
df['ds'] = pd.to_datetime(df['ds'])
df = df.sort_values('ds')
# Merge with weather data if available
if weather_data is not None:
weather_data = weather_data.copy()
weather_data['date'] = pd.to_datetime(weather_data['date'])
df = df.merge(weather_data, left_on='ds', right_on='date', how='left')
# Create Prophet model with custom settings for traffic
self.model = Prophet(
seasonality_mode='multiplicative',
yearly_seasonality=True,
weekly_seasonality=True,
daily_seasonality=False,
changepoint_prior_scale=0.05, # Moderate flexibility
seasonality_prior_scale=10.0,
holidays_prior_scale=10.0
)
# Add weather regressors if available
if 'temperature' in df.columns:
self.model.add_regressor('temperature')
if 'precipitation' in df.columns:
self.model.add_regressor('precipitation')
if 'is_rainy' in df.columns:
self.model.add_regressor('is_rainy')
# Add Spanish national holidays via Prophet's built-in holiday support
self.model.add_country_holidays(country_name='ES')
# Fit model
self.model.fit(df)
self.is_trained = True
# Calculate training metrics
predictions = self.model.predict(df)
metrics = self._calculate_metrics(df['y'].values, predictions['yhat'].values)
logger.info("Traffic forecasting model trained successfully",
mape=metrics['mape'],
rmse=metrics['rmse'])
return metrics
except Exception as e:
logger.error(f"Failed to train traffic forecasting model: {e}")
raise
def predict(
self,
future_dates: pd.DatetimeIndex,
weather_forecast: pd.DataFrame = None
) -> pd.DataFrame:
"""
Predict traffic for future dates.
Args:
future_dates: Dates to predict traffic for
weather_forecast: Optional weather forecast data
Returns:
DataFrame with columns ['date', 'predicted_traffic', 'yhat_lower', 'yhat_upper']
"""
if not self.is_trained:
raise ValueError("Model not trained. Call train() first.")
try:
# Create future dataframe
future = pd.DataFrame({'ds': future_dates})
# Add weather features if available
if weather_forecast is not None:
weather_forecast = weather_forecast.copy()
weather_forecast['date'] = pd.to_datetime(weather_forecast['date'])
future = future.merge(weather_forecast, left_on='ds', right_on='date', how='left')
# Fill missing weather with defaults
if 'temperature' in future.columns:
future['temperature'] = future['temperature'].fillna(15.0)
if 'precipitation' in future.columns:
future['precipitation'] = future['precipitation'].fillna(0.0)
if 'is_rainy' in future.columns:
future['is_rainy'] = future['is_rainy'].fillna(0)
# Predict
forecast = self.model.predict(future)
# Format results
results = pd.DataFrame({
'date': forecast['ds'],
'predicted_traffic': forecast['yhat'].clip(lower=0), # Traffic can't be negative
'yhat_lower': forecast['yhat_lower'].clip(lower=0),
'yhat_upper': forecast['yhat_upper'].clip(lower=0)
})
logger.info("Traffic predictions generated",
dates=len(results),
avg_traffic=results['predicted_traffic'].mean())
return results
except Exception as e:
logger.error(f"Failed to predict traffic: {e}")
raise
def _calculate_metrics(self, actual: np.ndarray, predicted: np.ndarray) -> Dict[str, float]:
"""Calculate forecast accuracy metrics"""
mae = np.mean(np.abs(actual - predicted))
mse = np.mean((actual - predicted) ** 2)
rmse = np.sqrt(mse)
# MAPE (handle zeros)
mask = actual != 0
mape = np.mean(np.abs((actual[mask] - predicted[mask]) / actual[mask])) * 100 if mask.any() else 0
return {
'mae': float(mae),
'mse': float(mse),
'rmse': float(rmse),
'mape': float(mape)
}
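# Worked example: actual=[10, 0, 20], predicted=[8, 1, 22]. The zero-actual day
# is masked out, so MAPE = mean(2/10, 2/20) * 100 = 15.0% instead of infinity.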
def _get_spanish_holidays(self, start_year: int, end_year: int) -> pd.DataFrame:
"""Get Spanish holidays for the date range"""
try:
import holidays
es_holidays = holidays.Spain(years=range(start_year, end_year + 1))
holiday_dates = []
holiday_names = []
for date, name in es_holidays.items():
holiday_dates.append(date)
holiday_names.append(name)
return pd.DataFrame({
'ds': pd.to_datetime(holiday_dates),
'holiday': holiday_names
})
except Exception as e:
logger.warning(f"Could not load Spanish holidays: {e}")
return pd.DataFrame(columns=['ds', 'holiday'])
class TrafficFeatureGenerator:
"""
Generate traffic-related features for demand forecasting.
Uses predicted traffic as a feature in product demand models.
"""
def __init__(self, traffic_forecaster: TrafficForecaster = None):
self.traffic_forecaster = traffic_forecaster or TrafficForecaster()
def generate_traffic_features(
self,
dates: pd.DatetimeIndex,
weather_forecast: pd.DataFrame = None
) -> pd.DataFrame:
"""
Generate traffic features for given dates.
Args:
dates: Dates to generate features for
weather_forecast: Optional weather forecast
Returns:
DataFrame with traffic features
"""
if not self.traffic_forecaster.is_trained:
logger.warning("Traffic forecaster not trained, using default traffic values")
return pd.DataFrame({
'date': dates,
'predicted_traffic': 100.0, # Default baseline
'traffic_normalized': 1.0
})
# Predict traffic
traffic_predictions = self.traffic_forecaster.predict(dates, weather_forecast)
# Normalize traffic (0-2 range, 1 = average)
mean_traffic = traffic_predictions['predicted_traffic'].mean()
traffic_predictions['traffic_normalized'] = (
traffic_predictions['predicted_traffic'] / mean_traffic
).clip(0, 2)
# Add traffic categories
traffic_predictions['traffic_category'] = pd.cut(
traffic_predictions['predicted_traffic'],
bins=[0, 50, 100, 150, np.inf],
labels=['low', 'medium', 'high', 'very_high']
)
return traffic_predictions
def add_traffic_features_to_forecast_data(
self,
forecast_data: pd.DataFrame,
traffic_predictions: pd.DataFrame
) -> pd.DataFrame:
"""
Add traffic features to forecast input data.
Args:
forecast_data: Existing forecast data with 'date' column
traffic_predictions: Traffic predictions from generate_traffic_features()
Returns:
Enhanced forecast data with traffic features
"""
forecast_data = forecast_data.copy()
forecast_data['date'] = pd.to_datetime(forecast_data['date'])
traffic_predictions['date'] = pd.to_datetime(traffic_predictions['date'])
# Merge traffic features
enhanced_data = forecast_data.merge(
traffic_predictions[['date', 'predicted_traffic', 'traffic_normalized']],
on='date',
how='left'
)
# Fill missing with defaults
enhanced_data['predicted_traffic'] = enhanced_data['predicted_traffic'].fillna(100.0)
enhanced_data['traffic_normalized'] = enhanced_data['traffic_normalized'].fillna(1.0)
return enhanced_data
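# Illustrative end-to-end sketch with synthetic data (column names as defined above):
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    dates = pd.date_range("2023-01-01", periods=365, freq="D")
    history = pd.DataFrame({
        "date": dates,
        # Baseline of ~100 visitors with a weekend bump and noise
        "traffic_count": 100 + 30 * (dates.dayofweek >= 5) + rng.normal(0, 10, 365),
    })
    forecaster = TrafficForecaster()
    metrics = forecaster.train(history)  # no weather data: temporal features only
    generator = TrafficFeatureGenerator(forecaster)
    features = generator.generate_traffic_features(pd.date_range("2024-01-01", periods=14, freq="D"))
    print(metrics["mape"], features[["date", "predicted_traffic", "traffic_normalized"]].head())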

View File

@@ -14,6 +14,9 @@ import asyncio
from app.ml.data_processor import EnhancedBakeryDataProcessor
from app.ml.prophet_manager import BakeryProphetManager
from app.ml.product_categorizer import ProductCategorizer, ProductCategory
from app.ml.model_selector import ModelSelector
from app.ml.hybrid_trainer import HybridProphetXGBoost
from app.services.training_orchestrator import TrainingDataSet
from app.core.config import settings
@@ -49,6 +52,9 @@ class EnhancedBakeryMLTrainer:
self.database_manager = database_manager or create_database_manager(settings.DATABASE_URL, "training-service")
self.enhanced_data_processor = EnhancedBakeryDataProcessor(self.database_manager)
self.prophet_manager = BakeryProphetManager(database_manager=self.database_manager)
self.hybrid_trainer = HybridProphetXGBoost(database_manager=self.database_manager)
self.model_selector = ModelSelector()
self.product_categorizer = ProductCategorizer()
async def _get_repositories(self, session):
"""Initialize repositories with session"""
@@ -169,6 +175,16 @@ class EnhancedBakeryMLTrainer:
sales_df, weather_df, traffic_df, products, tenant_id, job_id
)
# Categorize all products for category-specific forecasting
logger.info("Categorizing products for optimized forecasting")
product_categories = await self._categorize_all_products(
sales_df, processed_data
)
logger.info("Product categorization complete",
total_products=len(product_categories),
categories_breakdown={cat.value: sum(1 for c in product_categories.values() if c == cat)
for cat in set(product_categories.values())})
# Event 2: Data Analysis (20%)
# Recalculate time remaining based on elapsed time
start_time = repos['training_log']._get_start_time(job_id) if hasattr(repos['training_log'], '_get_start_time') else None
elapsed_seconds = (datetime.now(timezone.utc) - start_time).total_seconds() if start_time else 0
@@ -202,7 +218,7 @@ class EnhancedBakeryMLTrainer:
)
training_results = await self._train_all_models_enhanced(
tenant_id, processed_data, job_id, repos, progress_tracker, product_categories
)
# Calculate overall training summary with enhanced metrics
@@ -269,6 +285,149 @@ class EnhancedBakeryMLTrainer:
raise
async def train_single_product_model(self,
tenant_id: str,
inventory_product_id: str,
training_data: pd.DataFrame,
job_id: str = None) -> Dict[str, Any]:
"""
Train a model for a single product using repository pattern.
Args:
tenant_id: Tenant identifier
inventory_product_id: Specific inventory product to train
training_data: Prepared training DataFrame for the product
job_id: Training job identifier (optional)
Returns:
Dictionary with model training results
"""
if not job_id:
job_id = f"single_product_{tenant_id}_{inventory_product_id}_{uuid.uuid4().hex[:8]}"
logger.info("Starting single product model training",
job_id=job_id,
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
data_points=len(training_data))
try:
# Get database session and repositories
async with self.database_manager.get_session() as db_session:
repos = await self._get_repositories(db_session)
# Validate input data
if training_data.empty or len(training_data) < settings.MIN_TRAINING_DATA_DAYS:
raise ValueError(f"Insufficient training data: need at least {settings.MIN_TRAINING_DATA_DAYS} data points, got {len(training_data)}")
# Validate required columns
required_columns = ['ds', 'y']
missing_cols = [col for col in required_columns if col not in training_data.columns]
if missing_cols:
raise ValueError(f"Missing required columns in training data: {missing_cols}")
# Create a simple progress tracker for single product
from app.services.progress_tracker import ParallelProductProgressTracker
progress_tracker = ParallelProductProgressTracker(
job_id=job_id,
tenant_id=tenant_id,
total_products=1
)
# Ensure training data has proper data types before training
if 'ds' in training_data.columns:
training_data['ds'] = pd.to_datetime(training_data['ds'])
if 'y' in training_data.columns:
training_data['y'] = pd.to_numeric(training_data['y'], errors='coerce')
# Remove any rows with NaN values
training_data = training_data.dropna()
# Train the model using the existing _train_single_product method
product_id, result = await self._train_single_product(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
product_data=training_data,
job_id=job_id,
repos=repos,
progress_tracker=progress_tracker
)
logger.info("Single product training completed",
job_id=job_id,
inventory_product_id=inventory_product_id,
result_status=result.get('status'))
# Get training metrics and filter out non-numeric values
raw_metrics = result.get('model_info', {}).get('training_metrics', {})
# Filter metrics to only include numeric values (per Pydantic schema requirement)
filtered_metrics = {}
for key, value in raw_metrics.items():
if key == 'product_category':
# Skip product_category as it's a string value, not a numeric metric
continue
try:
# Try to convert to float for validation
filtered_metrics[key] = float(value) if value is not None else 0.0
except (ValueError, TypeError):
# Skip non-numeric values
continue
# Return appropriate result format
return {
"job_id": job_id,
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
"status": result.get('status', 'success'),
"model_id": str(result.get('model_record_id', '')) if result.get('model_record_id') else None,
"training_metrics": filtered_metrics,
"training_time": result.get('training_time_seconds', 0),
"data_points": result.get('data_points', 0),
"message": f"Single product model training {'completed' if result.get('status') != 'error' else 'failed'}"
}
except Exception as e:
logger.error("Single product model training failed",
job_id=job_id,
inventory_product_id=inventory_product_id,
error=str(e))
raise
def _serialize_scalers(self, scalers: Dict[str, Any]) -> Dict[str, Any]:
"""
Serialize scaler objects to basic Python types that can be stored in database.
This prevents issues with storing complex sklearn objects in JSON fields.
"""
if not scalers:
return {}
serialized = {}
for key, value in scalers.items():
try:
# Convert numpy scalars to Python native types
if hasattr(value, 'item'): # numpy scalars
serialized[key] = value.item()
elif isinstance(value, (np.integer, np.floating)):
serialized[key] = value.item() # Convert numpy types to Python types
elif isinstance(value, (int, float, str, bool, type(None))):
serialized[key] = value # Already basic type
elif isinstance(value, (list, tuple)):
# Convert list/tuple elements to basic types
serialized[key] = [v.item() if hasattr(v, 'item') else v for v in value]
else:
# For complex objects, try to convert to string representation
# or store as float if it's numeric
try:
serialized[key] = float(value)
except (ValueError, TypeError):
# If all else fails, convert to string
serialized[key] = str(value)
except Exception:
# If serialization fails, set to None to prevent database errors
serialized[key] = None
return serialized
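# Example (hypothetical scaler keys): {'y_mean': np.float64(42.5), 'y_std': np.float64(7.1)}
# becomes {'y_mean': 42.5, 'y_std': 7.1}; a fitted sklearn scaler object, having no
# numeric conversion, falls back to its str() representation rather than failing.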
async def _process_all_products_enhanced(self,
sales_df: pd.DataFrame,
weather_df: pd.DataFrame,
@@ -321,12 +480,15 @@ class EnhancedBakeryMLTrainer:
product_data: pd.DataFrame,
job_id: str,
repos: Dict,
progress_tracker: ParallelProductProgressTracker,
product_category: ProductCategory = ProductCategory.UNKNOWN) -> tuple[str, Dict[str, Any]]:
"""Train a single product model - used for parallel execution with progress aggregation"""
product_start_time = time.time()
try:
logger.info("Training model", inventory_product_id=inventory_product_id)
logger.info("Training model",
inventory_product_id=inventory_product_id,
category=product_category.value)
# Check if we have enough data
if len(product_data) < settings.MIN_TRAINING_DATA_DAYS:
@@ -343,14 +505,58 @@ class EnhancedBakeryMLTrainer:
min_required=settings.MIN_TRAINING_DATA_DAYS)
return inventory_product_id, result
# Get category-specific hyperparameters
category_characteristics = self.product_categorizer.get_category_characteristics(product_category)
# Determine which model type to use (Prophet vs Hybrid)
model_type = self.model_selector.select_model_type(
df=product_data,
product_category=product_category.value
)
logger.info("Model type selected",
inventory_product_id=inventory_product_id,
model_type=model_type,
category=product_category.value)
# Train the selected model
if model_type == "hybrid":
# Train hybrid Prophet + XGBoost model
model_info = await self.hybrid_trainer.train_hybrid_model(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
df=product_data,
job_id=job_id
)
model_info['model_type'] = 'hybrid_prophet_xgboost'
else:
# Train Prophet-only model with category-specific settings
model_info = await self.prophet_manager.train_bakery_model(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
df=product_data,
job_id=job_id,
product_category=product_category,
category_hyperparameters=category_characteristics.get('prophet_params', {})
)
model_info['model_type'] = 'prophet_optimized'
# Filter training metrics to exclude non-numeric values (e.g., product_category)
if 'training_metrics' in model_info and model_info['training_metrics']:
raw_metrics = model_info['training_metrics']
filtered_metrics = {}
for key, value in raw_metrics.items():
if key == 'product_category':
# Skip product_category as it's a string value, not a numeric metric
continue
try:
# Try to convert to float for validation
filtered_metrics[key] = float(value) if value is not None else 0.0
except (ValueError, TypeError):
# Skip non-numeric values
continue
model_info['training_metrics'] = filtered_metrics
# Store model record using repository
model_record = await self._create_model_record(
repos, tenant_id, inventory_product_id, model_info, job_id, product_data
@@ -366,7 +572,7 @@ class EnhancedBakeryMLTrainer:
result = {
'status': 'success',
'model_info': model_info,
'model_record_id': model_record.id if model_record else None,
'model_record_id': str(model_record.id) if model_record else None,
'data_points': len(product_data),
'training_time_seconds': time.time() - product_start_time,
'trained_at': datetime.now().isoformat()
@@ -403,7 +609,8 @@ class EnhancedBakeryMLTrainer:
processed_data: Dict[str, pd.DataFrame],
job_id: str,
repos: Dict,
progress_tracker: ParallelProductProgressTracker) -> Dict[str, Any]:
progress_tracker: ParallelProductProgressTracker,
product_categories: Dict[str, ProductCategory] = None) -> Dict[str, Any]:
"""Train models with throttled parallel execution and progress tracking"""
total_products = len(processed_data)
logger.info(f"Starting throttled parallel training for {total_products} products")
@@ -416,7 +623,8 @@ class EnhancedBakeryMLTrainer:
product_data=product_data,
job_id=job_id,
repos=repos,
progress_tracker=progress_tracker
progress_tracker=progress_tracker,
product_category=product_categories.get(inventory_product_id, ProductCategory.UNKNOWN) if product_categories else ProductCategory.UNKNOWN
)
for inventory_product_id, product_data in processed_data.items()
]
@@ -478,6 +686,29 @@ class EnhancedBakeryMLTrainer:
processed_data: pd.DataFrame):
"""Create model record using repository"""
try:
# Extract training period from the processed data
training_start_date = None
training_end_date = None
if 'ds' in processed_data.columns and not processed_data.empty:
# Ensure ds column is datetime64 before extracting dates (prevents object dtype issues)
ds_datetime = pd.to_datetime(processed_data['ds'])
# Get min/max as pandas Timestamps (guaranteed to work correctly)
min_ts = ds_datetime.min()
max_ts = ds_datetime.max()
# Convert to python datetime with timezone removal
if pd.notna(min_ts):
training_start_date = pd.Timestamp(min_ts).to_pydatetime().replace(tzinfo=None)
if pd.notna(max_ts):
training_end_date = pd.Timestamp(max_ts).to_pydatetime().replace(tzinfo=None)
# Ensure features are clean string list
try:
features_used = [str(col) for col in processed_data.columns]
except Exception:
features_used = []
model_data = {
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
@@ -485,17 +716,20 @@ class EnhancedBakeryMLTrainer:
"model_type": "enhanced_prophet",
"model_path": model_info.get("model_path"),
"metadata_path": model_info.get("metadata_path"),
"mape": model_info.get("training_metrics", {}).get("mape"),
"mae": model_info.get("training_metrics", {}).get("mae"),
"rmse": model_info.get("training_metrics", {}).get("rmse"),
"r2_score": model_info.get("training_metrics", {}).get("r2"),
"training_samples": len(processed_data),
"hyperparameters": model_info.get("hyperparameters"),
"features_used": list(processed_data.columns),
"normalization_params": self.enhanced_data_processor.get_scalers(), # Include scalers for prediction consistency
"mape": float(model_info.get("training_metrics", {}).get("mape", 0)) if model_info.get("training_metrics", {}).get("mape") is not None else 0,
"mae": float(model_info.get("training_metrics", {}).get("mae", 0)) if model_info.get("training_metrics", {}).get("mae") is not None else 0,
"rmse": float(model_info.get("training_metrics", {}).get("rmse", 0)) if model_info.get("training_metrics", {}).get("rmse") is not None else 0,
"r2_score": float(model_info.get("training_metrics", {}).get("r2", 0)) if model_info.get("training_metrics", {}).get("r2") is not None else 0,
"training_samples": int(len(processed_data)),
"hyperparameters": self._serialize_scalers(model_info.get("hyperparameters", {})),
"features_used": [str(f) for f in features_used] if features_used else [],
"normalization_params": self._serialize_scalers(self.enhanced_data_processor.get_scalers()) or {}, # Include scalers for prediction consistency
"product_category": model_info.get("product_category", "unknown"), # Store product category
"is_active": True,
"is_production": True,
"data_quality_score": model_info.get("data_quality_score", 100.0)
"data_quality_score": float(model_info.get("data_quality_score", 100.0)) if model_info.get("data_quality_score") is not None else 100.0,
"training_start_date": training_start_date,
"training_end_date": training_end_date
}
model_record = await repos['model'].create_model(model_data)
@@ -533,13 +767,13 @@ class EnhancedBakeryMLTrainer:
"model_id": str(model_id),
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
"mae": metrics.get("mae"),
"mse": metrics.get("mse"),
"rmse": metrics.get("rmse"),
"mape": metrics.get("mape"),
"r2_score": metrics.get("r2"),
"accuracy_percentage": 100 - metrics.get("mape", 0) if metrics.get("mape") else None,
"evaluation_samples": metrics.get("data_points", 0)
"mae": float(metrics.get("mae")) if metrics.get("mae") is not None else None,
"mse": float(metrics.get("mse")) if metrics.get("mse") is not None else None,
"rmse": float(metrics.get("rmse")) if metrics.get("rmse") is not None else None,
"mape": float(metrics.get("mape")) if metrics.get("mape") is not None else None,
"r2_score": float(metrics.get("r2")) if metrics.get("r2") is not None else None,
"accuracy_percentage": float(100 - metrics.get("mape", 0)) if metrics.get("mape") is not None else None,
"evaluation_samples": int(metrics.get("data_points", 0)) if metrics.get("data_points") is not None else 0
}
await repos['performance'].create_performance_metric(metric_data)
@@ -672,7 +906,59 @@ class EnhancedBakeryMLTrainer:
sales_df['quantity'] = pd.to_numeric(sales_df['quantity'], errors='coerce')
except Exception:
raise ValueError("Quantity column must be numeric")
async def _categorize_all_products(
self,
sales_df: pd.DataFrame,
processed_data: Dict[str, pd.DataFrame]
) -> Dict[str, ProductCategory]:
"""
Categorize all products for category-specific forecasting.
Args:
sales_df: Raw sales data with product names
processed_data: Processed data by product ID
Returns:
Dict mapping inventory_product_id to ProductCategory
"""
product_categories = {}
for inventory_product_id in processed_data.keys():
try:
# Get product name from sales data (if available)
product_sales = sales_df[sales_df['inventory_product_id'] == inventory_product_id]
# Extract product name (try multiple possible column names)
product_name = "unknown"
for name_col in ['product_name', 'name', 'item_name']:
if name_col in product_sales.columns and not product_sales[name_col].empty:
product_name = product_sales[name_col].iloc[0]
break
# Prepare sales data for pattern analysis
sales_for_analysis = product_sales[['date', 'quantity']].copy() if 'date' in product_sales.columns else None
# Categorize product
category = self.product_categorizer.categorize_product(
product_name=str(product_name),
product_id=inventory_product_id,
sales_data=sales_for_analysis
)
product_categories[inventory_product_id] = category
logger.debug("Product categorized",
inventory_product_id=inventory_product_id,
product_name=product_name,
category=category.value)
except Exception as e:
logger.warning(f"Failed to categorize product {inventory_product_id}: {e}")
product_categories[inventory_product_id] = ProductCategory.UNKNOWN
return product_categories
async def evaluate_model_performance_enhanced(self,
tenant_id: str,
inventory_product_id: str,

View File

@@ -18,6 +18,7 @@ from .training import (
ModelPerformanceMetric,
TrainingJobQueue,
ModelArtifact,
TrainingPerformanceMetrics,
)
# List all models for easier access
@@ -27,5 +28,6 @@ __all__ = [
"ModelPerformanceMetric",
"TrainingJobQueue",
"ModelArtifact",
"TrainingPerformanceMetrics",
"AuditLog",
]

View File

@@ -150,7 +150,8 @@ class TrainedModel(Base):
hyperparameters = Column(JSON) # Store optimized parameters
features_used = Column(JSON) # List of regressor columns
normalization_params = Column(JSON) # Store feature normalization parameters for consistent predictions
product_category = Column(String, nullable=True) # Product category for category-specific forecasting
# Model status
is_active = Column(Boolean, default=True)
is_production = Column(Boolean, default=False)
@@ -185,6 +186,7 @@ class TrainedModel(Base):
"training_samples": self.training_samples,
"hyperparameters": self.hyperparameters,
"features_used": self.features_used,
"product_category": self.product_category,
"is_active": self.is_active,
"is_production": self.is_production,
"created_at": self.created_at.isoformat() if self.created_at else None,

View File

@@ -5,7 +5,7 @@ Includes all request/response schemas used by the API endpoints
"""
from pydantic import BaseModel, Field, validator
from typing import List, Optional, Dict, Any, Union
from typing import List, Optional, Dict, Any, Union, Tuple
from datetime import datetime
from enum import Enum
from uuid import UUID
@@ -37,6 +37,9 @@ class SingleProductTrainingRequest(BaseModel):
daily_seasonality: bool = Field(True, description="Enable daily seasonality")
weekly_seasonality: bool = Field(True, description="Enable weekly seasonality")
yearly_seasonality: bool = Field(True, description="Enable yearly seasonality")
# Location parameters
bakery_location: Optional[Tuple[float, float]] = Field(None, description="Bakery coordinates (latitude, longitude)")
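# Example request body (values illustrative; coordinates are central Madrid):
# {"daily_seasonality": true, "weekly_seasonality": true, "yearly_seasonality": true,
#  "bakery_location": [40.4168, -3.7038]}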
class DateRangeInfo(BaseModel):
"""Schema for date range information"""

View File

@@ -170,6 +170,7 @@ class TrainingDataOrchestrator:
logger.error(f"Training data preparation failed: {str(e)}")
raise ValueError(f"Failed to prepare training data: {str(e)}")
@staticmethod
def extract_sales_date_range_utc_localize(sales_data_df: pd.DataFrame):
"""
Extracts the UTC-aware date range from a sales DataFrame using tz_localize.
@@ -246,12 +247,14 @@ class TrainingDataOrchestrator:
if 'date' in record:
record_date = record['date']
# ✅ FIX: Proper timezone handling for date parsing - FIXED THE TRUNCATION ISSUE
if isinstance(record_date, str):
# Parse complete ISO datetime string with timezone info intact
# DO NOT truncate to date part only - this was causing the filtering issue
if 'T' in record_date:
record_date = record_date.replace('Z', '+00:00')
# Parse with FULL datetime info, not just date part
parsed_date = datetime.fromisoformat(record_date)
# Ensure timezone-aware
if parsed_date.tzinfo is None:
parsed_date = parsed_date.replace(tzinfo=timezone.utc)
@@ -260,8 +263,8 @@ class TrainingDataOrchestrator:
# Ensure timezone-aware
if record_date.tzinfo is None:
record_date = record_date.replace(tzinfo=timezone.utc)
# DO NOT normalize to start of day - keep actual datetime for proper filtering
# Only normalize if needed for daily aggregation, but preserve original for filtering
# ✅ FIX: Ensure aligned_range dates are also timezone-aware for comparison
aligned_start = aligned_range.start
@@ -885,4 +888,4 @@ class TrainingDataOrchestrator:
1 if len(dataset.traffic_data) > 0 else 0
])
}
}

View File

@@ -468,6 +468,7 @@ class EnhancedTrainingService:
"""
try:
from app.models.training import TrainingPerformanceMetrics
from shared.database.repository import BaseRepository
# Extract timing and success data
models_trained = training_results.get("models_trained", {})
@@ -508,10 +509,13 @@ class EnhancedTrainingService:
"completed_at": datetime.now(timezone.utc)
}
# Create a temporary repository for the TrainingPerformanceMetrics model
# Use the session from one of the initialized repositories to ensure it's available
session = self.model_repo.session # This should be the same session used by all repositories
metrics_repo = BaseRepository(TrainingPerformanceMetrics, session)
# Use repository to create record
await metrics_repo.create(metric_data)
logger.info("Saved training performance metrics for future estimations",
tenant_id=tenant_id,
@@ -777,17 +781,154 @@ class EnhancedTrainingService:
inventory_product_id=inventory_product_id,
job_id=job_id)
# Create initial training log
await self._update_job_status_repository(
job_id=job_id,
status="running",
progress=0,
current_step="Fetching training data",
tenant_id=tenant_id
)
# Prepare training data for all products to get weather/traffic data
# then filter down to the specific product
training_dataset = await self.orchestrator.prepare_training_data(
tenant_id=tenant_id,
bakery_location=bakery_location,
job_id=job_id + "_temp"
)
# Filter sales data to the specific product
sales_df = pd.DataFrame(training_dataset.sales_data)
product_sales_df = sales_df[sales_df['inventory_product_id'] == inventory_product_id]
if product_sales_df.empty:
raise ValueError(f"No sales data available for product {inventory_product_id}")
# Prepare the data in Prophet format (ds and y columns)
# Ensure proper column names and types for Prophet
product_data = product_sales_df.copy()
product_data = product_data.rename(columns={
'sale_date': 'ds', # Common sales date column
'sale_datetime': 'ds', # Alternative date column
'date': 'ds', # Alternative date column
'quantity': 'y', # Quantity sold
'total_amount': 'y', # Alternative for sales data
'sales_amount': 'y', # Alternative for sales data
'sale_amount': 'y' # Alternative for sales data
})
# If 'ds' and 'y' columns are not renamed properly, try to infer them
if 'ds' not in product_data.columns:
# Try to find date-like columns
date_cols = [col for col in product_data.columns if 'date' in col.lower() or 'time' in col.lower()]
if date_cols:
product_data = product_data.rename(columns={date_cols[0]: 'ds'})
if 'y' not in product_data.columns:
# Try to find sales/quantity-like columns
sales_cols = [col for col in product_data.columns if
any(word in col.lower() for word in ['amount', 'quantity', 'sales', 'total', 'count', 'value'])]
if sales_cols:
product_data = product_data.rename(columns={sales_cols[0]: 'y'})
# Ensure required columns exist
if 'ds' not in product_data.columns or 'y' not in product_data.columns:
raise ValueError(f"Sales data must contain 'date' and 'quantity/sales' columns. Available columns: {list(product_data.columns)}")
# Convert the date column to datetime if it's not already
product_data['ds'] = pd.to_datetime(product_data['ds'])
# Coerce to numeric; unparseable values become NaN and are dropped below
product_data['y'] = pd.to_numeric(product_data['y'], errors='coerce')
# Sort by date to ensure proper chronological order
product_data = product_data.sort_values('ds').reset_index(drop=True)
# Drop any rows with NaN values
product_data = product_data.dropna(subset=['ds', 'y'])
# Ensure the data is in the right format for Prophet
product_data = product_data[['ds', 'y']].copy()
# Convert to pandas datetime and float types (keep as pandas Series for proper min/max operations)
product_data['ds'] = pd.to_datetime(product_data['ds'])
product_data['y'] = product_data['y'].astype(float)
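# Resulting frame at this point (illustrative values):
#           ds      y
# 0 2024-01-01   42.0
# 1 2024-01-02   37.5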
# DEBUG: Log data types to diagnose dict comparison error
logger.info(f"DEBUG: product_data dtypes after conversion: ds={product_data['ds'].dtype}, y={product_data['y'].dtype}")
logger.info(f"DEBUG: product_data['ds'] sample values: {product_data['ds'].head(3).tolist()}")
logger.info(f"DEBUG: Attempting to get min/max...")
try:
min_val = product_data['ds'].min()
max_val = product_data['ds'].max()
logger.info(f"DEBUG: min_val type={type(min_val)}, value={min_val}")
logger.info(f"DEBUG: max_val type={type(max_val)}, value={max_val}")
except Exception as debug_e:
logger.error(f"DEBUG: Failed to get min/max: {debug_e}")
import traceback
logger.error(f"DEBUG: Traceback: {traceback.format_exc()}")
logger.info("Prepared training data for single product",
inventory_product_id=inventory_product_id,
data_points=len(product_data),
date_range=f"{product_data['ds'].min()} to {product_data['ds'].max()}")
# Update progress
await self._update_job_status_repository(
job_id=job_id,
status="running",
progress=30,
current_step="Training model",
tenant_id=tenant_id
)
# Train the model using the trainer
# Extract datetime values with proper pandas Timestamp wrapper for type safety
try:
training_start = pd.Timestamp(product_data['ds'].min()).to_pydatetime()
training_end = pd.Timestamp(product_data['ds'].max()).to_pydatetime()
except Exception as e:
import traceback
logger.error(f"Failed to extract training dates: {e}")
logger.error(f"Traceback: {traceback.format_exc()}")
logger.error(f"product_data['ds'] dtype: {product_data['ds'].dtype}")
logger.error(f"product_data['ds'] first 5 values: {product_data['ds'].head().tolist()}")
raise
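# Illustrative conversion, assuming 'ds' holds pandas/numpy datetimes:
#   pd.Timestamp(product_data['ds'].min()).to_pydatetime()
#   -> datetime.datetime(2024, 1, 1, 0, 0)
# pd.Timestamp accepts either a pandas Timestamp or a numpy datetime64, so this
# is safe regardless of which type min()/max() return.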
# Run the actual training
try:
model_info = await self.trainer.train_single_product_model(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
training_data=product_data,
job_id=job_id
)
except Exception as e:
import traceback
logger.error(f"Training failed with error: {e}")
logger.error(f"Full traceback: {traceback.format_exc()}")
raise
# Update progress
await self._update_job_status_repository(
job_id=job_id,
status="running",
progress=80,
current_step="Saving model",
tenant_id=tenant_id
)
# The model should already be saved by train_single_product_model
# Return appropriate response
return {
"job_id": job_id,
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
"status": "completed",
"message": "Enhanced single product training completed successfully",
"created_at": datetime.now(),
"created_at": datetime.now(timezone.utc),
"estimated_duration_minutes": 15, # Default estimate for single product
"training_results": {
"total_products": 1,
"successful_trainings": 1,
@@ -795,21 +936,37 @@ class EnhancedTrainingService:
"products": [{
"inventory_product_id": inventory_product_id,
"status": "completed",
"model_id": f"model_{inventory_product_id}_{job_id[:8]}",
"data_points": 100,
"metrics": {"mape": 15.5, "mae": 2.3, "rmse": 3.1, "r2_score": 0.85}
"model_id": str(model_info.get('model_id', f"model_{inventory_product_id}_{job_id[:8]}")) if model_info.get('model_id') else None,
"data_points": len(product_data) if product_data is not None else 0,
# Keep only numeric metric values; drop product_category and None entries
"metrics": {
k: float(v)
for k, v in model_info.get('training_metrics', {"mape": 0.0, "mae": 0.0, "rmse": 0.0, "r2_score": 0.0}).items()
if k != 'product_category' and isinstance(v, (int, float))
}
}],
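# Illustrative effect of the metrics filter above:
#   {"mape": 12.4, "mae": "n/a", "product_category": "bread", "r2_score": None}
#   -> {"mape": 12.4}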
"overall_training_time_seconds": 45.2
"overall_training_time_seconds": model_info.get('training_time', 45.2)
},
"enhanced_features": True,
"repository_integration": True,
"completed_at": datetime.now().isoformat()
"completed_at": datetime.now(timezone.utc).isoformat()
}
except Exception as e:
logger.error("Enhanced single product training failed",
inventory_product_id=inventory_product_id,
error=str(e))
# Update status to failed
await self._update_job_status_repository(
job_id=job_id,
status="failed",
progress=0,
current_step="Training failed",
error_message=str(e),
tenant_id=tenant_id
)
raise
def _create_detailed_training_response(self, final_result: Dict[str, Any]) -> Dict[str, Any]:
@@ -842,6 +999,7 @@ class EnhancedTrainingService:
"status": final_result["status"],
"message": f"Training {final_result['status']} successfully",
"created_at": datetime.now(),
"estimated_duration_minutes": final_result.get("estimated_duration_minutes", 15),
"training_results": {
"total_products": len(products),
"successful_trainings": len([p for p in products if p["status"] == "completed"]),