bakery-ia/shared/ml/feature_calculator.py

"""
Shared Feature Calculator for Training and Prediction Services
This module provides unified feature calculation logic to ensure consistency
between model training and inference (prediction), preventing train/serve skew.
Key principles:
- Same lag calculation logic in training and prediction
- Same rolling window statistics in training and prediction
- Same trend feature calculations in training and prediction
- Graceful handling of sparse/missing data with consistent fallbacks
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Union, Tuple
from datetime import datetime
import structlog
logger = structlog.get_logger()
class HistoricalFeatureCalculator:
"""
Unified historical feature calculator for both training and prediction.
This class ensures that features are calculated identically whether
during model training or during inference, preventing train/serve skew.
"""
def __init__(self):
"""Initialize the feature calculator."""
self.feature_columns = []
def calculate_lag_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
lag_days: Optional[List[int]] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate lagged sales features consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
lag_days: List of lag periods (default: [1, 7, 14])
mode: 'training' returns DataFrame with lag columns, 'prediction' returns dict of features
Returns:
DataFrame with lag columns (training mode) or dict of lag features (prediction mode)
"""
if lag_days is None:
lag_days = [1, 7, 14]
if mode == 'training':
return self._calculate_lag_features_training(sales_data, lag_days)
else:
return self._calculate_lag_features_prediction(sales_data, lag_days)
def _calculate_lag_features_training(
self,
df: pd.DataFrame,
lag_days: List[int]
) -> pd.DataFrame:
"""
Calculate lag features for training (operates on DataFrame).
Args:
df: DataFrame with 'quantity' column
lag_days: List of lag periods
Returns:
DataFrame with added lag columns
"""
df = df.copy()
# Calculate overall statistics for fallback (consistent with prediction)
overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0
for lag in lag_days:
col_name = f'lag_{lag}_day'
# Use pandas shift
df[col_name] = df['quantity'].shift(lag)
# Fill NaN values with a cascading fallback analogous to prediction mode:
# shorter lag -> earliest observed value -> overall mean
if lag == 1:
# For lag_1, leading rows have no prior day; fall back to the earliest observed quantity, else the overall mean
df[col_name] = df[col_name].fillna(df['quantity'].iloc[0] if len(df) > 0 else overall_mean)
elif lag == 7:
# For lag_7, fill with lag_1 if available, else the earliest observed quantity, else the mean
mask = df[col_name].isna()
if 'lag_1_day' in df.columns:
df.loc[mask, col_name] = df.loc[mask, 'lag_1_day']
else:
df.loc[mask, col_name] = df['quantity'].iloc[0] if len(df) > 0 else overall_mean
elif lag == 14:
# For lag_14, fill with lag_7 if available, else lag_1, else the earliest observed quantity, else the mean
mask = df[col_name].isna()
if 'lag_7_day' in df.columns:
df.loc[mask, col_name] = df.loc[mask, 'lag_7_day']
elif 'lag_1_day' in df.columns:
df.loc[mask, col_name] = df.loc[mask, 'lag_1_day']
else:
df.loc[mask, col_name] = df['quantity'].iloc[0] if len(df) > 0 else overall_mean
# Fill any remaining NaN with mean
df[col_name] = df[col_name].fillna(overall_mean)
self.feature_columns.append(col_name)
logger.debug(f"Added {len(lag_days)} lagged features (training mode)", lags=lag_days)
return df
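# Illustrative note: on an ascending daily DataFrame, the leading rows of lag_1_day
# have no prior observation and are filled with the earliest quantity; the leading
# rows of lag_7_day borrow the already-filled lag_1_day values, lag_14_day borrows
# lag_7_day, and anything still missing falls back to the overall mean.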
def _calculate_lag_features_prediction(
self,
historical_sales: pd.Series,
lag_days: List[int]
) -> Dict[str, float]:
"""
Calculate lag features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
lag_days: List of lag periods
Returns:
Dictionary of lag features
"""
features = {}
if len(historical_sales) == 0:
# Return default values if no data
for lag in lag_days:
features[f'lag_{lag}_day'] = 0.0
return features
# Calculate overall statistics for fallback
overall_mean = float(historical_sales.mean())
overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
# Calculate lag_1_day
if 1 in lag_days:
if len(historical_sales) >= 1:
features['lag_1_day'] = float(historical_sales.iloc[-1])
else:
features['lag_1_day'] = overall_mean
# Calculate lag_7_day
if 7 in lag_days:
if len(historical_sales) >= 7:
features['lag_7_day'] = float(historical_sales.iloc[-7])
else:
# Fallback to last value if insufficient data
features['lag_7_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean
# Calculate lag_14_day
if 14 in lag_days:
if len(historical_sales) >= 14:
features['lag_14_day'] = float(historical_sales.iloc[-14])
else:
# Cascading fallback: 7-days-back value -> most recent value -> mean
if len(historical_sales) >= 7:
features['lag_14_day'] = float(historical_sales.iloc[-7])
else:
features['lag_14_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean
logger.debug("Calculated lag features (prediction mode)", features=features)
return features
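# Illustrative example (assumed toy values): with 10 days of history ending at a
# value of 12.0, lag_1_day = 12.0, lag_7_day is the value seven positions back, and
# lag_14_day falls back to that same 7-days-back value because fewer than 14 days
# are available. With an empty series, every lag defaults to 0.0.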
def calculate_rolling_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
windows: Optional[List[int]] = None,
statistics: Optional[List[str]] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate rolling window statistics consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
windows: List of window sizes in days (default: [7, 14, 30])
statistics: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
mode: 'training' returns DataFrame, 'prediction' returns dict
Returns:
DataFrame with rolling columns (training mode) or dict of rolling features (prediction mode)
"""
if windows is None:
windows = [7, 14, 30]
if statistics is None:
statistics = ['mean', 'std', 'max', 'min']
if mode == 'training':
return self._calculate_rolling_features_training(sales_data, windows, statistics)
else:
return self._calculate_rolling_features_prediction(sales_data, windows, statistics)
def _calculate_rolling_features_training(
self,
df: pd.DataFrame,
windows: List[int],
statistics: List[str]
) -> pd.DataFrame:
"""
Calculate rolling features for training (operates on DataFrame).
Args:
df: DataFrame with 'quantity' column
windows: List of window sizes
statistics: List of statistics to calculate
Returns:
DataFrame with added rolling columns
"""
df = df.copy()
# Calculate overall statistics for fallback
overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0
overall_max = float(df['quantity'].max()) if len(df) > 0 else 0.0
overall_min = float(df['quantity'].min()) if len(df) > 0 else 0.0
fallback_values = {
'mean': overall_mean,
'std': overall_std,
'max': overall_max,
'min': overall_min
}
for window in windows:
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
# Calculate rolling statistic with full window required (consistent with prediction)
# Use min_periods=window to match prediction behavior
if stat == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).mean()
elif stat == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).std()
elif stat == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).max()
elif stat == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).min()
# Fill NaN values using cascading fallback (consistent with prediction)
# Use smaller window values if available, otherwise use overall statistics
mask = df[col_name].isna()
if window == 14 and f'rolling_{stat}_7d' in df.columns:
# Use 7-day window for 14-day NaN
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
elif window == 30 and f'rolling_{stat}_14d' in df.columns:
# Use 14-day window for 30-day NaN
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_14d']
elif window == 30 and f'rolling_{stat}_7d' in df.columns:
# Use 7-day window for 30-day NaN if 14-day not available
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
# Fill any remaining NaN with overall statistics
df[col_name] = df[col_name].fillna(fallback_values[stat])
self.feature_columns.append(col_name)
logger.debug(f"Added rolling features (training mode)", windows=windows, statistics=statistics)
return df
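# Illustrative note: with min_periods=window, the first window-1 rows of each rolling
# column are NaN; a NaN in a 14-day column is filled from the 7-day column of the same
# row, a NaN in a 30-day column from the 14-day (or 7-day) column, and any remaining
# NaN from the overall mean/std/max/min of the series.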
def _calculate_rolling_features_prediction(
self,
historical_sales: pd.Series,
windows: List[int],
statistics: List[str]
) -> Dict[str, float]:
"""
Calculate rolling features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
windows: List of window sizes
statistics: List of statistics to calculate
Returns:
Dictionary of rolling features
"""
features = {}
if len(historical_sales) == 0:
# Return default values if no data
for window in windows:
for stat in statistics:
features[f'rolling_{stat}_{window}d'] = 0.0
return features
# Calculate overall statistics for fallback
overall_mean = float(historical_sales.mean())
overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
overall_max = float(historical_sales.max())
overall_min = float(historical_sales.min())
fallback_values = {
'mean': overall_mean,
'std': overall_std,
'max': overall_max,
'min': overall_min
}
# Calculate for each window
for window in windows:
if len(historical_sales) >= window:
# Have enough data for full window
window_data = historical_sales.iloc[-window:]
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
if stat == 'mean':
features[col_name] = float(window_data.mean())
elif stat == 'std':
features[col_name] = float(window_data.std()) if len(window_data) > 1 else 0.0
elif stat == 'max':
features[col_name] = float(window_data.max())
elif stat == 'min':
features[col_name] = float(window_data.min())
else:
# Insufficient data - use cascading fallback
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
# Try to use smaller window if available
if window == 14 and f'rolling_{stat}_7d' in features:
features[col_name] = features[f'rolling_{stat}_7d']
elif window == 30 and f'rolling_{stat}_14d' in features:
features[col_name] = features[f'rolling_{stat}_14d']
elif window == 30 and f'rolling_{stat}_7d' in features:
features[col_name] = features[f'rolling_{stat}_7d']
else:
# Use overall statistics
features[col_name] = fallback_values[stat]
logger.debug("Calculated rolling features (prediction mode)", num_features=len(features))
return features
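# Illustrative example (assumed toy values): with only 10 days of history, the
# rolling_*_7d statistics are computed from the last 7 observations, rolling_*_14d
# falls back to the corresponding 7-day value, and rolling_*_30d falls back to the
# 14-day value (itself the 7-day value here). With no usable smaller window, the
# overall series statistics are used instead.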
def calculate_trend_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
reference_date: Optional[datetime] = None,
lag_features: Optional[Dict[str, float]] = None,
rolling_features: Optional[Dict[str, float]] = None,
date_column: str = 'date',
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate trend-based features consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training)
reference_date: Reference date for calculations (prediction mode)
lag_features: Pre-calculated lag features (prediction mode)
rolling_features: Pre-calculated rolling features (prediction mode)
date_column: Name of the date column (training mode only)
mode: 'training' returns DataFrame, 'prediction' returns dict
Returns:
DataFrame with trend columns (training mode) or dict of trend features (prediction mode)
"""
if mode == 'training':
return self._calculate_trend_features_training(sales_data, date_column=date_column)
else:
return self._calculate_trend_features_prediction(
sales_data,
reference_date,
lag_features,
rolling_features
)
def _calculate_trend_features_training(
self,
df: pd.DataFrame,
date_column: str = 'date'
) -> pd.DataFrame:
"""
Calculate trend features for training (operates on DataFrame).
Args:
df: DataFrame with date and lag/rolling features
date_column: Name of date column
Returns:
DataFrame with added trend columns
"""
df = df.copy()
# Days since start
df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
# Momentum (difference between lag_1 and lag_7)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
self.feature_columns.append('momentum_1_7')
else:
df['momentum_1_7'] = 0.0
self.feature_columns.append('momentum_1_7')
# Trend (difference between 7-day and 30-day rolling means)
if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
self.feature_columns.append('trend_7_30')
else:
df['trend_7_30'] = 0.0
self.feature_columns.append('trend_7_30')
# Velocity (rate of change over week)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7.0
self.feature_columns.append('velocity_week')
else:
df['velocity_week'] = 0.0
self.feature_columns.append('velocity_week')
self.feature_columns.append('days_since_start')
logger.debug("Added trend features (training mode)")
return df
def _calculate_trend_features_prediction(
self,
historical_sales: pd.Series,
reference_date: datetime,
lag_features: Dict[str, float],
rolling_features: Dict[str, float]
) -> Dict[str, float]:
"""
Calculate trend features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
reference_date: The date we're forecasting for
lag_features: Pre-calculated lag features
rolling_features: Pre-calculated rolling features
Returns:
Dictionary of trend features
"""
features = {}
if len(historical_sales) == 0:
return {
'days_since_start': 0,
'momentum_1_7': 0.0,
'trend_7_30': 0.0,
'velocity_week': 0.0
}
# Days since first sale
features['days_since_start'] = (reference_date - historical_sales.index[0]).days
# Momentum (difference between lag_1 and lag_7)
if 'lag_1_day' in lag_features and 'lag_7_day' in lag_features:
if len(historical_sales) >= 7:
features['momentum_1_7'] = lag_features['lag_1_day'] - lag_features['lag_7_day']
else:
features['momentum_1_7'] = 0.0 # Insufficient data
else:
features['momentum_1_7'] = 0.0
# Trend (difference between 7-day and 30-day rolling means)
if 'rolling_mean_7d' in rolling_features and 'rolling_mean_30d' in rolling_features:
if len(historical_sales) >= 30:
features['trend_7_30'] = rolling_features['rolling_mean_7d'] - rolling_features['rolling_mean_30d']
else:
features['trend_7_30'] = 0.0 # Insufficient data
else:
features['trend_7_30'] = 0.0
# Velocity (rate of change over week)
if 'lag_1_day' in lag_features and 'lag_7_day' in lag_features:
if len(historical_sales) >= 7:
recent_value = lag_features['lag_1_day']
past_value = lag_features['lag_7_day']
features['velocity_week'] = float((recent_value - past_value) / 7.0)
else:
features['velocity_week'] = 0.0 # Insufficient data
else:
features['velocity_week'] = 0.0
logger.debug("Calculated trend features (prediction mode)", features=features)
return features
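# Worked example (assumed values): with lag_1_day = 25.0, lag_7_day = 18.0 and at
# least 7 days of history, momentum_1_7 = 25.0 - 18.0 = 7.0 and
# velocity_week = 7.0 / 7.0 = 1.0; trend_7_30 stays 0.0 until at least 30 days of
# history are available.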
def calculate_data_freshness_metrics(
self,
historical_sales: pd.Series,
forecast_date: datetime
) -> Dict[str, Union[int, float]]:
"""
Calculate data freshness and availability metrics.
This is used by prediction service to assess data quality and adjust confidence.
Not used in training mode.
Args:
historical_sales: Series of sales quantities indexed by date
forecast_date: The date we're forecasting for
Returns:
Dictionary with freshness metrics
"""
if len(historical_sales) == 0:
return {
'days_since_last_sale': 999, # Very large number indicating no data
'historical_data_availability_score': 0.0
}
last_available_date = historical_sales.index.max()
days_since_last_sale = (forecast_date - last_available_date).days
# Calculate data availability score (0-1 scale, 1 being recent data)
max_considered_days = 180 # Consider data older than 6 months as very stale
availability_score = max(0.0, 1.0 - (days_since_last_sale / max_considered_days))
return {
'days_since_last_sale': days_since_last_sale,
'historical_data_availability_score': availability_score
}
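# Worked example: if the last recorded sale is 45 days before the forecast date,
# days_since_last_sale = 45 and
# historical_data_availability_score = max(0.0, 1.0 - 45 / 180) = 0.75; anything
# 180 or more days old scores 0.0.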
def calculate_all_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
reference_date: Optional[datetime] = None,
mode: str = 'training',
date_column: str = 'date'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate all historical features in one call.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training)
reference_date: Reference date for predictions (prediction mode only)
mode: 'training' or 'prediction'
date_column: Name of date column (training mode only)
Returns:
DataFrame with all features (training) or dict of all features (prediction)
"""
if mode == 'training':
df = sales_data.copy()
# Calculate lag features
df = self.calculate_lag_features(df, mode='training')
# Calculate rolling features
df = self.calculate_rolling_features(df, mode='training')
# Calculate trend features
df = self.calculate_trend_features(df, date_column=date_column, mode='training')
logger.info("Calculated all features (training mode)", feature_count=len(self.feature_columns))
return df
else: # prediction mode
if reference_date is None:
raise ValueError("reference_date is required for prediction mode")
features = {}
# Calculate lag features
lag_features = self.calculate_lag_features(sales_data, mode='prediction')
features.update(lag_features)
# Calculate rolling features
rolling_features = self.calculate_rolling_features(sales_data, mode='prediction')
features.update(rolling_features)
# Calculate trend features
trend_features = self.calculate_trend_features(
sales_data,
reference_date=reference_date,
lag_features=lag_features,
rolling_features=rolling_features,
mode='prediction'
)
features.update(trend_features)
# Calculate data freshness metrics
freshness_metrics = self.calculate_data_freshness_metrics(sales_data, reference_date)
features.update(freshness_metrics)
logger.info(f"Calculated all features (prediction mode)", feature_count=len(features))
return features
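# Illustrative usage sketch: a minimal, self-contained demonstration of both modes
# on an assumed toy dataset. The 'date' and 'quantity' column names are the ones
# this module expects; the values themselves are synthetic.
if __name__ == "__main__":
    dates = pd.date_range("2024-01-01", periods=60, freq="D")
    rng = np.random.default_rng(42)
    demo_df = pd.DataFrame({
        "date": dates,
        "quantity": rng.poisson(lam=20, size=len(dates)).astype(float),
    })

    calculator = HistoricalFeatureCalculator()

    # Training mode: returns the DataFrame with lag, rolling and trend columns added.
    train_df = calculator.calculate_all_features(demo_df, mode="training")
    print(train_df.filter(like="lag_").head())

    # Prediction mode: expects a Series indexed by date plus a reference date,
    # and returns a flat dict of feature values (including freshness metrics).
    sales_series = demo_df.set_index("date")["quantity"]
    pred_features = calculator.calculate_all_features(
        sales_series,
        reference_date=datetime(2024, 3, 2),
        mode="prediction",
    )
    print(pred_features)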