"""
Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models
Adds lagged features, rolling statistics, and advanced interactions
"""
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
import structlog

# Module-level structured logger used by the feature-engineering code below.
logger = structlog.get_logger()
class AdvancedFeatureEngineer:
    """
    Advanced feature engineering for hybrid forecasting models.

    Adds lagged features, rolling statistics, calendar/cyclical encodings,
    trend indicators and interaction terms to a DataFrame, and records the
    name of every feature column it creates in ``feature_columns``.

    Every ``add_*`` method works on a copy and returns it; the caller's
    DataFrame is never modified in place.

    NOTE(review): lag and rolling features are computed positionally on the
    'quantity' column (``shift`` / ``rolling``), so this presumably expects a
    single series sorted chronologically with one row per day - confirm
    against callers.
    """

    # Rolling statistics that add_rolling_features knows how to compute.
    _ROLLING_STATS = ('mean', 'std', 'max', 'min')

    def __init__(self):
        # Ordered names of the feature columns created so far (no duplicates).
        self.feature_columns: List[str] = []

    def _register(self, *columns: str) -> None:
        """Record created feature names, keeping order and skipping duplicates."""
        for column in columns:
            if column not in self.feature_columns:
                self.feature_columns.append(column)

    def add_lagged_features(
        self,
        df: pd.DataFrame,
        lag_days: Optional[List[int]] = None
    ) -> pd.DataFrame:
        """
        Add lagged demand features for capturing recent trends.

        Each lag ``n`` produces a column ``lag_{n}_day`` holding the
        'quantity' value from ``n`` rows earlier; the first ``n`` rows are NaN.

        Args:
            df: DataFrame with 'quantity' column
            lag_days: List of lag periods (default: [1, 7, 14])

        Returns:
            Copy of the DataFrame with added lagged features
        """
        if lag_days is None:
            lag_days = [1, 7, 14]

        df = df.copy()

        for lag in lag_days:
            col_name = f'lag_{lag}_day'
            df[col_name] = df['quantity'].shift(lag)
            self._register(col_name)

        logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
        return df

    def add_rolling_features(
        self,
        df: pd.DataFrame,
        windows: Optional[List[int]] = None,
        features: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Add rolling statistics (mean, std, max, min).

        Each (statistic, window) pair produces a column
        ``rolling_{statistic}_{window}d``. A value is emitted once at least
        ``max(1, window // 2)`` observations are available in the window.
        Unsupported statistic names are skipped with a warning rather than
        being registered as features that do not exist.

        NOTE(review): the window includes the current row's 'quantity'. If
        'quantity' is also the prediction target this leaks the target into
        the features - consider rolling over a shifted series; confirm with
        the training code before changing.

        Args:
            df: DataFrame with 'quantity' column
            windows: List of window sizes (default: [7, 14, 30])
            features: List of statistics to calculate
                (default: ['mean', 'std', 'max', 'min'])

        Returns:
            Copy of the DataFrame with rolling features
        """
        if windows is None:
            windows = [7, 14, 30]

        if features is None:
            features = ['mean', 'std', 'max', 'min']

        df = df.copy()

        unsupported = [name for name in features if name not in self._ROLLING_STATS]
        if unsupported:
            logger.warning("Skipping unsupported rolling statistics", unsupported=unsupported)
        stats = [name for name in features if name in self._ROLLING_STATS]

        for window in windows:
            # One rolling view per window, shared by all statistics.
            rolling = df['quantity'].rolling(window=window, min_periods=max(1, window // 2))
            for stat in stats:
                col_name = f'rolling_{stat}_{window}d'
                df[col_name] = getattr(rolling, stat)()
                self._register(col_name)

        logger.info("Added rolling features", windows=windows, features=features)
        return df

    def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
        """
        Add enhanced day-of-week features.

        Creates 'day_of_week' (0=Monday, 6=Sunday) plus the 0/1 flags
        'is_weekend', 'is_friday' and 'is_monday'.

        Args:
            df: DataFrame with a datetime-typed date column
            date_column: Name of date column

        Returns:
            Copy of the DataFrame with day-of-week features
        """
        df = df.copy()

        # Day of week (0=Monday, 6=Sunday)
        df['day_of_week'] = df[date_column].dt.dayofweek

        # Is weekend
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

        # Is Friday (often higher demand due to weekend prep)
        df['is_friday'] = (df['day_of_week'] == 4).astype(int)

        # Is Monday (often lower demand after weekend)
        df['is_monday'] = (df['day_of_week'] == 0).astype(int)

        self._register('day_of_week', 'is_weekend', 'is_friday', 'is_monday')

        return df

    def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
        """
        Add enhanced calendar features beyond basic temporal features.

        Creates 'day_of_month', 'is_month_start', 'is_month_end',
        'week_of_year' and 'is_payday'; 'month' and 'quarter' are only
        computed when not already present, but are registered as features
        either way.

        Args:
            df: DataFrame with a datetime-typed date column
            date_column: Name of date column

        Returns:
            Copy of the DataFrame with enhanced calendar features
        """
        df = df.copy()
        dates = df[date_column].dt

        # Month and quarter (if not already present)
        if 'month' not in df.columns:
            df['month'] = dates.month

        if 'quarter' not in df.columns:
            df['quarter'] = dates.quarter

        # Day of month
        df['day_of_month'] = dates.day

        # Month start is deliberately a window (first three days), whereas
        # month end is the exact last calendar day.
        is_month_end = dates.is_month_end
        df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
        df['is_month_end'] = is_month_end.astype(int)

        # ISO 8601 week number
        df['week_of_year'] = dates.isocalendar().week

        # Payday indicators (15th and last day of month - high bakery traffic)
        df['is_payday'] = ((df['day_of_month'] == 15) | is_month_end).astype(int)

        self._register(
            'month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
            'week_of_year', 'is_payday'
        )

        return df

    def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add interaction features between variables.

        Each interaction is created only when both of its input columns are
        present, so missing weather/traffic data simply yields fewer features.

        Args:
            df: DataFrame with base features

        Returns:
            Copy of the DataFrame with interaction features
        """
        df = df.copy()
        columns = df.columns
        added: List[str] = []

        # Weekend x Temperature (people buy more cold drinks in hot weekends)
        if 'is_weekend' in columns and 'temperature' in columns:
            df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
            added.append('weekend_temp_interaction')

        # Rain x Weekend (bad weather reduces weekend traffic)
        if 'is_weekend' in columns and 'precipitation' in columns:
            df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int)
            added.append('rain_weekend_interaction')

        # Friday x Traffic (high Friday traffic means weekend prep buying)
        if 'is_friday' in columns and 'traffic_volume' in columns:
            df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume']
            added.append('friday_traffic_interaction')

        # Month x Temperature (seasonal temperature patterns)
        if 'month' in columns and 'temperature' in columns:
            df['month_temp_interaction'] = df['month'] * df['temperature']
            added.append('month_temp_interaction')

        # Payday x Weekend (big shopping days)
        if 'is_payday' in columns and 'is_weekend' in columns:
            df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend']
            added.append('payday_weekend_interaction')

        self._register(*added)

        # Report what this call created, not everything registered so far.
        logger.info(f"Added {len(added)} interaction features")
        return df

    def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
        """
        Add trend-based features.

        Always creates 'days_since_start'. 'momentum_1_7' and 'velocity_week'
        require 'lag_1_day' and 'lag_7_day'; 'trend_7_30' requires
        'rolling_mean_7d' and 'rolling_mean_30d'. Features whose inputs are
        missing are silently omitted.

        Args:
            df: DataFrame with date column and, optionally, lag/rolling columns
            date_column: Name of date column

        Returns:
            Copy of the DataFrame with trend features
        """
        df = df.copy()

        # Days since start (linear trend proxy)
        df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days

        has_lags = 'lag_1_day' in df.columns and 'lag_7_day' in df.columns
        has_rolling_means = 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns

        # Momentum indicators (recent change vs. older change)
        if has_lags:
            df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
            self._register('momentum_1_7')

        if has_rolling_means:
            df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
            self._register('trend_7_30')

        # Velocity (rate of change); kept after trend_7_30 so the order of
        # columns and registered features stays stable for downstream models.
        if has_lags:
            df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
            self._register('velocity_week')

        self._register('days_since_start')

        return df

    def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add cyclical encoding for periodic features (day_of_week, month).

        Maps each periodic value to a sine/cosine pair so that the ends of the
        cycle are adjacent: Monday follows Sunday, January follows December.

        Args:
            df: DataFrame with day_of_week and/or month columns

        Returns:
            Copy of the DataFrame with cyclical features
        """
        df = df.copy()

        # Day of week cyclical encoding (period 7)
        if 'day_of_week' in df.columns:
            angle = 2 * np.pi * df['day_of_week'] / 7
            df['day_of_week_sin'] = np.sin(angle)
            df['day_of_week_cos'] = np.cos(angle)
            self._register('day_of_week_sin', 'day_of_week_cos')

        # Month cyclical encoding (period 12)
        if 'month' in df.columns:
            angle = 2 * np.pi * df['month'] / 12
            df['month_sin'] = np.sin(angle)
            df['month_cos'] = np.cos(angle)
            self._register('month_sin', 'month_cos')

        logger.info("Added cyclical encoding for temporal features")
        return df

    def create_all_features(
        self,
        df: pd.DataFrame,
        date_column: str = 'date',
        include_lags: bool = True,
        include_rolling: bool = True,
        include_interactions: bool = True,
        include_cyclical: bool = True
    ) -> pd.DataFrame:
        """
        Create all enhanced features in one go.

        Resets ``feature_columns`` first, so after this call the list
        describes exactly the features produced by this run.

        Args:
            df: DataFrame with base data
            date_column: Name of date column
            include_lags: Whether to include lagged features
            include_rolling: Whether to include rolling statistics
            include_interactions: Whether to include interaction features
            include_cyclical: Whether to include cyclical encoding

        Returns:
            DataFrame with all enhanced features
        """
        logger.info("Creating comprehensive feature set for hybrid model")

        # Reset feature list
        self.feature_columns = []

        # Day of week and calendar features (always needed)
        df = self.add_day_of_week_features(df, date_column)
        df = self.add_calendar_enhanced_features(df, date_column)

        # Optional features
        if include_lags:
            df = self.add_lagged_features(df)

        if include_rolling:
            df = self.add_rolling_features(df)

        if include_interactions:
            df = self.add_interaction_features(df)

        if include_cyclical:
            df = self.add_cyclical_encoding(df)

        # Trend features (depends on lags and rolling)
        if include_lags or include_rolling:
            df = self.add_trend_features(df, date_column)

        logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model")

        return df

    def get_feature_columns(self) -> List[str]:
        """Get list of all created feature column names (a copy, safe to mutate)."""
        return self.feature_columns.copy()

    def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
        """
        Fill NA values, e.g. those introduced by lagged and rolling features.

        The fill is applied to every column of the DataFrame, not only to the
        generated feature columns.

        NOTE(review): the backward-fill step of 'forward_backward' copies
        later values into earlier rows, so leading NaNs of lag features end
        up holding future observations - confirm this is acceptable for
        training.

        Args:
            df: DataFrame with potential NA values
            strategy: 'forward_backward' (forward fill, then backward fill the
                remaining leading gaps), 'zero', or 'mean' (per-column mean of
                numeric columns; non-numeric columns are left untouched)

        Returns:
            Copy of the DataFrame with filled NA values

        Raises:
            ValueError: If ``strategy`` is not one of the supported names.
        """
        df = df.copy()

        if strategy == 'forward_backward':
            # Forward fill first (use previous values), then backward fill
            # whatever remains at the beginning of the series.
            # ffill()/bfill() replace the deprecated fillna(method=...).
            return df.ffill().bfill()

        if strategy == 'zero':
            return df.fillna(0)

        if strategy == 'mean':
            # numeric_only avoids a TypeError on string/object columns.
            return df.fillna(df.mean(numeric_only=True))

        raise ValueError(
            f"Unknown fill strategy {strategy!r}; "
            "expected 'forward_backward', 'zero' or 'mean'"
        )