""" Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models Adds lagged features, rolling statistics, and advanced interactions """ import pandas as pd import numpy as np from typing import Dict, List, Optional import structlog logger = structlog.get_logger() class AdvancedFeatureEngineer: """ Advanced feature engineering for hybrid forecasting models. Adds lagged features, rolling statistics, and complex interactions. """ def __init__(self): self.feature_columns = [] def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame: """ Add lagged demand features for capturing recent trends. Args: df: DataFrame with 'quantity' column lag_days: List of lag periods (default: [1, 7, 14]) Returns: DataFrame with added lagged features """ if lag_days is None: lag_days = [1, 7, 14] df = df.copy() for lag in lag_days: col_name = f'lag_{lag}_day' df[col_name] = df['quantity'].shift(lag) self.feature_columns.append(col_name) logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days) return df def add_rolling_features( self, df: pd.DataFrame, windows: List[int] = None, features: List[str] = None ) -> pd.DataFrame: """ Add rolling statistics (mean, std, max, min). Args: df: DataFrame with 'quantity' column windows: List of window sizes (default: [7, 14, 30]) features: List of statistics to calculate (default: ['mean', 'std', 'max', 'min']) Returns: DataFrame with rolling features """ if windows is None: windows = [7, 14, 30] if features is None: features = ['mean', 'std', 'max', 'min'] df = df.copy() for window in windows: for feature in features: col_name = f'rolling_{feature}_{window}d' if feature == 'mean': df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean() elif feature == 'std': df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std() elif feature == 'max': df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max() elif feature == 'min': df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min() self.feature_columns.append(col_name) logger.info(f"Added rolling features", windows=windows, features=features) return df def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame: """ Add enhanced day-of-week features. Args: df: DataFrame with date column date_column: Name of date column Returns: DataFrame with day-of-week features """ df = df.copy() # Day of week (0=Monday, 6=Sunday) df['day_of_week'] = df[date_column].dt.dayofweek # Is weekend df['is_weekend'] = (df['day_of_week'] >= 5).astype(int) # Is Friday (often higher demand due to weekend prep) df['is_friday'] = (df['day_of_week'] == 4).astype(int) # Is Monday (often lower demand after weekend) df['is_monday'] = (df['day_of_week'] == 0).astype(int) # Add to feature list for col in ['day_of_week', 'is_weekend', 'is_friday', 'is_monday']: if col not in self.feature_columns: self.feature_columns.append(col) return df def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame: """ Add enhanced calendar features beyond basic temporal features. Args: df: DataFrame with date column date_column: Name of date column Returns: DataFrame with enhanced calendar features """ df = df.copy() # Month and quarter (if not already present) if 'month' not in df.columns: df['month'] = df[date_column].dt.month if 'quarter' not in df.columns: df['quarter'] = df[date_column].dt.quarter # Day of month df['day_of_month'] = df[date_column].dt.day # Is month start/end df['is_month_start'] = (df['day_of_month'] <= 3).astype(int) df['is_month_end'] = (df[date_column].dt.is_month_end).astype(int) # Week of year df['week_of_year'] = df[date_column].dt.isocalendar().week # Payday indicators (15th and last day of month - high bakery traffic) df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int) # Add to feature list for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end', 'week_of_year', 'is_payday']: if col not in self.feature_columns: self.feature_columns.append(col) return df def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame: """ Add interaction features between variables. Args: df: DataFrame with base features Returns: DataFrame with interaction features """ df = df.copy() # Weekend × Temperature (people buy more cold drinks in hot weekends) if 'is_weekend' in df.columns and 'temperature' in df.columns: df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature'] self.feature_columns.append('weekend_temp_interaction') # Rain × Weekend (bad weather reduces weekend traffic) if 'is_weekend' in df.columns and 'precipitation' in df.columns: df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int) self.feature_columns.append('rain_weekend_interaction') # Friday × Traffic (high Friday traffic means weekend prep buying) if 'is_friday' in df.columns and 'traffic_volume' in df.columns: df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume'] self.feature_columns.append('friday_traffic_interaction') # Month × Temperature (seasonal temperature patterns) if 'month' in df.columns and 'temperature' in df.columns: df['month_temp_interaction'] = df['month'] * df['temperature'] self.feature_columns.append('month_temp_interaction') # Payday × Weekend (big shopping days) if 'is_payday' in df.columns and 'is_weekend' in df.columns: df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend'] self.feature_columns.append('payday_weekend_interaction') logger.info(f"Added {len([c for c in self.feature_columns if 'interaction' in c])} interaction features") return df def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame: """ Add trend-based features. Args: df: DataFrame with date and quantity date_column: Name of date column Returns: DataFrame with trend features """ df = df.copy() # Days since start (linear trend proxy) df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days # Momentum indicators (recent change vs. older change) if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns: df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day'] self.feature_columns.append('momentum_1_7') if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns: df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d'] self.feature_columns.append('trend_7_30') # Velocity (rate of change) if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns: df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7 self.feature_columns.append('velocity_week') self.feature_columns.append('days_since_start') return df def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame: """ Add cyclical encoding for periodic features (day_of_week, month). Helps models understand that Monday follows Sunday, December follows January. Args: df: DataFrame with day_of_week and month columns Returns: DataFrame with cyclical features """ df = df.copy() # Day of week cyclical encoding if 'day_of_week' in df.columns: df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7) df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7) self.feature_columns.extend(['day_of_week_sin', 'day_of_week_cos']) # Month cyclical encoding if 'month' in df.columns: df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12) df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12) self.feature_columns.extend(['month_sin', 'month_cos']) logger.info("Added cyclical encoding for temporal features") return df def create_all_features( self, df: pd.DataFrame, date_column: str = 'date', include_lags: bool = True, include_rolling: bool = True, include_interactions: bool = True, include_cyclical: bool = True ) -> pd.DataFrame: """ Create all enhanced features in one go. Args: df: DataFrame with base data date_column: Name of date column include_lags: Whether to include lagged features include_rolling: Whether to include rolling statistics include_interactions: Whether to include interaction features include_cyclical: Whether to include cyclical encoding Returns: DataFrame with all enhanced features """ logger.info("Creating comprehensive feature set for hybrid model") # Reset feature list self.feature_columns = [] # Day of week and calendar features (always needed) df = self.add_day_of_week_features(df, date_column) df = self.add_calendar_enhanced_features(df, date_column) # Optional features if include_lags: df = self.add_lagged_features(df) if include_rolling: df = self.add_rolling_features(df) if include_interactions: df = self.add_interaction_features(df) if include_cyclical: df = self.add_cyclical_encoding(df) # Trend features (depends on lags and rolling) if include_lags or include_rolling: df = self.add_trend_features(df, date_column) logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model") return df def get_feature_columns(self) -> List[str]: """Get list of all created feature column names.""" return self.feature_columns.copy() def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame: """ Fill NA values in lagged and rolling features. Args: df: DataFrame with potential NA values strategy: 'forward_backward', 'zero', 'mean' Returns: DataFrame with filled NA values """ df = df.copy() if strategy == 'forward_backward': # Forward fill first (use previous values) df = df.fillna(method='ffill') # Backward fill remaining (beginning of series) df = df.fillna(method='bfill') elif strategy == 'zero': df = df.fillna(0) elif strategy == 'mean': df = df.fillna(df.mean()) return df