improve features

2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions
--- a/shared/ml/enhanced_features.py
+++ b/shared/ml/enhanced_features.py
@@ -0,0 +1,347 @@
+"""
+Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models
+Adds lagged features, rolling statistics, and advanced interactions
+"""
+
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Optional
+import structlog
+from shared.ml.feature_calculator import HistoricalFeatureCalculator
+
+logger = structlog.get_logger()
+
+
+class AdvancedFeatureEngineer:
+    """
+    Advanced feature engineering for hybrid forecasting models.
+    Adds lagged features, rolling statistics, and complex interactions.
+    """
+
+    def __init__(self):
+        """Create the shared feature calculator and an empty feature registry."""
+        # Calculator that the lag / rolling / trend methods delegate to.
+        self.feature_calculator = HistoricalFeatureCalculator()
+        # Names of the feature columns registered by the add_* methods.
+        self.feature_columns = []
+
+    def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
+        """
+        Attach lagged demand columns so the model can see recent history.
+
+        The computation itself is delegated to the shared feature calculator
+        (called with mode='training'); this method only supplies the default
+        lags and records the expected column names.
+
+        Args:
+            df: DataFrame with 'quantity' column
+            lag_days: Lag periods; None selects [1, 7, 14]
+
+        Returns:
+            DataFrame returned by the shared calculator
+        """
+        lag_days = [1, 7, 14] if lag_days is None else lag_days
+
+        df = self.feature_calculator.calculate_lag_features(
+            df, lag_days=lag_days, mode='training'
+        )
+
+        # Register 'lag_<n>_day' for every requested lag, skipping names
+        # that are already tracked.
+        registered = self.feature_columns
+        for period in lag_days:
+            lag_column = f'lag_{period}_day'
+            if lag_column in registered:
+                continue
+            registered.append(lag_column)
+
+        logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
+        return df
+
+    def add_rolling_features(
+        self,
+        df: pd.DataFrame,
+        windows: List[int] = None,
+        features: List[str] = None
+    ) -> pd.DataFrame:
+        """
+        Attach rolling-window statistics.
+
+        The computation is delegated to the shared feature calculator (called
+        with mode='training'); this method supplies defaults and records the
+        expected column names.
+
+        Args:
+            df: DataFrame with 'quantity' column
+            windows: Window sizes; None selects [7, 14, 30]
+            features: Statistics to calculate; None selects
+                ['mean', 'std', 'max', 'min']
+
+        Returns:
+            DataFrame returned by the shared calculator
+        """
+        windows = [7, 14, 30] if windows is None else windows
+        features = ['mean', 'std', 'max', 'min'] if features is None else features
+
+        df = self.feature_calculator.calculate_rolling_features(
+            df, windows=windows, statistics=features, mode='training'
+        )
+
+        # Expected names follow 'rolling_<stat>_<window>d', window-major order.
+        expected = [
+            f'rolling_{stat}_{span}d'
+            for span in windows
+            for stat in features
+        ]
+        for rolling_column in expected:
+            if rolling_column in self.feature_columns:
+                continue
+            self.feature_columns.append(rolling_column)
+
+        logger.info("Added rolling features (using shared calculator)", windows=windows, features=features)
+        return df
+
+    def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
+        """
+        Derive weekday-based columns from a datetime column.
+
+        Adds 'day_of_week' (0=Monday, 6=Sunday) plus the 0/1 flags
+        'is_weekend', 'is_friday' and 'is_monday', and registers all four
+        names in the feature list.
+
+        Args:
+            df: DataFrame with date column
+            date_column: Name of date column
+
+        Returns:
+            Copy of df with the day-of-week columns added
+        """
+        out = df.copy()
+
+        weekday = out[date_column].dt.dayofweek
+        out['day_of_week'] = weekday
+
+        # Saturday/Sunday are 5/6, Friday is 4, Monday is 0.
+        flags = {
+            'is_weekend': weekday >= 5,
+            'is_friday': weekday == 4,
+            'is_monday': weekday == 0,
+        }
+        for flag_name, mask in flags.items():
+            out[flag_name] = mask.astype(int)
+
+        # Track the new names once, in creation order.
+        for name in ('day_of_week', *flags):
+            if name in self.feature_columns:
+                continue
+            self.feature_columns.append(name)
+
+        return out
+
+    def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
+        """
+        Derive calendar-position columns from a datetime column.
+
+        Adds 'month' and 'quarter' (only when not already present),
+        'day_of_month', 'is_month_start' (first three days of the month),
+        'is_month_end', 'week_of_year' (ISO week) and 'is_payday' (15th or
+        last day of the month), and registers the names in the feature list.
+
+        Args:
+            df: DataFrame with date column
+            date_column: Name of date column
+
+        Returns:
+            Copy of df with the calendar columns added
+        """
+        out = df.copy()
+        stamps = out[date_column]
+
+        # Existing 'month' / 'quarter' columns are left untouched.
+        if 'month' not in out.columns:
+            out['month'] = stamps.dt.month
+        if 'quarter' not in out.columns:
+            out['quarter'] = stamps.dt.quarter
+
+        out['day_of_month'] = stamps.dt.day
+
+        # "Month start" is deliberately wider than day 1: days 1-3 count.
+        last_day = stamps.dt.is_month_end
+        out['is_month_start'] = (out['day_of_month'] <= 3).astype(int)
+        out['is_month_end'] = last_day.astype(int)
+
+        out['week_of_year'] = stamps.dt.isocalendar().week
+
+        # Payday indicators (15th and last day of month - high bakery traffic)
+        mid_month = out['day_of_month'] == 15
+        out['is_payday'] = (mid_month | last_day).astype(int)
+
+        calendar_names = (
+            'month', 'quarter', 'day_of_month', 'is_month_start',
+            'is_month_end', 'week_of_year', 'is_payday',
+        )
+        for name in calendar_names:
+            if name in self.feature_columns:
+                continue
+            self.feature_columns.append(name)
+
+        return out
+
+    def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Add interaction features between variables.
+
+        Each interaction is created only when both of its source columns are
+        present in df; missing inputs are skipped silently. Created names are
+        registered in the feature list at most once, so calling this method
+        repeatedly does not produce duplicate entries.
+
+        Args:
+            df: DataFrame with base features
+
+        Returns:
+            Copy of df with the available interaction columns added
+        """
+        df = df.copy()
+        created = []
+
+        def has(*columns):
+            """Return True when every named column exists in df."""
+            return all(column in df.columns for column in columns)
+
+        # Weekend × Temperature (people buy more cold drinks in hot weekends)
+        if has('is_weekend', 'temperature'):
+            df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
+            created.append('weekend_temp_interaction')
+
+        # Rain × Weekend (bad weather reduces weekend traffic)
+        if has('is_weekend', 'precipitation'):
+            df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int)
+            created.append('rain_weekend_interaction')
+
+        # Friday × Traffic (high Friday traffic means weekend prep buying)
+        if has('is_friday', 'traffic_volume'):
+            df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume']
+            created.append('friday_traffic_interaction')
+
+        # Month × Temperature (seasonal temperature patterns)
+        if has('month', 'temperature'):
+            df['month_temp_interaction'] = df['month'] * df['temperature']
+            created.append('month_temp_interaction')
+
+        # Payday × Weekend (big shopping days)
+        if has('is_payday', 'is_weekend'):
+            df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend']
+            created.append('payday_weekend_interaction')
+
+        # Register without duplicates, matching the other add_* methods.
+        for name in created:
+            if name not in self.feature_columns:
+                self.feature_columns.append(name)
+
+        # Report what this call produced, not the cumulative registry size.
+        logger.info(f"Added {len(created)} interaction features")
+        return df
+
+    def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
+        """
+        Attach trend-based features.
+
+        The computation is delegated to the shared feature calculator (called
+        with mode='training'). Only the trend columns that actually appear in
+        the calculator's output are registered in the feature list.
+
+        Args:
+            df: DataFrame with date and quantity
+            date_column: Name of date column (accepted for interface
+                symmetry; not used by this method)
+
+        Returns:
+            DataFrame returned by the shared calculator
+        """
+        df = self.feature_calculator.calculate_trend_features(df, mode='training')
+
+        trend_names = ('days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week')
+        produced = [name for name in trend_names if name in df.columns]
+        for name in produced:
+            if name in self.feature_columns:
+                continue
+            self.feature_columns.append(name)
+
+        logger.debug("Added trend features (using shared calculator)")
+        return df
+
+    def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Add cyclical (sine/cosine) encoding for day_of_week and month.
+
+        Placing each value on a circle helps models understand that Monday
+        follows Sunday and January follows December. An encoding is skipped
+        when its source column is absent. New names are registered in the
+        feature list at most once, so repeated calls do not add duplicates.
+
+        Args:
+            df: DataFrame with day_of_week and/or month columns
+
+        Returns:
+            Copy of df with '<source>_sin' / '<source>_cos' columns added
+        """
+        df = df.copy()
+
+        # (source column, length of one full cycle)
+        for source, period in (('day_of_week', 7), ('month', 12)):
+            if source not in df.columns:
+                continue
+
+            sin_name = f'{source}_sin'
+            cos_name = f'{source}_cos'
+            df[sin_name] = np.sin(2 * np.pi * df[source] / period)
+            df[cos_name] = np.cos(2 * np.pi * df[source] / period)
+
+            # Register without duplicates, matching the other add_* methods.
+            for name in (sin_name, cos_name):
+                if name not in self.feature_columns:
+                    self.feature_columns.append(name)
+
+        logger.info("Added cyclical encoding for temporal features")
+        return df
+
+    def create_all_features(
+        self,
+        df: pd.DataFrame,
+        date_column: str = 'date',
+        include_lags: bool = True,
+        include_rolling: bool = True,
+        include_interactions: bool = True,
+        include_cyclical: bool = True
+    ) -> pd.DataFrame:
+        """
+        Run the full feature pipeline in a fixed order.
+
+        The feature list is cleared first, so after this call it describes
+        only the columns registered during this run.
+
+        Args:
+            df: DataFrame with base data
+            date_column: Name of date column
+            include_lags: Whether to include lagged features
+            include_rolling: Whether to include rolling statistics
+            include_interactions: Whether to include interaction features
+            include_cyclical: Whether to include cyclical encoding
+
+        Returns:
+            DataFrame with all enabled feature groups applied
+        """
+        logger.info("Creating comprehensive feature set for hybrid model")
+
+        # Start from a clean registry for this run.
+        self.feature_columns = []
+
+        # Ordered (enabled, step) pairs. Day-of-week and calendar features
+        # always run; trend features run when lags or rolling stats are on.
+        # NOTE(review): the original comment says trend features depend on
+        # lags AND rolling, yet the gate is an OR - confirm this is intended.
+        pipeline = [
+            (True, lambda frame: self.add_day_of_week_features(frame, date_column)),
+            (True, lambda frame: self.add_calendar_enhanced_features(frame, date_column)),
+            (include_lags, lambda frame: self.add_lagged_features(frame)),
+            (include_rolling, lambda frame: self.add_rolling_features(frame)),
+            (include_interactions, lambda frame: self.add_interaction_features(frame)),
+            (include_cyclical, lambda frame: self.add_cyclical_encoding(frame)),
+            (include_lags or include_rolling,
+             lambda frame: self.add_trend_features(frame, date_column)),
+        ]
+        for enabled, step in pipeline:
+            if enabled:
+                df = step(df)
+
+        logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model")
+
+        return df
+
+    def get_feature_columns(self) -> List[str]:
+        """Return a new list holding the registered feature column names."""
+        # A fresh list keeps callers from mutating the internal registry.
+        return list(self.feature_columns)
+
+    def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
+        """
+        Fill NA values (typically left by lagged and rolling features).
+
+        The fill is applied to every column of df, not only the engineered
+        ones. The input frame is never modified.
+
+        Args:
+            df: DataFrame with potential NA values
+            strategy: One of
+                'forward_backward' - forward fill, then backward fill what is
+                    left at the beginning of the series;
+                'zero' - replace NA with 0;
+                'mean' - replace NA in numeric columns with the column mean
+                    (non-numeric columns are left untouched).
+                Any other value leaves the data unfilled and logs a warning.
+
+        Returns:
+            New DataFrame with NA values filled according to strategy
+        """
+        if strategy == 'forward_backward':
+            # DataFrame.ffill()/bfill() replace the deprecated
+            # fillna(method=...) spelling.
+            return df.ffill().bfill()
+
+        if strategy == 'zero':
+            return df.fillna(0)
+
+        if strategy == 'mean':
+            # numeric_only avoids a TypeError when df also holds string or
+            # other non-numeric columns.
+            return df.fillna(df.mean(numeric_only=True))
+
+        # Unknown strategy: keep the original best-effort behaviour (return an
+        # unfilled copy) but make the misconfiguration visible.
+        logger.warning("Unknown NA fill strategy; returning data unfilled", strategy=strategy)
+        return df.copy()