improve features
0
shared/ml/__init__.py
Normal file
400
shared/ml/data_processor.py
Normal file
@@ -0,0 +1,400 @@
"""
Shared Data Processor for Bakery Forecasting

Provides feature engineering capabilities for both training and prediction.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional
from datetime import datetime
import structlog
import holidays

from shared.ml.enhanced_features import AdvancedFeatureEngineer

logger = structlog.get_logger()

||||
class EnhancedBakeryDataProcessor:
    """
    Shared data processor for bakery forecasting.
    Focuses on prediction feature preparation without training-specific dependencies.
    """

    def __init__(self, region: str = 'MD'):
        """
        Initialize the data processor.

        Args:
            region: Spanish region code for holidays (MD=Madrid, PV=Basque, etc.)
        """
        self.scalers = {}
        self.feature_engineer = AdvancedFeatureEngineer()
        self.region = region
        self.spain_holidays = holidays.Spain(prov=region)

    def get_scalers(self) -> Dict[str, Any]:
        """Return the scalers/normalization parameters for use during prediction."""
        return self.scalers.copy()

    @staticmethod
    def _extract_numeric_from_dict(value: Any) -> Optional[float]:
        """
        Robust extraction of numeric values from complex data structures.
        """
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return float(value)

        if isinstance(value, dict):
            for key in ['value', 'data', 'result', 'amount', 'count', 'number', 'val']:
                if key in value:
                    extracted = value[key]
                    if isinstance(extracted, dict):
                        return EnhancedBakeryDataProcessor._extract_numeric_from_dict(extracted)
                    elif isinstance(extracted, (int, float)) and not isinstance(extracted, bool):
                        return float(extracted)

            for v in value.values():
                if isinstance(v, (int, float)) and not isinstance(v, bool):
                    return float(v)
                elif isinstance(v, dict):
                    result = EnhancedBakeryDataProcessor._extract_numeric_from_dict(v)
                    if result is not None:
                        return result

        if isinstance(value, str):
            try:
                return float(value)
            except (ValueError, TypeError):
                pass

        return None

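    # Illustrative behaviour (editor's sketch, not part of the commit): the
    # key cascade unwraps nested payloads, booleans are rejected, and strings
    # are only parsed at the top level, not inside dicts.
    #
    #   >>> EnhancedBakeryDataProcessor._extract_numeric_from_dict({'result': {'value': 42}})
    #   42.0
    #   >>> EnhancedBakeryDataProcessor._extract_numeric_from_dict('3.5')
    #   3.5
    #   >>> EnhancedBakeryDataProcessor._extract_numeric_from_dict(True) is None
    #   True
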
    async def prepare_prediction_features(self,
                                          future_dates: pd.DatetimeIndex,
                                          weather_forecast: pd.DataFrame = None,
                                          traffic_forecast: pd.DataFrame = None,
                                          poi_features: Dict[str, Any] = None,
                                          historical_data: pd.DataFrame = None) -> pd.DataFrame:
        """
        Create features for future predictions.

        Args:
            future_dates: Future dates to predict
            weather_forecast: Weather forecast data
            traffic_forecast: Traffic forecast data (optional, not commonly forecasted)
            poi_features: POI features (location-based, static)
            historical_data: Historical data for creating lagged and rolling features

        Returns:
            DataFrame with features for prediction
        """
        try:
            # Create base future dataframe
            future_df = pd.DataFrame({'ds': future_dates})

            # Add temporal features
            future_df = self._add_temporal_features(
                future_df.rename(columns={'ds': 'date'})
            ).rename(columns={'date': 'ds'})

            # Add weather features
            if weather_forecast is not None and not weather_forecast.empty:
                weather_features = weather_forecast.copy()
                if 'date' in weather_features.columns:
                    weather_features = weather_features.rename(columns={'date': 'ds'})

                future_df = future_df.merge(weather_features, on='ds', how='left')

            # Add traffic features
            if traffic_forecast is not None and not traffic_forecast.empty:
                traffic_features = traffic_forecast.copy()
                if 'date' in traffic_features.columns:
                    traffic_features = traffic_features.rename(columns={'date': 'ds'})

                future_df = future_df.merge(traffic_features, on='ds', how='left')

            # Engineer basic features
            future_df = self._engineer_features(future_df.rename(columns={'ds': 'date'}))

            # Add advanced features if historical data is provided
            if historical_data is not None and not historical_data.empty:
                combined_df = pd.concat([
                    historical_data.rename(columns={'ds': 'date'}),
                    future_df
                ], ignore_index=True).sort_values('date')

                combined_df = self._add_advanced_features(combined_df)
                future_df = combined_df[combined_df['date'].isin(future_df['date'])].copy()
            else:
                logger.warning("No historical data provided, lagged features will be NaN")
                future_df = self._add_advanced_features(future_df)

            # Add POI features (static, location-based)
            if poi_features:
                future_df = self._add_poi_features(future_df, poi_features)

            future_df = future_df.rename(columns={'date': 'ds'})

            # Handle missing values
            future_df = self._handle_missing_values_future(future_df)

            return future_df

        except Exception as e:
            logger.error("Error creating prediction features", error=str(e))
            return pd.DataFrame({'ds': future_dates})

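    # Minimal invocation sketch (editor's illustration; weather_df and
    # sales_df are hypothetical names, and the method is async, so it needs
    # an event loop):
    #
    #   processor = EnhancedBakeryDataProcessor(region='MD')
    #   future = pd.date_range('2025-01-01', periods=7, freq='D')
    #   features = asyncio.run(processor.prepare_prediction_features(
    #       future_dates=future,
    #       weather_forecast=weather_df,   # columns: date, temperature, precipitation
    #       historical_data=sales_df,      # columns: date, quantity
    #   ))
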
    def _add_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add comprehensive temporal features"""
        df = df.copy()

        if 'date' not in df.columns:
            raise ValueError("DataFrame must have a 'date' column")

        df['date'] = pd.to_datetime(df['date'])

        # Basic temporal features
        df['day_of_week'] = df['date'].dt.dayofweek
        df['day_of_month'] = df['date'].dt.day
        df['month'] = df['date'].dt.month
        df['quarter'] = df['date'].dt.quarter
        df['week_of_year'] = df['date'].dt.isocalendar().week

        # Bakery-specific features
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        df['is_monday'] = (df['day_of_week'] == 0).astype(int)
        df['is_friday'] = (df['day_of_week'] == 4).astype(int)

        # Season mapping
        df['season'] = df['month'].apply(self._get_season)
        df['is_summer'] = (df['season'] == 3).astype(int)
        df['is_winter'] = (df['season'] == 1).astype(int)

        # Holiday indicators
        df['is_holiday'] = df['date'].apply(self._is_spanish_holiday).astype(int)
        df['is_school_holiday'] = df['date'].apply(self._is_school_holiday).astype(int)
        df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
        df['is_month_end'] = (df['day_of_month'] >= 28).astype(int)

        # Payday patterns
        df['is_payday_period'] = ((df['day_of_month'] <= 5) | (df['day_of_month'] >= 25)).astype(int)

        return df

    def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Engineer additional features"""
        df = df.copy()

        # Weather-based features
        if 'temperature' in df.columns:
            df['temperature'] = pd.to_numeric(df['temperature'], errors='coerce').fillna(15.0)
            df['temp_squared'] = df['temperature'] ** 2
            df['is_hot_day'] = (df['temperature'] > 25).astype(int)
            df['is_cold_day'] = (df['temperature'] < 10).astype(int)
            df['is_pleasant_day'] = ((df['temperature'] >= 18) & (df['temperature'] <= 25)).astype(int)
            df['temp_category'] = pd.cut(df['temperature'],
                                         bins=[-np.inf, 5, 15, 25, np.inf],
                                         labels=[0, 1, 2, 3]).astype(int)

        if 'precipitation' in df.columns:
            df['precipitation'] = pd.to_numeric(df['precipitation'], errors='coerce').fillna(0.0)
            df['is_rainy_day'] = (df['precipitation'] > 0.1).astype(int)
            df['is_heavy_rain'] = (df['precipitation'] > 10).astype(int)
            df['rain_intensity'] = pd.cut(df['precipitation'],
                                          bins=[-0.1, 0, 2, 10, np.inf],
                                          labels=[0, 1, 2, 3]).astype(int)

        # Traffic-based features
        if 'traffic_volume' in df.columns:
            df['traffic_volume'] = pd.to_numeric(df['traffic_volume'], errors='coerce').fillna(100.0)
            q75 = df['traffic_volume'].quantile(0.75)
            q25 = df['traffic_volume'].quantile(0.25)
            df['high_traffic'] = (df['traffic_volume'] > q75).astype(int)
            df['low_traffic'] = (df['traffic_volume'] < q25).astype(int)

            traffic_std = df['traffic_volume'].std()
            traffic_mean = df['traffic_volume'].mean()

            if traffic_std > 0 and not pd.isna(traffic_std):
                df['traffic_normalized'] = (df['traffic_volume'] - traffic_mean) / traffic_std
                self.scalers['traffic_mean'] = float(traffic_mean)
                self.scalers['traffic_std'] = float(traffic_std)
            else:
                df['traffic_normalized'] = 0.0
                self.scalers['traffic_mean'] = 100.0
                self.scalers['traffic_std'] = 50.0

            df['traffic_normalized'] = df['traffic_normalized'].fillna(0.0)

        # Interaction features
        if 'is_weekend' in df.columns and 'temperature' in df.columns:
            df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
            df['weekend_pleasant_weather'] = df['is_weekend'] * df.get('is_pleasant_day', 0)

        if 'is_rainy_day' in df.columns and 'traffic_volume' in df.columns:
            df['rain_traffic_interaction'] = df['is_rainy_day'] * df['traffic_volume']

        if 'is_holiday' in df.columns and 'temperature' in df.columns:
            df['holiday_temp_interaction'] = df['is_holiday'] * df['temperature']

        if 'season' in df.columns and 'temperature' in df.columns:
            df['season_temp_interaction'] = df['season'] * df['temperature']

        # Day-of-week specific features
        if 'day_of_week' in df.columns:
            df['is_working_day'] = (~df['day_of_week'].isin([5, 6])).astype(int)
            df['is_peak_bakery_day'] = df['day_of_week'].isin([4, 5, 6]).astype(int)

        # Month-specific features
        if 'month' in df.columns:
            df['is_high_demand_month'] = df['month'].isin([6, 7, 8, 12]).astype(int)
            df['is_warm_season'] = df['month'].isin([4, 5, 6, 7, 8, 9]).astype(int)

        # Special day: Payday
        if 'is_payday_period' in df.columns:
            df['is_payday'] = df['is_payday_period']

        return df

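    # Worked example of the temperature binning above (editor's sketch): with
    # right-inclusive bins (-inf, 5], (5, 15], (15, 25], (25, inf), exactly
    # 25 deg C still lands in category 2.
    #
    #   >>> pd.cut(pd.Series([3.0, 10.0, 20.0, 25.0, 30.0]),
    #   ...        bins=[-np.inf, 5, 15, 25, np.inf], labels=[0, 1, 2, 3]).tolist()
    #   [0, 1, 2, 2, 3]
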
    def _add_advanced_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add advanced features using AdvancedFeatureEngineer"""
        df = df.copy()

        logger.info("Adding advanced features (lagged, rolling, cyclical, trends)",
                    input_rows=len(df),
                    input_columns=len(df.columns))

        self.feature_engineer = AdvancedFeatureEngineer()

        df = self.feature_engineer.create_all_features(
            df,
            date_column='date',
            include_lags=True,
            include_rolling=True,
            include_interactions=True,
            include_cyclical=True
        )

        df = self.feature_engineer.fill_na_values(df, strategy='forward_backward')

        created_features = self.feature_engineer.get_feature_columns()
        logger.info(f"Added {len(created_features)} advanced features")

        return df

    def _add_poi_features(self, df: pd.DataFrame, poi_features: Dict[str, Any]) -> pd.DataFrame:
        """Add POI features (static, location-based)"""
        if not poi_features:
            logger.warning("No POI features to add")
            return df

        logger.info(f"Adding {len(poi_features)} POI features to dataframe")

        for feature_name, feature_value in poi_features.items():
            if isinstance(feature_value, bool):
                feature_value = 1 if feature_value else 0
            df[feature_name] = feature_value

        return df

    def _handle_missing_values_future(self, df: pd.DataFrame) -> pd.DataFrame:
        """Handle missing values in future prediction data"""
        numeric_columns = df.select_dtypes(include=[np.number]).columns

        madrid_defaults = {
            'temperature': 15.0,
            'precipitation': 0.0,
            'humidity': 60.0,
            'wind_speed': 5.0,
            'traffic_volume': 100.0,
            'pedestrian_count': 50.0,
            'pressure': 1013.0
        }

        for col in numeric_columns:
            if df[col].isna().any():
                default_value = 0
                for key, value in madrid_defaults.items():
                    if key in col.lower():
                        default_value = value
                        break

                df[col] = df[col].fillna(default_value)

        return df

    def _get_season(self, month: int) -> int:
        """Get season from month (1=Winter, 2=Spring, 3=Summer, 4=Autumn)"""
        if month in [12, 1, 2]:
            return 1  # Winter
        elif month in [3, 4, 5]:
            return 2  # Spring
        elif month in [6, 7, 8]:
            return 3  # Summer
        else:
            return 4  # Autumn

    def _is_spanish_holiday(self, date: datetime) -> bool:
        """Check if a date is a Spanish holiday"""
        try:
            # pd.Timestamp subclasses datetime, so one check covers both
            if isinstance(date, datetime):
                date = date.date()

            return date in self.spain_holidays
        except Exception as e:
            logger.warning(f"Error checking holiday status for {date}: {e}")
            # Fallback: fixed-date national holidays
            month_day = (date.month, date.day)
            basic_holidays = [
                (1, 1), (1, 6), (5, 1), (8, 15), (10, 12),
                (11, 1), (12, 6), (12, 8), (12, 25)
            ]
            return month_day in basic_holidays

    def _is_school_holiday(self, date: datetime) -> bool:
        """Check if a date is during school holidays in Spain"""
        try:
            from datetime import timedelta
            import holidays as hol

            # pd.Timestamp subclasses datetime, so one check covers both
            if isinstance(date, datetime):
                check_date = date.date()
            else:
                check_date = date

            month = check_date.month
            day = check_date.day

            # Summer holidays (July 1 - August 31)
            if month in [7, 8]:
                return True

            # Christmas holidays (December 23 - January 7)
            if (month == 12 and day >= 23) or (month == 1 and day <= 7):
                return True

            # Easter/Spring break (Semana Santa)
            year = check_date.year
            spain_hol = hol.Spain(years=year, prov=self.region)

            for holiday_date, holiday_name in spain_hol.items():
                if 'viernes santo' in holiday_name.lower() or 'easter' in holiday_name.lower():
                    easter_start = holiday_date - timedelta(days=7)
                    easter_end = holiday_date + timedelta(days=7)
                    if easter_start <= check_date <= easter_end:
                        return True

            return False

        except Exception as e:
            logger.warning(f"Error checking school holiday for {date}: {e}")
            # Fallback heuristic: summer, Christmas, and early-April break
            month = date.month
            day = date.day
            return (month in [7, 8] or
                    (month == 12 and day >= 23) or
                    (month == 1 and day <= 7) or
                    (month == 4 and 1 <= day <= 15))
347
shared/ml/enhanced_features.py
Normal file
@@ -0,0 +1,347 @@
"""
Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models
Adds lagged features, rolling statistics, and advanced interactions
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
from shared.ml.feature_calculator import HistoricalFeatureCalculator

logger = structlog.get_logger()

class AdvancedFeatureEngineer:
    """
    Advanced feature engineering for hybrid forecasting models.
    Adds lagged features, rolling statistics, and complex interactions.
    """

    def __init__(self):
        self.feature_columns = []
        self.feature_calculator = HistoricalFeatureCalculator()

    def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
        """
        Add lagged demand features for capturing recent trends.
        Uses shared feature calculator for consistency with prediction service.

        Args:
            df: DataFrame with 'quantity' column
            lag_days: List of lag periods (default: [1, 7, 14])

        Returns:
            DataFrame with added lagged features
        """
        if lag_days is None:
            lag_days = [1, 7, 14]

        # Use shared calculator for consistent lag calculation
        df = self.feature_calculator.calculate_lag_features(
            df,
            lag_days=lag_days,
            mode='training'
        )

        # Update feature columns list
        for lag in lag_days:
            col_name = f'lag_{lag}_day'
            if col_name not in self.feature_columns:
                self.feature_columns.append(col_name)

        logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
        return df

    def add_rolling_features(
        self,
        df: pd.DataFrame,
        windows: List[int] = None,
        features: List[str] = None
    ) -> pd.DataFrame:
        """
        Add rolling statistics (mean, std, max, min).
        Uses shared feature calculator for consistency with prediction service.

        Args:
            df: DataFrame with 'quantity' column
            windows: List of window sizes (default: [7, 14, 30])
            features: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])

        Returns:
            DataFrame with rolling features
        """
        if windows is None:
            windows = [7, 14, 30]

        if features is None:
            features = ['mean', 'std', 'max', 'min']

        # Use shared calculator for consistent rolling calculation
        df = self.feature_calculator.calculate_rolling_features(
            df,
            windows=windows,
            statistics=features,
            mode='training'
        )

        # Update feature columns list
        for window in windows:
            for feature in features:
                col_name = f'rolling_{feature}_{window}d'
                if col_name not in self.feature_columns:
                    self.feature_columns.append(col_name)

        logger.info("Added rolling features (using shared calculator)", windows=windows, features=features)
        return df

    def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
        """
        Add enhanced day-of-week features.

        Args:
            df: DataFrame with date column
            date_column: Name of date column

        Returns:
            DataFrame with day-of-week features
        """
        df = df.copy()

        # Day of week (0=Monday, 6=Sunday)
        df['day_of_week'] = df[date_column].dt.dayofweek

        # Is weekend
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

        # Is Friday (often higher demand due to weekend prep)
        df['is_friday'] = (df['day_of_week'] == 4).astype(int)

        # Is Monday (often lower demand after weekend)
        df['is_monday'] = (df['day_of_week'] == 0).astype(int)

        # Add to feature list
        for col in ['day_of_week', 'is_weekend', 'is_friday', 'is_monday']:
            if col not in self.feature_columns:
                self.feature_columns.append(col)

        return df

    def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
        """
        Add enhanced calendar features beyond basic temporal features.

        Args:
            df: DataFrame with date column
            date_column: Name of date column

        Returns:
            DataFrame with enhanced calendar features
        """
        df = df.copy()

        # Month and quarter (if not already present)
        if 'month' not in df.columns:
            df['month'] = df[date_column].dt.month

        if 'quarter' not in df.columns:
            df['quarter'] = df[date_column].dt.quarter

        # Day of month
        df['day_of_month'] = df[date_column].dt.day

        # Is month start/end
        df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
        df['is_month_end'] = (df[date_column].dt.is_month_end).astype(int)

        # Week of year
        df['week_of_year'] = df[date_column].dt.isocalendar().week

        # Payday indicators (15th and last day of month - high bakery traffic)
        df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)

        # Add to feature list
        for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
                    'week_of_year', 'is_payday']:
            if col not in self.feature_columns:
                self.feature_columns.append(col)

        return df

    def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add interaction features between variables.

        Args:
            df: DataFrame with base features

        Returns:
            DataFrame with interaction features
        """
        df = df.copy()

        # Weekend × Temperature (people buy more cold drinks on hot weekends)
        if 'is_weekend' in df.columns and 'temperature' in df.columns:
            df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
            self.feature_columns.append('weekend_temp_interaction')

        # Rain × Weekend (bad weather reduces weekend traffic)
        if 'is_weekend' in df.columns and 'precipitation' in df.columns:
            df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int)
            self.feature_columns.append('rain_weekend_interaction')

        # Friday × Traffic (high Friday traffic means weekend prep buying)
        if 'is_friday' in df.columns and 'traffic_volume' in df.columns:
            df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume']
            self.feature_columns.append('friday_traffic_interaction')

        # Month × Temperature (seasonal temperature patterns)
        if 'month' in df.columns and 'temperature' in df.columns:
            df['month_temp_interaction'] = df['month'] * df['temperature']
            self.feature_columns.append('month_temp_interaction')

        # Payday × Weekend (big shopping days)
        if 'is_payday' in df.columns and 'is_weekend' in df.columns:
            df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend']
            self.feature_columns.append('payday_weekend_interaction')

        logger.info(f"Added {len([c for c in self.feature_columns if 'interaction' in c])} interaction features")
        return df

    def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
        """
        Add trend-based features.
        Uses shared feature calculator for consistency with prediction service.

        Args:
            df: DataFrame with date and quantity
            date_column: Name of date column

        Returns:
            DataFrame with trend features
        """
        # Use shared calculator for consistent trend calculation
        df = self.feature_calculator.calculate_trend_features(
            df,
            mode='training'
        )

        # Update feature columns list
        for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
            if feature_name in df.columns and feature_name not in self.feature_columns:
                self.feature_columns.append(feature_name)

        logger.debug("Added trend features (using shared calculator)")
        return df

    def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add cyclical encoding for periodic features (day_of_week, month).
        Helps models understand that Monday follows Sunday and January follows December.

        Args:
            df: DataFrame with day_of_week and month columns

        Returns:
            DataFrame with cyclical features
        """
        df = df.copy()

        # Day of week cyclical encoding
        if 'day_of_week' in df.columns:
            df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
            df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
            self.feature_columns.extend(['day_of_week_sin', 'day_of_week_cos'])

        # Month cyclical encoding
        if 'month' in df.columns:
            df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
            df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
            self.feature_columns.extend(['month_sin', 'month_cos'])

        logger.info("Added cyclical encoding for temporal features")
        return df

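    # Why sin/cos (editor's worked example): raw day_of_week puts Sunday (6)
    # far from Monday (0), but on the unit circle they are neighbours.
    # Monday: sin(0) = 0.0, cos(0) = 1.0; Sunday: sin(12*pi/7) ~ -0.78,
    # cos(12*pi/7) ~ 0.62. The Euclidean distance between any two consecutive
    # days is identical (2*sin(pi/7) ~ 0.87), so the model sees the week as a
    # closed cycle rather than a line with a jump from 6 back to 0.
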
    def create_all_features(
        self,
        df: pd.DataFrame,
        date_column: str = 'date',
        include_lags: bool = True,
        include_rolling: bool = True,
        include_interactions: bool = True,
        include_cyclical: bool = True
    ) -> pd.DataFrame:
        """
        Create all enhanced features in one go.

        Args:
            df: DataFrame with base data
            date_column: Name of date column
            include_lags: Whether to include lagged features
            include_rolling: Whether to include rolling statistics
            include_interactions: Whether to include interaction features
            include_cyclical: Whether to include cyclical encoding

        Returns:
            DataFrame with all enhanced features
        """
        logger.info("Creating comprehensive feature set for hybrid model")

        # Reset feature list
        self.feature_columns = []

        # Day of week and calendar features (always needed)
        df = self.add_day_of_week_features(df, date_column)
        df = self.add_calendar_enhanced_features(df, date_column)

        # Optional features
        if include_lags:
            df = self.add_lagged_features(df)

        if include_rolling:
            df = self.add_rolling_features(df)

        if include_interactions:
            df = self.add_interaction_features(df)

        if include_cyclical:
            df = self.add_cyclical_encoding(df)

        # Trend features (depends on lags and rolling)
        if include_lags or include_rolling:
            df = self.add_trend_features(df, date_column)

        logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model")

        return df

    def get_feature_columns(self) -> List[str]:
        """Get list of all created feature column names."""
        return self.feature_columns.copy()

    def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
        """
        Fill NA values in lagged and rolling features.

        Args:
            df: DataFrame with potential NA values
            strategy: 'forward_backward', 'zero', 'mean'

        Returns:
            DataFrame with filled NA values
        """
        df = df.copy()

        if strategy == 'forward_backward':
            # Forward fill first (use previous values), then backward fill the
            # remaining gaps at the beginning of the series.
            # (df.fillna(method=...) is deprecated in modern pandas)
            df = df.ffill()
            df = df.bfill()

        elif strategy == 'zero':
            df = df.fillna(0)

        elif strategy == 'mean':
            df = df.fillna(df.mean(numeric_only=True))

        return df
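
# Minimal smoke test (editor's illustrative sketch, not part of the commit):
# builds a synthetic daily sales frame and runs the full pipeline. Column
# names ('date', 'quantity') follow the docstrings above.
if __name__ == "__main__":
    rng = np.random.default_rng(seed=42)
    demo = pd.DataFrame({
        'date': pd.date_range('2024-01-01', periods=60, freq='D'),
        'quantity': rng.poisson(lam=80, size=60).astype(float),
    })

    engineer = AdvancedFeatureEngineer()
    demo = engineer.create_all_features(demo, date_column='date')
    demo = engineer.fill_na_values(demo, strategy='forward_backward')

    # Expect lag_*, rolling_*, *_sin/_cos, and trend columns with no NaNs left
    print(len(engineer.get_feature_columns()), "features created")
    print(demo.filter(like='lag_').head())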
588
shared/ml/feature_calculator.py
Normal file
@@ -0,0 +1,588 @@
"""
Shared Feature Calculator for Training and Prediction Services

This module provides unified feature calculation logic to ensure consistency
between model training and inference (prediction), preventing train/serve skew.

Key principles:
- Same lag calculation logic in training and prediction
- Same rolling window statistics in training and prediction
- Same trend feature calculations in training and prediction
- Graceful handling of sparse/missing data with consistent fallbacks
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Union, Tuple
from datetime import datetime
import structlog

logger = structlog.get_logger()

class HistoricalFeatureCalculator:
    """
    Unified historical feature calculator for both training and prediction.

    This class ensures that features are calculated identically whether
    during model training or during inference, preventing train/serve skew.
    """

    def __init__(self):
        """Initialize the feature calculator."""
        self.feature_columns = []

    def calculate_lag_features(
        self,
        sales_data: Union[pd.Series, pd.DataFrame],
        lag_days: List[int] = None,
        mode: str = 'training'
    ) -> Union[pd.DataFrame, Dict[str, float]]:
        """
        Calculate lagged sales features consistently for training and prediction.

        Args:
            sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
            lag_days: List of lag periods (default: [1, 7, 14])
            mode: 'training' returns DataFrame with lag columns, 'prediction' returns dict of features

        Returns:
            DataFrame with lag columns (training mode) or dict of lag features (prediction mode)
        """
        if lag_days is None:
            lag_days = [1, 7, 14]

        if mode == 'training':
            return self._calculate_lag_features_training(sales_data, lag_days)
        else:
            return self._calculate_lag_features_prediction(sales_data, lag_days)

    def _calculate_lag_features_training(
        self,
        df: pd.DataFrame,
        lag_days: List[int]
    ) -> pd.DataFrame:
        """
        Calculate lag features for training (operates on DataFrame).

        Args:
            df: DataFrame with 'quantity' column
            lag_days: List of lag periods

        Returns:
            DataFrame with added lag columns
        """
        df = df.copy()

        # Calculate overall statistics for fallback (consistent with prediction)
        overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
        overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0

        for lag in lag_days:
            col_name = f'lag_{lag}_day'

            # Use pandas shift
            df[col_name] = df['quantity'].shift(lag)

            # Fill NaN values using same logic as prediction mode
            # For missing lags, use cascading fallback: previous lag -> last value -> mean
            if lag == 1:
                # For lag_1, fill with last available or mean
                df[col_name] = df[col_name].fillna(df['quantity'].iloc[0] if len(df) > 0 else overall_mean)
            elif lag == 7:
                # For lag_7, fill with lag_1 if available, else last value, else mean
                mask = df[col_name].isna()
                if 'lag_1_day' in df.columns:
                    df.loc[mask, col_name] = df.loc[mask, 'lag_1_day']
                else:
                    df.loc[mask, col_name] = df['quantity'].iloc[0] if len(df) > 0 else overall_mean
            elif lag == 14:
                # For lag_14, fill with lag_7 if available, else lag_1, else last value, else mean
                mask = df[col_name].isna()
                if 'lag_7_day' in df.columns:
                    df.loc[mask, col_name] = df.loc[mask, 'lag_7_day']
                elif 'lag_1_day' in df.columns:
                    df.loc[mask, col_name] = df.loc[mask, 'lag_1_day']
                else:
                    df.loc[mask, col_name] = df['quantity'].iloc[0] if len(df) > 0 else overall_mean

            # Fill any remaining NaN with mean
            df[col_name] = df[col_name].fillna(overall_mean)

            self.feature_columns.append(col_name)

        logger.debug(f"Added {len(lag_days)} lagged features (training mode)", lags=lag_days)
        return df

    def _calculate_lag_features_prediction(
        self,
        historical_sales: pd.Series,
        lag_days: List[int]
    ) -> Dict[str, float]:
        """
        Calculate lag features for prediction (operates on Series, returns dict).

        Args:
            historical_sales: Series of sales quantities indexed by date
            lag_days: List of lag periods

        Returns:
            Dictionary of lag features
        """
        features = {}

        if len(historical_sales) == 0:
            # Return default values if no data
            for lag in lag_days:
                features[f'lag_{lag}_day'] = 0.0
            return features

        # Calculate overall statistics for fallback
        overall_mean = float(historical_sales.mean())
        overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0

        # Calculate lag_1_day
        if 1 in lag_days:
            if len(historical_sales) >= 1:
                features['lag_1_day'] = float(historical_sales.iloc[-1])
            else:
                features['lag_1_day'] = overall_mean

        # Calculate lag_7_day
        if 7 in lag_days:
            if len(historical_sales) >= 7:
                features['lag_7_day'] = float(historical_sales.iloc[-7])
            else:
                # Fallback to last value if insufficient data
                features['lag_7_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean

        # Calculate lag_14_day
        if 14 in lag_days:
            if len(historical_sales) >= 14:
                features['lag_14_day'] = float(historical_sales.iloc[-14])
            else:
                # Cascading fallback: lag_7 -> lag_1 -> last value -> mean
                if len(historical_sales) >= 7:
                    features['lag_14_day'] = float(historical_sales.iloc[-7])
                else:
                    features['lag_14_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean

        logger.debug("Calculated lag features (prediction mode)", features=features)
        return features

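    # Fallback behaviour in numbers (editor's sketch): with only five days of
    # history, lag_7 and lag_14 cannot reach back far enough, so both cascade
    # to the most recent observation.
    #
    #   >>> calc = HistoricalFeatureCalculator()
    #   >>> sales = pd.Series([10.0, 12.0, 11.0, 13.0, 14.0])
    #   >>> calc.calculate_lag_features(sales, mode='prediction')
    #   {'lag_1_day': 14.0, 'lag_7_day': 14.0, 'lag_14_day': 14.0}
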
    def calculate_rolling_features(
        self,
        sales_data: Union[pd.Series, pd.DataFrame],
        windows: List[int] = None,
        statistics: List[str] = None,
        mode: str = 'training'
    ) -> Union[pd.DataFrame, Dict[str, float]]:
        """
        Calculate rolling window statistics consistently for training and prediction.

        Args:
            sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
            windows: List of window sizes in days (default: [7, 14, 30])
            statistics: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
            mode: 'training' returns DataFrame, 'prediction' returns dict

        Returns:
            DataFrame with rolling columns (training mode) or dict of rolling features (prediction mode)
        """
        if windows is None:
            windows = [7, 14, 30]

        if statistics is None:
            statistics = ['mean', 'std', 'max', 'min']

        if mode == 'training':
            return self._calculate_rolling_features_training(sales_data, windows, statistics)
        else:
            return self._calculate_rolling_features_prediction(sales_data, windows, statistics)

    def _calculate_rolling_features_training(
        self,
        df: pd.DataFrame,
        windows: List[int],
        statistics: List[str]
    ) -> pd.DataFrame:
        """
        Calculate rolling features for training (operates on DataFrame).

        Args:
            df: DataFrame with 'quantity' column
            windows: List of window sizes
            statistics: List of statistics to calculate

        Returns:
            DataFrame with added rolling columns
        """
        df = df.copy()

        # Calculate overall statistics for fallback
        overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
        overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0
        overall_max = float(df['quantity'].max()) if len(df) > 0 else 0.0
        overall_min = float(df['quantity'].min()) if len(df) > 0 else 0.0

        fallback_values = {
            'mean': overall_mean,
            'std': overall_std,
            'max': overall_max,
            'min': overall_min
        }

        for window in windows:
            for stat in statistics:
                col_name = f'rolling_{stat}_{window}d'

                # Calculate rolling statistic with full window required (consistent with prediction)
                # Use min_periods=window to match prediction behavior
                if stat == 'mean':
                    df[col_name] = df['quantity'].rolling(window=window, min_periods=window).mean()
                elif stat == 'std':
                    df[col_name] = df['quantity'].rolling(window=window, min_periods=window).std()
                elif stat == 'max':
                    df[col_name] = df['quantity'].rolling(window=window, min_periods=window).max()
                elif stat == 'min':
                    df[col_name] = df['quantity'].rolling(window=window, min_periods=window).min()

                # Fill NaN values using cascading fallback (consistent with prediction)
                # Use smaller window values if available, otherwise use overall statistics
                mask = df[col_name].isna()
                if window == 14 and f'rolling_{stat}_7d' in df.columns:
                    # Use 7-day window for 14-day NaN
                    df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
                elif window == 30 and f'rolling_{stat}_14d' in df.columns:
                    # Use 14-day window for 30-day NaN
                    df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_14d']
                elif window == 30 and f'rolling_{stat}_7d' in df.columns:
                    # Use 7-day window for 30-day NaN if 14-day not available
                    df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']

                # Fill any remaining NaN with overall statistics
                df[col_name] = df[col_name].fillna(fallback_values[stat])

                self.feature_columns.append(col_name)

        logger.debug("Added rolling features (training mode)", windows=windows, statistics=statistics)
        return df

    def _calculate_rolling_features_prediction(
        self,
        historical_sales: pd.Series,
        windows: List[int],
        statistics: List[str]
    ) -> Dict[str, float]:
        """
        Calculate rolling features for prediction (operates on Series, returns dict).

        Args:
            historical_sales: Series of sales quantities indexed by date
            windows: List of window sizes
            statistics: List of statistics to calculate

        Returns:
            Dictionary of rolling features
        """
        features = {}

        if len(historical_sales) == 0:
            # Return default values if no data
            for window in windows:
                for stat in statistics:
                    features[f'rolling_{stat}_{window}d'] = 0.0
            return features

        # Calculate overall statistics for fallback
        overall_mean = float(historical_sales.mean())
        overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
        overall_max = float(historical_sales.max())
        overall_min = float(historical_sales.min())

        fallback_values = {
            'mean': overall_mean,
            'std': overall_std,
            'max': overall_max,
            'min': overall_min
        }

        # Calculate for each window
        for window in windows:
            if len(historical_sales) >= window:
                # Have enough data for full window
                window_data = historical_sales.iloc[-window:]

                for stat in statistics:
                    col_name = f'rolling_{stat}_{window}d'
                    if stat == 'mean':
                        features[col_name] = float(window_data.mean())
                    elif stat == 'std':
                        features[col_name] = float(window_data.std()) if len(window_data) > 1 else 0.0
                    elif stat == 'max':
                        features[col_name] = float(window_data.max())
                    elif stat == 'min':
                        features[col_name] = float(window_data.min())
            else:
                # Insufficient data - use cascading fallback
                for stat in statistics:
                    col_name = f'rolling_{stat}_{window}d'

                    # Try to use smaller window if available
                    if window == 14 and f'rolling_{stat}_7d' in features:
                        features[col_name] = features[f'rolling_{stat}_7d']
                    elif window == 30 and f'rolling_{stat}_14d' in features:
                        features[col_name] = features[f'rolling_{stat}_14d']
                    elif window == 30 and f'rolling_{stat}_7d' in features:
                        features[col_name] = features[f'rolling_{stat}_7d']
                    else:
                        # Use overall statistics
                        features[col_name] = fallback_values[stat]

        logger.debug("Calculated rolling features (prediction mode)", num_features=len(features))
        return features

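    # Cascade in numbers (editor's sketch): with 10 days of history, the 7-day
    # window is computed exactly; the 14-day window falls back to the 7-day
    # values, and the 30-day window then reuses the 14-day entries just written.
    #
    #   >>> calc = HistoricalFeatureCalculator()
    #   >>> sales = pd.Series(range(1, 11), dtype=float)  # 1.0 .. 10.0
    #   >>> feats = calc.calculate_rolling_features(sales, mode='prediction')
    #   >>> feats['rolling_mean_7d'], feats['rolling_mean_14d'], feats['rolling_mean_30d']
    #   (7.0, 7.0, 7.0)
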
    def calculate_trend_features(
        self,
        sales_data: Union[pd.Series, pd.DataFrame],
        reference_date: Optional[datetime] = None,
        lag_features: Optional[Dict[str, float]] = None,
        rolling_features: Optional[Dict[str, float]] = None,
        mode: str = 'training'
    ) -> Union[pd.DataFrame, Dict[str, float]]:
        """
        Calculate trend-based features consistently for training and prediction.

        Args:
            sales_data: Sales data as Series (prediction) or DataFrame (training)
            reference_date: Reference date for calculations (prediction mode)
            lag_features: Pre-calculated lag features (prediction mode)
            rolling_features: Pre-calculated rolling features (prediction mode)
            mode: 'training' returns DataFrame, 'prediction' returns dict

        Returns:
            DataFrame with trend columns (training mode) or dict of trend features (prediction mode)
        """
        if mode == 'training':
            return self._calculate_trend_features_training(sales_data)
        else:
            return self._calculate_trend_features_prediction(
                sales_data,
                reference_date,
                lag_features,
                rolling_features
            )

    def _calculate_trend_features_training(
        self,
        df: pd.DataFrame,
        date_column: str = 'date'
    ) -> pd.DataFrame:
        """
        Calculate trend features for training (operates on DataFrame).

        Args:
            df: DataFrame with date and lag/rolling features
            date_column: Name of date column

        Returns:
            DataFrame with added trend columns
        """
        df = df.copy()

        # Days since start
        df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days

        # Momentum (difference between lag_1 and lag_7)
        if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
            df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
        else:
            df['momentum_1_7'] = 0.0
        self.feature_columns.append('momentum_1_7')

        # Trend (difference between 7-day and 30-day rolling means)
        if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
            df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
        else:
            df['trend_7_30'] = 0.0
        self.feature_columns.append('trend_7_30')

        # Velocity (rate of change over week)
        if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
            df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7.0
        else:
            df['velocity_week'] = 0.0
        self.feature_columns.append('velocity_week')

        self.feature_columns.append('days_since_start')

        logger.debug("Added trend features (training mode)")
        return df

    def _calculate_trend_features_prediction(
        self,
        historical_sales: pd.Series,
        reference_date: datetime,
        lag_features: Dict[str, float],
        rolling_features: Dict[str, float]
    ) -> Dict[str, float]:
        """
        Calculate trend features for prediction (operates on Series, returns dict).

        Args:
            historical_sales: Series of sales quantities indexed by date
            reference_date: The date we're forecasting for
            lag_features: Pre-calculated lag features
            rolling_features: Pre-calculated rolling features

        Returns:
            Dictionary of trend features
        """
        features = {}

        if len(historical_sales) == 0:
            return {
                'days_since_start': 0,
                'momentum_1_7': 0.0,
                'trend_7_30': 0.0,
                'velocity_week': 0.0
            }

        # Days since first sale
        features['days_since_start'] = (reference_date - historical_sales.index[0]).days

        # Momentum (difference between lag_1 and lag_7)
        if 'lag_1_day' in lag_features and 'lag_7_day' in lag_features:
            if len(historical_sales) >= 7:
                features['momentum_1_7'] = lag_features['lag_1_day'] - lag_features['lag_7_day']
            else:
                features['momentum_1_7'] = 0.0  # Insufficient data
        else:
            features['momentum_1_7'] = 0.0

        # Trend (difference between 7-day and 30-day rolling means)
        if 'rolling_mean_7d' in rolling_features and 'rolling_mean_30d' in rolling_features:
            if len(historical_sales) >= 30:
                features['trend_7_30'] = rolling_features['rolling_mean_7d'] - rolling_features['rolling_mean_30d']
            else:
                features['trend_7_30'] = 0.0  # Insufficient data
        else:
            features['trend_7_30'] = 0.0

        # Velocity (rate of change over week)
        if 'lag_1_day' in lag_features and 'lag_7_day' in lag_features:
            if len(historical_sales) >= 7:
                recent_value = lag_features['lag_1_day']
                past_value = lag_features['lag_7_day']
                features['velocity_week'] = float((recent_value - past_value) / 7.0)
            else:
                features['velocity_week'] = 0.0  # Insufficient data
        else:
            features['velocity_week'] = 0.0

        logger.debug("Calculated trend features (prediction mode)", features=features)
        return features

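    # Worked example (editor's sketch): if yesterday's sales (lag_1_day) were
    # 120 and sales a week earlier (lag_7_day) were 99, then
    # momentum_1_7 = 120 - 99 = 21 units and velocity_week = 21 / 7 = 3.0
    # units/day; a positive trend_7_30 likewise means the last week ran above
    # the 30-day average.
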
    def calculate_data_freshness_metrics(
        self,
        historical_sales: pd.Series,
        forecast_date: datetime
    ) -> Dict[str, Union[int, float]]:
        """
        Calculate data freshness and availability metrics.

        This is used by prediction service to assess data quality and adjust confidence.
        Not used in training mode.

        Args:
            historical_sales: Series of sales quantities indexed by date
            forecast_date: The date we're forecasting for

        Returns:
            Dictionary with freshness metrics
        """
        if len(historical_sales) == 0:
            return {
                'days_since_last_sale': 999,  # Very large number indicating no data
                'historical_data_availability_score': 0.0
            }

        last_available_date = historical_sales.index.max()
        days_since_last_sale = (forecast_date - last_available_date).days

        # Calculate data availability score (0-1 scale, 1 being recent data)
        max_considered_days = 180  # Consider data older than 6 months as very stale
        availability_score = max(0.0, 1.0 - (days_since_last_sale / max_considered_days))

        return {
            'days_since_last_sale': days_since_last_sale,
            'historical_data_availability_score': availability_score
        }

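    # Score arithmetic (editor's sketch): the score decays linearly from 1.0
    # (fresh) to 0.0 at 180 days. With the last sale 45 days before the
    # forecast date: 1.0 - 45/180 = 0.75; beyond 180 days the max() clamp
    # holds the score at 0.0.
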
    def calculate_all_features(
        self,
        sales_data: Union[pd.Series, pd.DataFrame],
        reference_date: Optional[datetime] = None,
        mode: str = 'training',
        date_column: str = 'date'
    ) -> Union[pd.DataFrame, Dict[str, float]]:
        """
        Calculate all historical features in one call.

        Args:
            sales_data: Sales data as Series (prediction) or DataFrame (training)
            reference_date: Reference date for predictions (prediction mode only)
            mode: 'training' or 'prediction'
            date_column: Name of date column (training mode only)

        Returns:
            DataFrame with all features (training) or dict of all features (prediction)
        """
        if mode == 'training':
            df = sales_data.copy()

            # Calculate lag features
            df = self.calculate_lag_features(df, mode='training')

            # Calculate rolling features
            df = self.calculate_rolling_features(df, mode='training')

            # Calculate trend features
            df = self.calculate_trend_features(df, mode='training')

            logger.info("Calculated all features (training mode)", feature_count=len(self.feature_columns))
            return df

        else:  # prediction mode
            if reference_date is None:
                raise ValueError("reference_date is required for prediction mode")

            features = {}

            # Calculate lag features
            lag_features = self.calculate_lag_features(sales_data, mode='prediction')
            features.update(lag_features)

            # Calculate rolling features
            rolling_features = self.calculate_rolling_features(sales_data, mode='prediction')
            features.update(rolling_features)

            # Calculate trend features
            trend_features = self.calculate_trend_features(
                sales_data,
                reference_date=reference_date,
                lag_features=lag_features,
                rolling_features=rolling_features,
                mode='prediction'
            )
            features.update(trend_features)

            # Calculate data freshness metrics
            freshness_metrics = self.calculate_data_freshness_metrics(sales_data, reference_date)
            features.update(freshness_metrics)

            logger.info("Calculated all features (prediction mode)", feature_count=len(features))
            return features
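
# Minimal smoke test (editor's illustrative sketch, not part of the commit):
# exercises both modes on the same synthetic series to show the shared logic.
if __name__ == "__main__":
    dates = pd.date_range('2024-01-01', periods=45, freq='D')
    quantities = pd.Series(np.linspace(80.0, 120.0, num=45), index=dates)

    calc = HistoricalFeatureCalculator()

    # Prediction mode: dict of features for a single forecast date
    pred_features = calc.calculate_all_features(
        quantities,
        reference_date=dates[-1] + pd.Timedelta(days=1),
        mode='prediction'
    )
    print("prediction lag_1_day:", pred_features['lag_1_day'])

    # Training mode: same calculations applied row-wise over a DataFrame
    train_df = pd.DataFrame({'date': dates, 'quantity': quantities.to_numpy()})
    train_df = calc.calculate_all_features(train_df, mode='training')
    print("training lag_1_day (last row):", train_df['lag_1_day'].iloc[-1])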