improve features

This commit is contained in:
Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

0
shared/ml/__init__.py Normal file
View File

400
shared/ml/data_processor.py Normal file
View File

@@ -0,0 +1,400 @@
"""
Shared Data Processor for Bakery Forecasting
Provides feature engineering capabilities for both training and prediction
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional
from datetime import datetime
import structlog
import holidays
from shared.ml.enhanced_features import AdvancedFeatureEngineer
logger = structlog.get_logger()
class EnhancedBakeryDataProcessor:
"""
Shared data processor for bakery forecasting.
Focuses on prediction feature preparation without training-specific dependencies.
"""
def __init__(self, region: str = 'MD'):
    """
    Set up processor state for one Spanish region.

    Args:
        region: Region code handed to the ``holidays`` package
            (MD=Madrid, PV=Basque, etc.)
    """
    # Feature engineer used for the lagged/rolling/cyclical features.
    engineer = AdvancedFeatureEngineer()
    # Scalers start empty; _engineer_features fills in traffic statistics.
    self.scalers = {}
    self.feature_engineer = engineer
    self.region = region
    # Regional holiday calendar consulted by _is_spanish_holiday.
    self.spain_holidays = holidays.Spain(prov=region)
def get_scalers(self) -> Dict[str, Any]:
"""Return the scalers/normalization parameters for use during prediction"""
return self.scalers.copy()
@staticmethod
def _extract_numeric_from_dict(value: Any) -> Optional[float]:
"""
Robust extraction of numeric values from complex data structures.
"""
if isinstance(value, (int, float)) and not isinstance(value, bool):
return float(value)
if isinstance(value, dict):
for key in ['value', 'data', 'result', 'amount', 'count', 'number', 'val']:
if key in value:
extracted = value[key]
if isinstance(extracted, dict):
return EnhancedBakeryDataProcessor._extract_numeric_from_dict(extracted)
elif isinstance(extracted, (int, float)) and not isinstance(extracted, bool):
return float(extracted)
for v in value.values():
if isinstance(v, (int, float)) and not isinstance(v, bool):
return float(v)
elif isinstance(v, dict):
result = EnhancedBakeryDataProcessor._extract_numeric_from_dict(v)
if result is not None:
return result
if isinstance(value, str):
try:
return float(value)
except (ValueError, TypeError):
pass
return None
async def prepare_prediction_features(self,
                                      future_dates: pd.DatetimeIndex,
                                      weather_forecast: Optional[pd.DataFrame] = None,
                                      traffic_forecast: Optional[pd.DataFrame] = None,
                                      poi_features: Optional[Dict[str, Any]] = None,
                                      historical_data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """
    Create features for future predictions.

    Args:
        future_dates: Future dates to predict
        weather_forecast: Weather forecast data keyed by 'date' or 'ds'
        traffic_forecast: Traffic forecast data keyed by 'date' or 'ds'
        poi_features: POI features (location-based, static)
        historical_data: Historical data for creating lagged and rolling features
    Returns:
        DataFrame with features for prediction. If any step raises, the error
        is logged and a frame containing only the 'ds' column is returned.
    """
    try:
        # Base frame with temporal features; helpers work on a 'date' column.
        future_df = pd.DataFrame({'ds': future_dates})
        future_df = self._add_temporal_features(
            future_df.rename(columns={'ds': 'date'})
        ).rename(columns={'date': 'ds'})

        # Weather first, then traffic: both are left-joined on the date key.
        for external in (weather_forecast, traffic_forecast):
            if external is None or external.empty:
                continue
            external_features = external.copy()
            if 'date' in external_features.columns:
                external_features = external_features.rename(columns={'date': 'ds'})
            if 'ds' in external_features.columns:
                # future_df['ds'] is datetime64; merging it against string/object
                # keys raises, so the key is normalised before the join.
                external_features['ds'] = pd.to_datetime(external_features['ds'])
            future_df = future_df.merge(external_features, on='ds', how='left')

        # Engineer basic features
        future_df = self._engineer_features(future_df.rename(columns={'ds': 'date'}))

        # Add advanced features if historical data is provided
        if historical_data is not None and not historical_data.empty:
            history = historical_data.rename(columns={'ds': 'date'})
            if 'date' in history.columns:
                # Same dtype on both sides keeps the sort and isin() filter reliable.
                history['date'] = pd.to_datetime(history['date'])
            combined_df = pd.concat(
                [history, future_df], ignore_index=True
            ).sort_values('date')
            combined_df = self._add_advanced_features(combined_df)
            # Keep only the rows that belong to the requested future dates.
            future_df = combined_df[combined_df['date'].isin(future_df['date'])].copy()
        else:
            logger.warning("No historical data provided, lagged features will be NaN")
            future_df = self._add_advanced_features(future_df)

        # Add POI features (static, location-based)
        if poi_features:
            future_df = self._add_poi_features(future_df, poi_features)

        future_df = future_df.rename(columns={'date': 'ds'})

        # Handle missing values
        future_df = self._handle_missing_values_future(future_df)
        return future_df
    except Exception as e:
        # Best-effort contract: callers always get a frame back. The traceback
        # is attached so the root cause is not lost.
        logger.error("Error creating prediction features", error=str(e), exc_info=True)
        return pd.DataFrame({'ds': future_dates})
def _add_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add comprehensive temporal features"""
df = df.copy()
if 'date' not in df.columns:
raise ValueError("DataFrame must have a 'date' column")
df['date'] = pd.to_datetime(df['date'])
# Basic temporal features
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_month'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
df['week_of_year'] = df['date'].dt.isocalendar().week
# Bakery-specific features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_monday'] = (df['day_of_week'] == 0).astype(int)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)
# Season mapping
df['season'] = df['month'].apply(self._get_season)
df['is_summer'] = (df['season'] == 3).astype(int)
df['is_winter'] = (df['season'] == 1).astype(int)
# Holiday indicators
df['is_holiday'] = df['date'].apply(self._is_spanish_holiday).astype(int)
df['is_school_holiday'] = df['date'].apply(self._is_school_holiday).astype(int)
df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
df['is_month_end'] = (df['day_of_month'] >= 28).astype(int)
# Payday patterns
df['is_payday_period'] = ((df['day_of_month'] <= 5) | (df['day_of_month'] >= 25)).astype(int)
return df
def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Engineer additional features"""
df = df.copy()
# Weather-based features
if 'temperature' in df.columns:
df['temperature'] = pd.to_numeric(df['temperature'], errors='coerce').fillna(15.0)
df['temp_squared'] = df['temperature'] ** 2
df['is_hot_day'] = (df['temperature'] > 25).astype(int)
df['is_cold_day'] = (df['temperature'] < 10).astype(int)
df['is_pleasant_day'] = ((df['temperature'] >= 18) & (df['temperature'] <= 25)).astype(int)
df['temp_category'] = pd.cut(df['temperature'],
bins=[-np.inf, 5, 15, 25, np.inf],
labels=[0, 1, 2, 3]).astype(int)
if 'precipitation' in df.columns:
df['precipitation'] = pd.to_numeric(df['precipitation'], errors='coerce').fillna(0.0)
df['is_rainy_day'] = (df['precipitation'] > 0.1).astype(int)
df['is_heavy_rain'] = (df['precipitation'] > 10).astype(int)
df['rain_intensity'] = pd.cut(df['precipitation'],
bins=[-0.1, 0, 2, 10, np.inf],
labels=[0, 1, 2, 3]).astype(int)
# Traffic-based features
if 'traffic_volume' in df.columns:
df['traffic_volume'] = pd.to_numeric(df['traffic_volume'], errors='coerce').fillna(100.0)
q75 = df['traffic_volume'].quantile(0.75)
q25 = df['traffic_volume'].quantile(0.25)
df['high_traffic'] = (df['traffic_volume'] > q75).astype(int)
df['low_traffic'] = (df['traffic_volume'] < q25).astype(int)
traffic_std = df['traffic_volume'].std()
traffic_mean = df['traffic_volume'].mean()
if traffic_std > 0 and not pd.isna(traffic_std):
df['traffic_normalized'] = (df['traffic_volume'] - traffic_mean) / traffic_std
self.scalers['traffic_mean'] = float(traffic_mean)
self.scalers['traffic_std'] = float(traffic_std)
else:
df['traffic_normalized'] = 0.0
self.scalers['traffic_mean'] = 100.0
self.scalers['traffic_std'] = 50.0
df['traffic_normalized'] = df['traffic_normalized'].fillna(0.0)
# Interaction features
if 'is_weekend' in df.columns and 'temperature' in df.columns:
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
df['weekend_pleasant_weather'] = df['is_weekend'] * df.get('is_pleasant_day', 0)
if 'is_rainy_day' in df.columns and 'traffic_volume' in df.columns:
df['rain_traffic_interaction'] = df['is_rainy_day'] * df['traffic_volume']
if 'is_holiday' in df.columns and 'temperature' in df.columns:
df['holiday_temp_interaction'] = df['is_holiday'] * df['temperature']
if 'season' in df.columns and 'temperature' in df.columns:
df['season_temp_interaction'] = df['season'] * df['temperature']
# Day-of-week specific features
if 'day_of_week' in df.columns:
df['is_working_day'] = (~df['day_of_week'].isin([5, 6])).astype(int)
df['is_peak_bakery_day'] = df['day_of_week'].isin([4, 5, 6]).astype(int)
# Month-specific features
if 'month' in df.columns:
df['is_high_demand_month'] = df['month'].isin([6, 7, 8, 12]).astype(int)
df['is_warm_season'] = df['month'].isin([4, 5, 6, 7, 8, 9]).astype(int)
# Special day: Payday
if 'is_payday_period' in df.columns:
df['is_payday'] = df['is_payday_period']
return df
def _add_advanced_features(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the AdvancedFeatureEngineer pipeline on a copy of df.

    A new AdvancedFeatureEngineer is created on every call and stored on
    self.feature_engineer. All optional feature groups are enabled and NA
    values are filled with the 'forward_backward' strategy.
    """
    working = df.copy()
    logger.info("Adding advanced features (lagged, rolling, cyclical, trends)",
                input_rows=len(working),
                input_columns=len(working.columns))

    engineer = AdvancedFeatureEngineer()
    self.feature_engineer = engineer

    working = engineer.create_all_features(
        working,
        date_column='date',
        include_lags=True,
        include_rolling=True,
        include_interactions=True,
        include_cyclical=True
    )
    working = engineer.fill_na_values(working, strategy='forward_backward')

    created_features = engineer.get_feature_columns()
    logger.info(f"Added {len(created_features)} advanced features")
    return working
def _add_poi_features(self, df: pd.DataFrame, poi_features: Dict[str, Any]) -> pd.DataFrame:
    """
    Broadcast each POI feature as a constant column on df.

    Python booleans are stored as 1/0; every other value is assigned as-is.
    The frame is modified in place and also returned.
    """
    if poi_features:
        logger.info(f"Adding {len(poi_features)} POI features to dataframe")
        for name in poi_features:
            raw = poi_features[name]
            df[name] = (1 if raw else 0) if isinstance(raw, bool) else raw
        return df

    logger.warning("No POI features to add")
    return df
def _handle_missing_values_future(self, df: pd.DataFrame) -> pd.DataFrame:
"""Handle missing values in future prediction data"""
numeric_columns = df.select_dtypes(include=[np.number]).columns
madrid_defaults = {
'temperature': 15.0,
'precipitation': 0.0,
'humidity': 60.0,
'wind_speed': 5.0,
'traffic_volume': 100.0,
'pedestrian_count': 50.0,
'pressure': 1013.0
}
for col in numeric_columns:
if df[col].isna().any():
default_value = 0
for key, value in madrid_defaults.items():
if key in col.lower():
default_value = value
break
df[col] = df[col].fillna(default_value)
return df
def _get_season(self, month: int) -> int:
"""Get season from month (1-4 for Winter, Spring, Summer, Autumn)"""
if month in [12, 1, 2]:
return 1 # Winter
elif month in [3, 4, 5]:
return 2 # Spring
elif month in [6, 7, 8]:
return 3 # Summer
else:
return 4 # Autumn
def _is_spanish_holiday(self, date: datetime) -> bool:
"""Check if a date is a Spanish holiday"""
try:
if isinstance(date, datetime):
date = date.date()
elif isinstance(date, pd.Timestamp):
date = date.date()
return date in self.spain_holidays
except Exception as e:
logger.warning(f"Error checking holiday status for {date}: {e}")
month_day = (date.month, date.day)
basic_holidays = [
(1, 1), (1, 6), (5, 1), (8, 15), (10, 12),
(11, 1), (12, 6), (12, 8), (12, 25)
]
return month_day in basic_holidays
def _is_school_holiday(self, date: datetime) -> bool:
"""Check if a date is during school holidays in Spain"""
try:
from datetime import timedelta
import holidays as hol
if isinstance(date, datetime):
check_date = date.date()
elif isinstance(date, pd.Timestamp):
check_date = date.date()
else:
check_date = date
month = check_date.month
day = check_date.day
# Summer holidays (July 1 - August 31)
if month in [7, 8]:
return True
# Christmas holidays (December 23 - January 7)
if (month == 12 and day >= 23) or (month == 1 and day <= 7):
return True
# Easter/Spring break (Semana Santa)
year = check_date.year
spain_hol = hol.Spain(years=year, prov=self.region)
for holiday_date, holiday_name in spain_hol.items():
if 'viernes santo' in holiday_name.lower() or 'easter' in holiday_name.lower():
easter_start = holiday_date - timedelta(days=7)
easter_end = holiday_date + timedelta(days=7)
if easter_start <= check_date <= easter_end:
return True
return False
except Exception as e:
logger.warning(f"Error checking school holiday for {date}: {e}")
month = date.month if hasattr(date, 'month') else date.month
day = date.day if hasattr(date, 'day') else date.day
return (month in [7, 8] or
(month == 12 and day >= 23) or
(month == 1 and day <= 7) or
(month == 4 and 1 <= day <= 15))

View File

@@ -0,0 +1,347 @@
"""
Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models
Adds lagged features, rolling statistics, and advanced interactions
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
from shared.ml.feature_calculator import HistoricalFeatureCalculator
logger = structlog.get_logger()
class AdvancedFeatureEngineer:
"""
Advanced feature engineering for hybrid forecasting models.
Adds lagged features, rolling statistics, and complex interactions.
"""
def __init__(self):
    """Create the shared feature calculator and an empty list of produced feature names."""
    self.feature_calculator = HistoricalFeatureCalculator()
    self.feature_columns = []
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
    """
    Add lagged demand columns via the shared feature calculator.

    Args:
        df: DataFrame with 'quantity' column
        lag_days: List of lag periods (default: [1, 7, 14])
    Returns:
        DataFrame returned by the calculator in 'training' mode
    """
    lag_days = [1, 7, 14] if lag_days is None else lag_days

    # Delegate the computation so training and prediction share one code path.
    enriched = self.feature_calculator.calculate_lag_features(
        df,
        lag_days=lag_days,
        mode='training'
    )

    # Register each lag column once.
    for name in map('lag_{}_day'.format, lag_days):
        if name in self.feature_columns:
            continue
        self.feature_columns.append(name)

    logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
    return enriched
def add_rolling_features(
    self,
    df: pd.DataFrame,
    windows: List[int] = None,
    features: List[str] = None
) -> pd.DataFrame:
    """
    Add rolling-window statistics via the shared feature calculator.

    Args:
        df: DataFrame with 'quantity' column
        windows: List of window sizes (default: [7, 14, 30])
        features: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
    Returns:
        DataFrame returned by the calculator in 'training' mode
    """
    windows = [7, 14, 30] if windows is None else windows
    features = ['mean', 'std', 'max', 'min'] if features is None else features

    enriched = self.feature_calculator.calculate_rolling_features(
        df,
        windows=windows,
        statistics=features,
        mode='training'
    )

    # Register every window/statistic combination once, window-major order.
    expected_names = [
        f'rolling_{feature}_{window}d'
        for window in windows
        for feature in features
    ]
    for name in expected_names:
        if name not in self.feature_columns:
            self.feature_columns.append(name)

    logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
    return enriched
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add enhanced day-of-week features.
Args:
df: DataFrame with date column
date_column: Name of date column
Returns:
DataFrame with day-of-week features
"""
df = df.copy()
# Day of week (0=Monday, 6=Sunday)
df['day_of_week'] = df[date_column].dt.dayofweek
# Is weekend
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
# Is Friday (often higher demand due to weekend prep)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)
# Is Monday (often lower demand after weekend)
df['is_monday'] = (df['day_of_week'] == 0).astype(int)
# Add to feature list
for col in ['day_of_week', 'is_weekend', 'is_friday', 'is_monday']:
if col not in self.feature_columns:
self.feature_columns.append(col)
return df
def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add enhanced calendar features beyond basic temporal features.
Args:
df: DataFrame with date column
date_column: Name of date column
Returns:
DataFrame with enhanced calendar features
"""
df = df.copy()
# Month and quarter (if not already present)
if 'month' not in df.columns:
df['month'] = df[date_column].dt.month
if 'quarter' not in df.columns:
df['quarter'] = df[date_column].dt.quarter
# Day of month
df['day_of_month'] = df[date_column].dt.day
# Is month start/end
df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
df['is_month_end'] = (df[date_column].dt.is_month_end).astype(int)
# Week of year
df['week_of_year'] = df[date_column].dt.isocalendar().week
# Payday indicators (15th and last day of month - high bakery traffic)
df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)
# Add to feature list
for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
'week_of_year', 'is_payday']:
if col not in self.feature_columns:
self.feature_columns.append(col)
return df
def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Add interaction features between variables.

    Each interaction is only created when both of its input columns exist.
    Feature names are registered in self.feature_columns at most once, so
    calling this method repeatedly does not produce duplicate entries.

    Args:
        df: DataFrame with base features
    Returns:
        Copy of df with interaction features
    """
    df = df.copy()

    def _register(name: str) -> None:
        # Same "append once" rule the other add_* methods follow.
        if name not in self.feature_columns:
            self.feature_columns.append(name)

    # Weekend x Temperature
    if 'is_weekend' in df.columns and 'temperature' in df.columns:
        df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
        _register('weekend_temp_interaction')

    # Rain x Weekend (any precipitation above zero counts as rain)
    if 'is_weekend' in df.columns and 'precipitation' in df.columns:
        df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int)
        _register('rain_weekend_interaction')

    # Friday x Traffic
    if 'is_friday' in df.columns and 'traffic_volume' in df.columns:
        df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume']
        _register('friday_traffic_interaction')

    # Month x Temperature
    if 'month' in df.columns and 'temperature' in df.columns:
        df['month_temp_interaction'] = df['month'] * df['temperature']
        _register('month_temp_interaction')

    # Payday x Weekend
    if 'is_payday' in df.columns and 'is_weekend' in df.columns:
        df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend']
        _register('payday_weekend_interaction')

    logger.info(f"Added {len([c for c in self.feature_columns if 'interaction' in c])} interaction features")
    return df
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
    """
    Add trend columns via the shared feature calculator.

    NOTE: date_column is accepted but not forwarded; the calculator is called
    with only df and mode='training'.

    Args:
        df: DataFrame with date and quantity
        date_column: Name of date column (currently unused by this method)
    Returns:
        DataFrame returned by the calculator
    """
    enriched = self.feature_calculator.calculate_trend_features(
        df,
        mode='training'
    )

    # Register only the trend columns the calculator actually produced.
    candidate_names = ('days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week')
    for name in candidate_names:
        if name not in enriched.columns or name in self.feature_columns:
            continue
        self.feature_columns.append(name)

    logger.debug("Added trend features (using shared calculator)")
    return enriched
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Add sine/cosine encodings for day_of_week (period 7) and month (period 12).

    The encoding lets a model see that Monday follows Sunday and January
    follows December. Feature names are registered in self.feature_columns
    at most once, so repeated calls do not create duplicate entries.

    Args:
        df: DataFrame that may contain day_of_week and month columns
    Returns:
        Copy of df with the cyclical columns for whichever inputs exist
    """
    df = df.copy()

    def _register(*names: str) -> None:
        # Same "append once" rule the other add_* methods follow.
        for name in names:
            if name not in self.feature_columns:
                self.feature_columns.append(name)

    # Day of week cyclical encoding
    if 'day_of_week' in df.columns:
        angle = 2 * np.pi * df['day_of_week'] / 7
        df['day_of_week_sin'] = np.sin(angle)
        df['day_of_week_cos'] = np.cos(angle)
        _register('day_of_week_sin', 'day_of_week_cos')

    # Month cyclical encoding
    if 'month' in df.columns:
        angle = 2 * np.pi * df['month'] / 12
        df['month_sin'] = np.sin(angle)
        df['month_cos'] = np.cos(angle)
        _register('month_sin', 'month_cos')

    logger.info("Added cyclical encoding for temporal features")
    return df
def create_all_features(
    self,
    df: pd.DataFrame,
    date_column: str = 'date',
    include_lags: bool = True,
    include_rolling: bool = True,
    include_interactions: bool = True,
    include_cyclical: bool = True
) -> pd.DataFrame:
    """
    Run the whole feature pipeline in a fixed order.

    The feature list is reset first. Day-of-week and calendar features always
    run; the optional groups follow in the order lags, rolling, interactions,
    cyclical. Trend features run last, and only if lags or rolling were requested.

    Args:
        df: DataFrame with base data
        date_column: Name of date column
        include_lags: Whether to include lagged features
        include_rolling: Whether to include rolling statistics
        include_interactions: Whether to include interaction features
        include_cyclical: Whether to include cyclical encoding
    Returns:
        DataFrame with all enhanced features
    """
    logger.info("Creating comprehensive feature set for hybrid model")

    # Start from a clean registry on every run.
    self.feature_columns = []

    # Mandatory steps
    df = self.add_day_of_week_features(df, date_column)
    df = self.add_calendar_enhanced_features(df, date_column)

    # Optional steps, applied in declaration order
    optional_steps = (
        (include_lags, self.add_lagged_features),
        (include_rolling, self.add_rolling_features),
        (include_interactions, self.add_interaction_features),
        (include_cyclical, self.add_cyclical_encoding),
    )
    for enabled, step in optional_steps:
        if enabled:
            df = step(df)

    # Trend features build on lag and rolling columns
    if include_lags or include_rolling:
        df = self.add_trend_features(df, date_column)

    logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model")
    return df
def get_feature_columns(self) -> List[str]:
"""Get list of all created feature column names."""
return self.feature_columns.copy()
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
    """
    Fill NA values in lagged and rolling features.

    Args:
        df: DataFrame with potential NA values
        strategy: 'forward_backward' (forward fill, then backward fill what
            is left at the start of the series), 'zero', or 'mean' (numeric
            column means; non-numeric columns are left as they are).
            Any other value returns an unmodified copy.
    Returns:
        Copy of df with filled NA values
    """
    filled = df.copy()

    if strategy == 'forward_backward':
        # ffill()/bfill() replace fillna(method=...), which pandas deprecated.
        filled = filled.ffill().bfill()
    elif strategy == 'zero':
        filled = filled.fillna(0)
    elif strategy == 'mean':
        # numeric_only avoids a TypeError when text columns are present.
        filled = filled.fillna(filled.mean(numeric_only=True))

    return filled

View File

@@ -0,0 +1,588 @@
"""
Shared Feature Calculator for Training and Prediction Services
This module provides unified feature calculation logic to ensure consistency
between model training and inference (prediction), preventing train/serve skew.
Key principles:
- Same lag calculation logic in training and prediction
- Same rolling window statistics in training and prediction
- Same trend feature calculations in training and prediction
- Graceful handling of sparse/missing data with consistent fallbacks
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Union, Tuple
from datetime import datetime
import structlog
logger = structlog.get_logger()
class HistoricalFeatureCalculator:
"""
Unified historical feature calculator for both training and prediction.
This class ensures that features are calculated identically whether
during model training or during inference, preventing train/serve skew.
"""
def __init__(self):
"""Initialize the feature calculator."""
self.feature_columns = []
def calculate_lag_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
lag_days: List[int] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate lagged sales features consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
lag_days: List of lag periods (default: [1, 7, 14])
mode: 'training' returns DataFrame with lag columns, 'prediction' returns dict of features
Returns:
DataFrame with lag columns (training mode) or dict of lag features (prediction mode)
"""
if lag_days is None:
lag_days = [1, 7, 14]
if mode == 'training':
return self._calculate_lag_features_training(sales_data, lag_days)
else:
return self._calculate_lag_features_prediction(sales_data, lag_days)
def _calculate_lag_features_training(
    self,
    df: pd.DataFrame,
    lag_days: List[int]
) -> pd.DataFrame:
    """
    Calculate lag features for training (operates on DataFrame).

    Each lag column is df['quantity'] shifted by the lag. Leading NaN values
    are filled as follows:
      * lag 1:  first 'quantity' value
      * lag 7:  'lag_1_day' if that column exists, else first 'quantity' value
      * lag 14: 'lag_7_day', else 'lag_1_day', else first 'quantity' value
      * any remaining NaN (and all other lags): overall mean of 'quantity'

    Column names are registered in self.feature_columns at most once.

    Args:
        df: DataFrame with 'quantity' column
        lag_days: List of lag periods
    Returns:
        Copy of df with added lag columns
    """
    df = df.copy()

    overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
    # Seed value used when no smaller lag column is available.
    first_value = df['quantity'].iloc[0] if len(df) > 0 else overall_mean

    # For each cascading lag: smaller-lag columns to borrow from, in priority order.
    cascade_sources = {
        1: (),
        7: ('lag_1_day',),
        14: ('lag_7_day', 'lag_1_day'),
    }

    for lag in lag_days:
        col_name = f'lag_{lag}_day'
        df[col_name] = df['quantity'].shift(lag)

        if lag in cascade_sources:
            mask = df[col_name].isna()
            source = next(
                (name for name in cascade_sources[lag] if name in df.columns),
                None,
            )
            if source is not None:
                df.loc[mask, col_name] = df.loc[mask, source]
            else:
                df.loc[mask, col_name] = first_value

        # Fill any remaining NaN with mean
        df[col_name] = df[col_name].fillna(overall_mean)

        # Repeated calls must not register the same column twice.
        if col_name not in self.feature_columns:
            self.feature_columns.append(col_name)

    logger.debug(f"Added {len(lag_days)} lagged features (training mode)", lags=lag_days)
    return df
def _calculate_lag_features_prediction(
self,
historical_sales: pd.Series,
lag_days: List[int]
) -> Dict[str, float]:
"""
Calculate lag features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
lag_days: List of lag periods
Returns:
Dictionary of lag features
"""
features = {}
if len(historical_sales) == 0:
# Return default values if no data
for lag in lag_days:
features[f'lag_{lag}_day'] = 0.0
return features
# Calculate overall statistics for fallback
overall_mean = float(historical_sales.mean())
overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
# Calculate lag_1_day
if 1 in lag_days:
if len(historical_sales) >= 1:
features['lag_1_day'] = float(historical_sales.iloc[-1])
else:
features['lag_1_day'] = overall_mean
# Calculate lag_7_day
if 7 in lag_days:
if len(historical_sales) >= 7:
features['lag_7_day'] = float(historical_sales.iloc[-7])
else:
# Fallback to last value if insufficient data
features['lag_7_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean
# Calculate lag_14_day
if 14 in lag_days:
if len(historical_sales) >= 14:
features['lag_14_day'] = float(historical_sales.iloc[-14])
else:
# Cascading fallback: lag_7 -> lag_1 -> last value -> mean
if len(historical_sales) >= 7:
features['lag_14_day'] = float(historical_sales.iloc[-7])
else:
features['lag_14_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean
logger.debug("Calculated lag features (prediction mode)", features=features)
return features
def calculate_rolling_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
windows: List[int] = None,
statistics: List[str] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate rolling window statistics consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
windows: List of window sizes in days (default: [7, 14, 30])
statistics: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
mode: 'training' returns DataFrame, 'prediction' returns dict
Returns:
DataFrame with rolling columns (training mode) or dict of rolling features (prediction mode)
"""
if windows is None:
windows = [7, 14, 30]
if statistics is None:
statistics = ['mean', 'std', 'max', 'min']
if mode == 'training':
return self._calculate_rolling_features_training(sales_data, windows, statistics)
else:
return self._calculate_rolling_features_prediction(sales_data, windows, statistics)
def _calculate_rolling_features_training(
self,
df: pd.DataFrame,
windows: List[int],
statistics: List[str]
) -> pd.DataFrame:
"""
Calculate rolling features for training (operates on DataFrame).
Args:
df: DataFrame with 'quantity' column
windows: List of window sizes
statistics: List of statistics to calculate
Returns:
DataFrame with added rolling columns
"""
df = df.copy()
# Calculate overall statistics for fallback
overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0
overall_max = float(df['quantity'].max()) if len(df) > 0 else 0.0
overall_min = float(df['quantity'].min()) if len(df) > 0 else 0.0
fallback_values = {
'mean': overall_mean,
'std': overall_std,
'max': overall_max,
'min': overall_min
}
for window in windows:
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
# Calculate rolling statistic with full window required (consistent with prediction)
# Use min_periods=window to match prediction behavior
if stat == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).mean()
elif stat == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).std()
elif stat == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).max()
elif stat == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).min()
# Fill NaN values using cascading fallback (consistent with prediction)
# Use smaller window values if available, otherwise use overall statistics
mask = df[col_name].isna()
if window == 14 and f'rolling_{stat}_7d' in df.columns:
# Use 7-day window for 14-day NaN
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
elif window == 30 and f'rolling_{stat}_14d' in df.columns:
# Use 14-day window for 30-day NaN
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_14d']
elif window == 30 and f'rolling_{stat}_7d' in df.columns:
# Use 7-day window for 30-day NaN if 14-day not available
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
# Fill any remaining NaN with overall statistics
df[col_name] = df[col_name].fillna(fallback_values[stat])
self.feature_columns.append(col_name)
logger.debug(f"Added rolling features (training mode)", windows=windows, statistics=statistics)
return df
def _calculate_rolling_features_prediction(
self,
historical_sales: pd.Series,
windows: List[int],
statistics: List[str]
) -> Dict[str, float]:
"""
Calculate rolling features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
windows: List of window sizes
statistics: List of statistics to calculate
Returns:
Dictionary of rolling features
"""
features = {}
if len(historical_sales) == 0:
# Return default values if no data
for window in windows:
for stat in statistics:
features[f'rolling_{stat}_{window}d'] = 0.0
return features
# Calculate overall statistics for fallback
overall_mean = float(historical_sales.mean())
overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
overall_max = float(historical_sales.max())
overall_min = float(historical_sales.min())
fallback_values = {
'mean': overall_mean,
'std': overall_std,
'max': overall_max,
'min': overall_min
}
# Calculate for each window
for window in windows:
if len(historical_sales) >= window:
# Have enough data for full window
window_data = historical_sales.iloc[-window:]
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
if stat == 'mean':
features[col_name] = float(window_data.mean())
elif stat == 'std':
features[col_name] = float(window_data.std()) if len(window_data) > 1 else 0.0
elif stat == 'max':
features[col_name] = float(window_data.max())
elif stat == 'min':
features[col_name] = float(window_data.min())
else:
# Insufficient data - use cascading fallback
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
# Try to use smaller window if available
if window == 14 and f'rolling_{stat}_7d' in features:
features[col_name] = features[f'rolling_{stat}_7d']
elif window == 30 and f'rolling_{stat}_14d' in features:
features[col_name] = features[f'rolling_{stat}_14d']
elif window == 30 and f'rolling_{stat}_7d' in features:
features[col_name] = features[f'rolling_{stat}_7d']
else:
# Use overall statistics
features[col_name] = fallback_values[stat]
logger.debug("Calculated rolling features (prediction mode)", num_features=len(features))
return features
def calculate_trend_features(
    self,
    sales_data: Union[pd.Series, pd.DataFrame],
    reference_date: Optional[datetime] = None,
    lag_features: Optional[Dict[str, float]] = None,
    rolling_features: Optional[Dict[str, float]] = None,
    mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
    """
    Calculate trend-based features consistently for training and prediction.

    Dispatches on ``mode``: 'training' delegates to
    ``_calculate_trend_features_training``; any other value is handled as
    prediction and delegates to ``_calculate_trend_features_prediction``.

    Args:
        sales_data: Sales data as Series (prediction) or DataFrame (training)
        reference_date: Reference date for calculations (prediction mode)
        lag_features: Pre-calculated lag features (prediction mode).
            ``None`` is treated as "no lag features available".
        rolling_features: Pre-calculated rolling features (prediction mode).
            ``None`` is treated as "no rolling features available".
        mode: 'training' returns DataFrame, 'prediction' returns dict

    Returns:
        DataFrame with trend columns (training mode) or dict of trend features (prediction mode)
    """
    if mode == 'training':
        return self._calculate_trend_features_training(sales_data)

    # The prediction helper runs membership tests (`'lag_1_day' in ...`) on
    # both mappings, which raises TypeError on None. Both parameters default
    # to None here, so substitute empty dicts; the helper then falls back to
    # its 0.0 defaults for the features it cannot derive.
    return self._calculate_trend_features_prediction(
        sales_data,
        reference_date,
        {} if lag_features is None else lag_features,
        {} if rolling_features is None else rolling_features,
    )
def _calculate_trend_features_training(
    self,
    df: pd.DataFrame,
    date_column: str = 'date'
) -> pd.DataFrame:
    """
    Calculate trend features for training (operates on DataFrame).

    Adds four columns to a copy of ``df`` (the input is not modified):
    ``days_since_start``, ``momentum_1_7``, ``trend_7_30`` and
    ``velocity_week``. A feature whose source columns are missing is
    filled with 0.0. Each column name is registered in
    ``self.feature_columns`` once, so repeated calls do not accumulate
    duplicate entries.

    Args:
        df: DataFrame with date and lag/rolling features
        date_column: Name of date column

    Returns:
        DataFrame with added trend columns
    """
    df = df.copy()

    # Coerce before using the `.dt` accessor so object-dtype date columns
    # (strings, `datetime.date` objects) work too; this is a no-op for
    # columns that are already datetime64.
    dates = pd.to_datetime(df[date_column])
    df['days_since_start'] = (dates - dates.min()).dt.days

    has_week_lags = 'lag_1_day' in df.columns and 'lag_7_day' in df.columns
    has_rolling_means = (
        'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns
    )

    # Momentum and velocity share the same lag_1 - lag_7 difference;
    # velocity is that difference expressed per day over the week.
    if has_week_lags:
        weekly_delta = df['lag_1_day'] - df['lag_7_day']
        momentum, velocity = weekly_delta, weekly_delta / 7.0
    else:
        momentum, velocity = 0.0, 0.0

    # Trend: gap between the 7-day and 30-day rolling means.
    if has_rolling_means:
        trend = df['rolling_mean_7d'] - df['rolling_mean_30d']
    else:
        trend = 0.0

    # Assign in a fixed order so the resulting column order is stable.
    df['momentum_1_7'] = momentum
    df['trend_7_30'] = trend
    df['velocity_week'] = velocity

    # Register each feature name once, keeping the established order.
    for column in ('momentum_1_7', 'trend_7_30', 'velocity_week', 'days_since_start'):
        if column not in self.feature_columns:
            self.feature_columns.append(column)

    logger.debug("Added trend features (training mode)")
    return df
def _calculate_trend_features_prediction(
    self,
    historical_sales: pd.Series,
    reference_date: datetime,
    lag_features: Dict[str, float],
    rolling_features: Dict[str, float]
) -> Dict[str, float]:
    """
    Calculate trend features for prediction (operates on Series, returns dict).

    Features that cannot be derived (missing inputs or too little history)
    are reported as 0.0 rather than omitted, so the returned dict always
    has the same four keys.

    Args:
        historical_sales: Series of sales quantities indexed by date
        reference_date: The date we're forecasting for
        lag_features: Pre-calculated lag features (``None`` is treated as empty)
        rolling_features: Pre-calculated rolling features (``None`` is treated as empty)

    Returns:
        Dictionary of trend features
    """
    history_len = len(historical_sales)
    if history_len == 0:
        return {
            'days_since_start': 0,
            'momentum_1_7': 0.0,
            'trend_7_30': 0.0,
            'velocity_week': 0.0
        }

    # Tolerate missing mappings: the `in` checks below would raise
    # TypeError on None.
    lag_features = lag_features or {}
    rolling_features = rolling_features or {}

    features: Dict[str, float] = {}

    # Days since first sale. Use the earliest index value rather than the
    # first positional one so an unsorted history gives the same answer as
    # the training path, which measures from the minimum date.
    first_sale_date = historical_sales.index.min()
    features['days_since_start'] = (reference_date - first_sale_date).days

    # Momentum and velocity both need lag_1 and lag_7 plus at least a week
    # of history; otherwise they default to 0.0.
    week_lags_usable = (
        history_len >= 7
        and 'lag_1_day' in lag_features
        and 'lag_7_day' in lag_features
    )
    if week_lags_usable:
        weekly_delta = lag_features['lag_1_day'] - lag_features['lag_7_day']
    else:
        weekly_delta = 0.0

    # Momentum (difference between lag_1 and lag_7)
    features['momentum_1_7'] = float(weekly_delta)

    # Trend (difference between 7-day and 30-day rolling means); requires
    # a full 30 days of history.
    rolling_means_usable = (
        history_len >= 30
        and 'rolling_mean_7d' in rolling_features
        and 'rolling_mean_30d' in rolling_features
    )
    if rolling_means_usable:
        features['trend_7_30'] = float(
            rolling_features['rolling_mean_7d'] - rolling_features['rolling_mean_30d']
        )
    else:
        features['trend_7_30'] = 0.0

    # Velocity (rate of change over week)
    features['velocity_week'] = float(weekly_delta / 7.0)

    logger.debug("Calculated trend features (prediction mode)", features=features)
    return features
def calculate_data_freshness_metrics(
    self,
    historical_sales: pd.Series,
    forecast_date: datetime
) -> Dict[str, Union[int, float]]:
    """
    Calculate data freshness and availability metrics.

    Intended for the prediction path, to assess data quality and adjust
    confidence; not used in training mode.

    Args:
        historical_sales: Series of sales quantities indexed by date
        forecast_date: The date we're forecasting for

    Returns:
        Dictionary with freshness metrics:
            - ``days_since_last_sale``: whole days between the latest index
              date and ``forecast_date`` (999 when there is no history;
              negative if ``forecast_date`` precedes the latest sale).
            - ``historical_data_availability_score``: value in [0.0, 1.0],
              1.0 meaning fully fresh and 0.0 meaning 180+ days stale or
              no data.
    """
    if len(historical_sales) == 0:
        return {
            'days_since_last_sale': 999,  # Very large number indicating no data
            'historical_data_availability_score': 0.0
        }

    last_available_date = historical_sales.index.max()
    days_since_last_sale = (forecast_date - last_available_date).days

    # Calculate data availability score (0-1 scale, 1 being recent data)
    max_considered_days = 180  # Consider data older than 6 months as very stale
    raw_score = 1.0 - (days_since_last_sale / max_considered_days)
    # Clamp both ends: a forecast date earlier than the last sale gives a
    # negative day gap, which would otherwise push the score above 1.0.
    availability_score = min(1.0, max(0.0, raw_score))

    return {
        'days_since_last_sale': days_since_last_sale,
        'historical_data_availability_score': availability_score
    }
def calculate_all_features(
    self,
    sales_data: Union[pd.Series, pd.DataFrame],
    reference_date: Optional[datetime] = None,
    mode: str = 'training',
    date_column: str = 'date'
) -> Union[pd.DataFrame, Dict[str, float]]:
    """
    Calculate all historical features in one call.

    Runs the lag, rolling and trend calculations in that order (trend
    features are derived from the lag and rolling outputs). Prediction
    mode additionally merges in the data freshness metrics.

    Args:
        sales_data: Sales data as Series (prediction) or DataFrame (training)
        reference_date: Reference date for predictions (prediction mode only)
        mode: 'training' or 'prediction' (any value other than 'training'
            is handled as prediction)
        date_column: Name of date column (training mode only). Forwarded to
            the trend step.
            NOTE(review): the lag and rolling steps are called without it;
            confirm whether they also need the column name.

    Returns:
        DataFrame with all features (training) or dict of all features (prediction)

    Raises:
        ValueError: If ``reference_date`` is missing in prediction mode.
    """
    if mode == 'training':
        df = sales_data.copy()

        # Calculate lag features
        df = self.calculate_lag_features(df, mode='training')

        # Calculate rolling features
        df = self.calculate_rolling_features(df, mode='training')

        # Calculate trend features. Call the training helper directly so
        # the caller's date_column is honored instead of silently falling
        # back to the helper's default of 'date'.
        df = self._calculate_trend_features_training(df, date_column=date_column)

        logger.info("Calculated all features (training mode)", feature_count=len(self.feature_columns))
        return df

    # prediction mode
    if reference_date is None:
        raise ValueError("reference_date is required for prediction mode")

    features = {}

    # Calculate lag features
    lag_features = self.calculate_lag_features(sales_data, mode='prediction')
    features.update(lag_features)

    # Calculate rolling features
    rolling_features = self.calculate_rolling_features(sales_data, mode='prediction')
    features.update(rolling_features)

    # Calculate trend features
    trend_features = self.calculate_trend_features(
        sales_data,
        reference_date=reference_date,
        lag_features=lag_features,
        rolling_features=rolling_features,
        mode='prediction'
    )
    features.update(trend_features)

    # Calculate data freshness metrics
    freshness_metrics = self.calculate_data_freshness_metrics(sales_data, reference_date)
    features.update(freshness_metrics)

    logger.info("Calculated all features (prediction mode)", feature_count=len(features))
    return features