Add improvements

This commit is contained in:
Urtzi Alfaro
2026-01-12 14:24:14 +01:00
parent 6037faaf8c
commit 230bbe6a19
61 changed files with 1668 additions and 894 deletions

View File

@@ -157,8 +157,13 @@ class AdvancedFeatureEngineer:
# Week of year
df['week_of_year'] = df[date_column].dt.isocalendar().week
# Payday indicators (15th and last day of month - high bakery traffic)
df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)
# Payday indicators for Spain (high bakery traffic)
# Spain commonly pays on: 28th, 15th, or last day of month
df['is_payday'] = (
(df['day_of_month'] == 15) | # Mid-month payday
(df['day_of_month'] == 28) | # Common Spanish payday (28th)
df[date_column].dt.is_month_end # End of month
).astype(int)
# Add to feature list
for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
@@ -319,24 +324,27 @@ class AdvancedFeatureEngineer:
"""Get list of all created feature column names."""
return self.feature_columns.copy()
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_mean') -> pd.DataFrame:
"""
Fill NA values in lagged and rolling features.
IMPORTANT: Backward fill is never used, because it would leak future values into time series training data.
Args:
df: DataFrame with potential NA values
strategy: 'forward_backward', 'zero', 'mean'
strategy: 'forward_mean', 'zero', 'mean'
Returns:
DataFrame with filled NA values
"""
df = df.copy()
if strategy == 'forward_backward':
if strategy == 'forward_mean':
# Forward fill first (use previous values)
df = df.fillna(method='ffill')
# Backward fill remaining (beginning of series)
df = df.fillna(method='bfill')
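# Fill remaining NAs (typically at the beginning of the series) with the column mean
# NEVER use bfill: it copies future values backwards into training data
# NOTE(review): df.mean() is computed over the whole frame, so it also includes later rows - confirm this is acceptable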
df = df.fillna(df.mean())
elif strategy == 'zero':
df = df.fillna(0)