Add improvements

This commit is contained in:
Urtzi Alfaro
2026-01-12 14:24:14 +01:00
parent 6037faaf8c
commit 230bbe6a19
61 changed files with 1668 additions and 894 deletions

View File

@@ -1121,9 +1121,10 @@ class EnhancedBakeryDataProcessor:
output_columns=len(df.columns))
# Fill NA values from lagged and rolling features
# IMPORTANT: Use forward_mean strategy to prevent data leakage (no backward fill)
logger.debug("Starting NA value filling",
na_counts={col: df[col].isna().sum() for col in df.columns if df[col].isna().any()})
df = self.feature_engineer.fill_na_values(df, strategy='forward_backward')
df = self.feature_engineer.fill_na_values(df, strategy='forward_mean')
logger.debug("NA value filling completed",
remaining_na_counts={col: df[col].isna().sum() for col in df.columns if df[col].isna().any()})

View File

@@ -157,8 +157,13 @@ class AdvancedFeatureEngineer:
# Week of year
df['week_of_year'] = df[date_column].dt.isocalendar().week
# Payday indicators (15th and last day of month - high bakery traffic)
df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)
# Payday indicators for Spain (high bakery traffic)
# Spain commonly pays on: 28th, 15th, or last day of month
df['is_payday'] = (
(df['day_of_month'] == 15) | # Mid-month payday
(df['day_of_month'] == 28) | # Common Spanish payday (28th)
df[date_column].dt.is_month_end # End of month
).astype(int)
# Add to feature list
for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
@@ -319,24 +324,27 @@ class AdvancedFeatureEngineer:
"""Get list of all created feature column names."""
return self.feature_columns.copy()
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_mean') -> pd.DataFrame:
"""
Fill NA values in lagged and rolling features.
IMPORTANT: Never uses backward fill to prevent data leakage in time series training.
Args:
df: DataFrame with potential NA values
strategy: 'forward_backward', 'zero', 'mean'
strategy: 'forward_mean', 'zero', 'mean'
Returns:
DataFrame with filled NA values
"""
df = df.copy()
if strategy == 'forward_backward':
if strategy == 'forward_mean':
# Forward fill first (use previous values)
df = df.fillna(method='ffill')
# Backward fill remaining (beginning of series)
df = df.fillna(method='bfill')
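# Fill remaining with the column mean (typically at beginning of series)
# NEVER use bfill as it leaks future information into training data
# NOTE(review): df.mean() is computed over every row of each column, so the fill value also reflects later observations - confirm this is acceptable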
df = df.fillna(df.mean())
elif strategy == 'zero':
df = df.fillna(0)

View File

@@ -142,10 +142,25 @@ class ModelSelector:
# Zero ratio
zero_ratio = (y == 0).sum() / len(y)
# Seasonality strength (simple proxy using rolling std)
# Seasonality strength using autocorrelation at key lags (7 days, 30 days)
# This better captures periodic patterns without using future data
if len(df) >= 14:
rolling_mean = pd.Series(y).rolling(window=7, center=True).mean()
seasonality_strength = rolling_mean.std() / (np.std(y) + 1e-6) if np.std(y) > 0 else 0
# Calculate autocorrelation at weekly lag (7 days)
# Higher autocorrelation indicates stronger weekly patterns
try:
weekly_autocorr = pd.Series(y).autocorr(lag=7) if len(y) > 7 else 0
# Calculate autocorrelation at monthly lag if enough data
monthly_autocorr = pd.Series(y).autocorr(lag=30) if len(y) > 30 else 0
# Combine autocorrelations (weekly weighted more for bakery data)
seasonality_strength = abs(weekly_autocorr) * 0.7 + abs(monthly_autocorr) * 0.3
# Ensure in valid range [0, 1]
seasonality_strength = max(0.0, min(1.0, seasonality_strength))
except Exception:
# Fall back to the neutral default (0.5) if autocorrelation fails
seasonality_strength = 0.5
else:
seasonality_strength = 0.5 # Default