Add improvements
This commit is contained in:
@@ -1121,9 +1121,10 @@ class EnhancedBakeryDataProcessor:
|
||||
output_columns=len(df.columns))
|
||||
|
||||
# Fill NA values from lagged and rolling features
|
||||
# IMPORTANT: Use forward_mean strategy to prevent data leakage (no backward fill)
|
||||
logger.debug("Starting NA value filling",
|
||||
na_counts={col: df[col].isna().sum() for col in df.columns if df[col].isna().any()})
|
||||
df = self.feature_engineer.fill_na_values(df, strategy='forward_backward')
|
||||
df = self.feature_engineer.fill_na_values(df, strategy='forward_mean')
|
||||
logger.debug("NA value filling completed",
|
||||
remaining_na_counts={col: df[col].isna().sum() for col in df.columns if df[col].isna().any()})
|
||||
|
||||
|
||||
@@ -157,8 +157,13 @@ class AdvancedFeatureEngineer:
|
||||
# Week of year
|
||||
df['week_of_year'] = df[date_column].dt.isocalendar().week
|
||||
|
||||
# Payday indicators (15th and last day of month - high bakery traffic)
|
||||
df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)
|
||||
# Payday indicators for Spain (high bakery traffic)
|
||||
# Common paydays in Spain: the 15th, the 28th, or the last day of the month
|
||||
df['is_payday'] = (
|
||||
(df['day_of_month'] == 15) | # Mid-month payday
|
||||
(df['day_of_month'] == 28) | # Common Spanish payday (28th)
|
||||
df[date_column].dt.is_month_end # End of month
|
||||
).astype(int)
|
||||
|
||||
# Add to feature list
|
||||
for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
|
||||
@@ -319,24 +324,27 @@ class AdvancedFeatureEngineer:
|
||||
"""Get list of all created feature column names."""
|
||||
return self.feature_columns.copy()
|
||||
|
||||
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
|
||||
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_mean') -> pd.DataFrame:
|
||||
"""
|
||||
Fill NA values in lagged and rolling features.
|
||||
|
||||
IMPORTANT: Never uses backward fill to prevent data leakage in time series training.
|
||||
|
||||
Args:
|
||||
df: DataFrame with potential NA values
|
||||
strategy: 'forward_backward', 'zero', 'mean'
|
||||
strategy: 'forward_mean', 'zero', 'mean'
|
||||
|
||||
Returns:
|
||||
DataFrame with filled NA values
|
||||
"""
|
||||
df = df.copy()
|
||||
|
||||
if strategy == 'forward_backward':
|
||||
if strategy == 'forward_mean':
|
||||
# Forward fill first (use previous values)
|
||||
df = df.fillna(method='ffill')
|
||||
# Backward fill remaining (beginning of series)
|
||||
df = df.fillna(method='bfill')
|
||||
# Fill any remaining NAs (typically at the beginning of the series) with the column mean
|
||||
# NEVER use bfill as it leaks future information into training data
|
||||
df = df.fillna(df.mean())
|
||||
|
||||
elif strategy == 'zero':
|
||||
df = df.fillna(0)
|
||||
|
||||
@@ -142,10 +142,25 @@ class ModelSelector:
|
||||
# Zero ratio
|
||||
zero_ratio = (y == 0).sum() / len(y)
|
||||
|
||||
# Seasonality strength (simple proxy using rolling std)
|
||||
# Seasonality strength using autocorrelation at key lags (7 days, 30 days)
|
||||
# This better captures periodic patterns without using future data
|
||||
if len(df) >= 14:
|
||||
rolling_mean = pd.Series(y).rolling(window=7, center=True).mean()
|
||||
seasonality_strength = rolling_mean.std() / (np.std(y) + 1e-6) if np.std(y) > 0 else 0
|
||||
# Calculate autocorrelation at weekly lag (7 days)
|
||||
# Higher autocorrelation indicates stronger weekly patterns
|
||||
try:
|
||||
weekly_autocorr = pd.Series(y).autocorr(lag=7) if len(y) > 7 else 0
|
||||
|
||||
# Calculate autocorrelation at monthly lag if enough data
|
||||
monthly_autocorr = pd.Series(y).autocorr(lag=30) if len(y) > 30 else 0
|
||||
|
||||
# Combine autocorrelations (weekly weighted more for bakery data)
|
||||
seasonality_strength = abs(weekly_autocorr) * 0.7 + abs(monthly_autocorr) * 0.3
|
||||
|
||||
# Clamp to the valid range [0, 1]
|
||||
seasonality_strength = max(0.0, min(1.0, seasonality_strength))
|
||||
except Exception:
|
||||
# Fall back to the neutral default (0.5) if the autocorrelation computation fails
|
||||
seasonality_strength = 0.5
|
||||
else:
|
||||
seasonality_strength = 0.5 # Default
|
||||
|
||||
|
||||
Reference in New Issue
Block a user