Improve features

This commit is contained in:
Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

View File

@@ -7,6 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
from shared.ml.feature_calculator import HistoricalFeatureCalculator
logger = structlog.get_logger()
@@ -19,10 +20,12 @@ class AdvancedFeatureEngineer:
def __init__(self):
self.feature_columns = []
self.feature_calculator = HistoricalFeatureCalculator()
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
"""
Add lagged demand features for capturing recent trends.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -34,14 +37,20 @@ class AdvancedFeatureEngineer:
if lag_days is None:
lag_days = [1, 7, 14]
df = df.copy()
# Use shared calculator for consistent lag calculation
df = self.feature_calculator.calculate_lag_features(
df,
lag_days=lag_days,
mode='training'
)
# Update feature columns list
for lag in lag_days:
col_name = f'lag_{lag}_day'
df[col_name] = df['quantity'].shift(lag)
self.feature_columns.append(col_name)
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
return df
def add_rolling_features(
@@ -52,6 +61,7 @@ class AdvancedFeatureEngineer:
) -> pd.DataFrame:
"""
Add rolling statistics (mean, std, max, min).
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -67,24 +77,22 @@ class AdvancedFeatureEngineer:
if features is None:
features = ['mean', 'std', 'max', 'min']
df = df.copy()
# Use shared calculator for consistent rolling calculation
df = self.feature_calculator.calculate_rolling_features(
df,
windows=windows,
statistics=features,
mode='training'
)
# Update feature columns list
for window in windows:
for feature in features:
col_name = f'rolling_{feature}_{window}d'
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
if feature == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean()
elif feature == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std()
elif feature == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max()
elif feature == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min()
self.feature_columns.append(col_name)
logger.info(f"Added rolling features", windows=windows, features=features)
logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
return df
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
@@ -203,6 +211,7 @@ class AdvancedFeatureEngineer:
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add trend-based features.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with date and quantity
@@ -211,27 +220,18 @@ class AdvancedFeatureEngineer:
Returns:
DataFrame with trend features
"""
df = df.copy()
# Use shared calculator for consistent trend calculation
df = self.feature_calculator.calculate_trend_features(
df,
mode='training'
)
# Days since start (linear trend proxy)
df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
# Momentum indicators (recent change vs. older change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
self.feature_columns.append('momentum_1_7')
if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
self.feature_columns.append('trend_7_30')
# Velocity (rate of change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
self.feature_columns.append('velocity_week')
self.feature_columns.append('days_since_start')
# Update feature columns list
for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
if feature_name in df.columns and feature_name not in self.feature_columns:
self.feature_columns.append(feature_name)
logger.debug("Added trend features (using shared calculator)")
return df
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame: