Improve features

This commit is contained in:
Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

View File

@@ -17,7 +17,7 @@ from shared.database.base import create_database_manager
from shared.database.transactions import transactional
from shared.database.exceptions import DatabaseError
from app.core.config import settings
from app.ml.enhanced_features import AdvancedFeatureEngineer
from shared.ml.enhanced_features import AdvancedFeatureEngineer
import holidays
logger = structlog.get_logger()

View File

@@ -7,6 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
from shared.ml.feature_calculator import HistoricalFeatureCalculator
logger = structlog.get_logger()
@@ -19,10 +20,12 @@ class AdvancedFeatureEngineer:
def __init__(self):
self.feature_columns = []
self.feature_calculator = HistoricalFeatureCalculator()
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
"""
Add lagged demand features for capturing recent trends.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -34,14 +37,20 @@ class AdvancedFeatureEngineer:
if lag_days is None:
lag_days = [1, 7, 14]
df = df.copy()
# Use shared calculator for consistent lag calculation
df = self.feature_calculator.calculate_lag_features(
df,
lag_days=lag_days,
mode='training'
)
# Update feature columns list
for lag in lag_days:
col_name = f'lag_{lag}_day'
df[col_name] = df['quantity'].shift(lag)
self.feature_columns.append(col_name)
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
logger.info(f"Added {len(lag_days)} lagged features", lags=lag_days)
logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
return df
def add_rolling_features(
@@ -52,6 +61,7 @@ class AdvancedFeatureEngineer:
) -> pd.DataFrame:
"""
Add rolling statistics (mean, std, max, min).
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
@@ -67,24 +77,22 @@ class AdvancedFeatureEngineer:
if features is None:
features = ['mean', 'std', 'max', 'min']
df = df.copy()
# Use shared calculator for consistent rolling calculation
df = self.feature_calculator.calculate_rolling_features(
df,
windows=windows,
statistics=features,
mode='training'
)
# Update feature columns list
for window in windows:
for feature in features:
col_name = f'rolling_{feature}_{window}d'
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
if feature == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).mean()
elif feature == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).std()
elif feature == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).max()
elif feature == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=max(1, window // 2)).min()
self.feature_columns.append(col_name)
logger.info(f"Added rolling features", windows=windows, features=features)
logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
return df
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
@@ -203,6 +211,7 @@ class AdvancedFeatureEngineer:
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add trend-based features.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with date and quantity
@@ -211,27 +220,18 @@ class AdvancedFeatureEngineer:
Returns:
DataFrame with trend features
"""
df = df.copy()
# Use shared calculator for consistent trend calculation
df = self.feature_calculator.calculate_trend_features(
df,
mode='training'
)
# Days since start (linear trend proxy)
df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
# Momentum indicators (recent change vs. older change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
self.feature_columns.append('momentum_1_7')
if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
self.feature_columns.append('trend_7_30')
# Velocity (rate of change)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7
self.feature_columns.append('velocity_week')
self.feature_columns.append('days_since_start')
# Update feature columns list
for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
if feature_name in df.columns and feature_name not in self.feature_columns:
self.feature_columns.append(feature_name)
logger.debug("Added trend features (using shared calculator)")
return df
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:

View File

@@ -7,7 +7,7 @@ import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import structlog
from datetime import datetime
from datetime import datetime, timezone
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit
@@ -408,7 +408,7 @@ class HybridProphetXGBoost:
},
'tenant_id': tenant_id,
'inventory_product_id': inventory_product_id,
'trained_at': datetime.utcnow().isoformat()
'trained_at': datetime.now(timezone.utc).isoformat()
}
async def predict(

View File

@@ -844,6 +844,9 @@ class EnhancedBakeryMLTrainer:
# Extract training period from the processed data
training_start_date = None
training_end_date = None
data_freshness_days = None
data_coverage_days = None
if 'ds' in processed_data.columns and not processed_data.empty:
# Ensure ds column is datetime64 before extracting dates (prevents object dtype issues)
ds_datetime = pd.to_datetime(processed_data['ds'])
@@ -857,6 +860,15 @@ class EnhancedBakeryMLTrainer:
training_start_date = pd.Timestamp(min_ts).to_pydatetime().replace(tzinfo=None)
if pd.notna(max_ts):
training_end_date = pd.Timestamp(max_ts).to_pydatetime().replace(tzinfo=None)
# Calculate data freshness metrics
if training_end_date:
from datetime import datetime
data_freshness_days = (datetime.now() - training_end_date).days
# Calculate data coverage period
if training_start_date and training_end_date:
data_coverage_days = (training_end_date - training_start_date).days
# Ensure features are clean string list
try:
@@ -864,6 +876,13 @@ class EnhancedBakeryMLTrainer:
except Exception:
features_used = []
# Prepare hyperparameters with data freshness metrics
hyperparameters = model_info.get("hyperparameters", {})
if data_freshness_days is not None:
hyperparameters["data_freshness_days"] = data_freshness_days
if data_coverage_days is not None:
hyperparameters["data_coverage_days"] = data_coverage_days
model_data = {
"tenant_id": tenant_id,
"inventory_product_id": inventory_product_id,
@@ -876,7 +895,7 @@ class EnhancedBakeryMLTrainer:
"rmse": float(model_info.get("training_metrics", {}).get("rmse", 0)) if model_info.get("training_metrics", {}).get("rmse") is not None else 0,
"r2_score": float(model_info.get("training_metrics", {}).get("r2", 0)) if model_info.get("training_metrics", {}).get("r2") is not None else 0,
"training_samples": int(len(processed_data)),
"hyperparameters": self._serialize_scalers(model_info.get("hyperparameters", {})),
"hyperparameters": self._serialize_scalers(hyperparameters),
"features_used": [str(f) for f in features_used] if features_used else [],
"normalization_params": self._serialize_scalers(self.enhanced_data_processor.get_scalers()) or {}, # Include scalers for prediction consistency
"product_category": model_info.get("product_category", "unknown"), # Store product category
@@ -890,7 +909,9 @@ class EnhancedBakeryMLTrainer:
model_record = await repos['model'].create_model(model_data)
logger.info("Created enhanced model record",
inventory_product_id=inventory_product_id,
model_id=model_record.id)
model_id=model_record.id,
data_freshness_days=data_freshness_days,
data_coverage_days=data_coverage_days)
# Create artifacts for model files
if model_info.get("model_path"):