improve features

Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions


@@ -245,7 +245,7 @@ class ExternalServiceClient(BaseServiceClient):
result = await self._make_request(
"GET",
f"external/tenants/{tenant_id}/location-context",
"external/location-context",
tenant_id=tenant_id,
timeout=5.0
)
@@ -257,6 +257,128 @@ class ExternalServiceClient(BaseServiceClient):
logger.info("No location context found for tenant", tenant_id=tenant_id)
return None
async def create_tenant_location_context(
self,
tenant_id: str,
city_id: str,
school_calendar_id: Optional[str] = None,
neighborhood: Optional[str] = None,
local_events: Optional[List[Dict[str, Any]]] = None,
notes: Optional[str] = None
) -> Optional[Dict[str, Any]]:
"""
Create or update location context for a tenant.
This establishes the city association for a tenant and optionally assigns
a school calendar. Typically called during tenant registration to set up
location-based context for ML features.
Args:
tenant_id: Tenant UUID
city_id: Normalized city ID (e.g., "madrid", "barcelona")
school_calendar_id: Optional school calendar UUID to assign
neighborhood: Optional neighborhood name
local_events: Optional list of local events with impact data
notes: Optional notes about the location context
Returns:
Dict with created location context including nested calendar details,
or None if creation failed
"""
payload = {"city_id": city_id}
if school_calendar_id:
payload["school_calendar_id"] = school_calendar_id
if neighborhood:
payload["neighborhood"] = neighborhood
if local_events:
payload["local_events"] = local_events
if notes:
payload["notes"] = notes
logger.info(
"Creating tenant location context",
tenant_id=tenant_id,
city_id=city_id,
has_calendar=bool(school_calendar_id)
)
result = await self._make_request(
"POST",
"external/location-context",
tenant_id=tenant_id,
json=payload,
timeout=10.0
)
if result:
logger.info(
"Successfully created tenant location context",
tenant_id=tenant_id,
city_id=city_id
)
return result
else:
logger.warning(
"Failed to create tenant location context",
tenant_id=tenant_id,
city_id=city_id
)
return None
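# Usage sketch (illustrative, not part of this commit): `client` stands for an
# already-initialized ExternalServiceClient called from an async registration flow,
# and the UUID is hypothetical.
#
#     context = await client.create_tenant_location_context(
#         tenant_id="123e4567-e89b-12d3-a456-426614174000",
#         city_id="madrid",
#         neighborhood="Chamberi",
#     )
#     if context is None:
#         ...  # registration proceeds without location-based ML features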
async def suggest_calendar_for_tenant(
self,
tenant_id: str
) -> Optional[Dict[str, Any]]:
"""
Get smart calendar suggestion for a tenant based on POI data and location.
Analyzes tenant's location context, nearby schools from POI detection,
and available calendars to provide an intelligent suggestion with
confidence score and reasoning.
Args:
tenant_id: Tenant UUID
Returns:
Dict with:
- suggested_calendar_id: Suggested calendar UUID
- calendar_name: Name of suggested calendar
- confidence: Float 0.0-1.0
- confidence_percentage: Percentage format
- reasoning: List of reasoning steps
- fallback_calendars: Alternative suggestions
- should_auto_assign: Boolean recommendation
- admin_message: Formatted message for display
- school_analysis: Analysis of nearby schools
Or None if request failed
"""
logger.info("Requesting calendar suggestion", tenant_id=tenant_id)
result = await self._make_request(
"POST",
"external/location-context/suggest-calendar",
tenant_id=tenant_id,
timeout=10.0
)
if result:
confidence = result.get("confidence_percentage", 0)
suggested = result.get("calendar_name", "None")
logger.info(
"Calendar suggestion received",
tenant_id=tenant_id,
suggested_calendar=suggested,
confidence=confidence
)
return result
else:
logger.warning(
"Failed to get calendar suggestion",
tenant_id=tenant_id
)
return None
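# Usage sketch (illustrative): a caller could auto-assign only when the service
# recommends it and leave everything else to manual admin review. `client` and
# `tenant_id` are assumed to exist in the calling scope.
#
#     suggestion = await client.suggest_calendar_for_tenant(tenant_id)
#     if suggestion and suggestion.get("should_auto_assign"):
#         await client.create_tenant_location_context(
#             tenant_id=tenant_id,
#             city_id="madrid",
#             school_calendar_id=suggestion["suggested_calendar_id"],
#         )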
async def get_school_calendar(
self,
calendar_id: str,
@@ -379,6 +501,11 @@ class ExternalServiceClient(BaseServiceClient):
"""
Get POI context for a tenant including ML features for forecasting.
With the new tenant-based architecture:
- Gateway receives at: /api/v1/tenants/{tenant_id}/external/poi-context
- Gateway proxies to external service at: /api/v1/tenants/{tenant_id}/poi-context
- This client calls: /tenants/{tenant_id}/poi-context
This retrieves stored POI detection results and calculated ML features
that should be included in demand forecasting predictions.
@@ -394,14 +521,11 @@ class ExternalServiceClient(BaseServiceClient):
"""
logger.info("Fetching POI context for forecasting", tenant_id=tenant_id)
- # Note: POI context endpoint structure is /external/poi-context/{tenant_id}
- # We pass tenant_id to _make_request which will build: /api/v1/tenants/{tenant_id}/external/poi-context/{tenant_id}
- # But the actual endpoint in external service is just /poi-context/{tenant_id}
- # So we need to use the operations prefix correctly
+ # Updated endpoint path to follow tenant-based pattern: /tenants/{tenant_id}/poi-context
result = await self._make_request(
"GET",
f"external/operations/poi-context/{tenant_id}",
tenant_id=None, # Don't auto-prefix, we're including tenant_id in the path
f"tenants/{tenant_id}/poi-context", # Updated path: /tenants/{tenant_id}/poi-context
tenant_id=tenant_id, # Pass tenant_id to include in headers for authentication
timeout=5.0
)

0
shared/ml/__init__.py Normal file

400
shared/ml/data_processor.py Normal file

@@ -0,0 +1,400 @@
"""
Shared Data Processor for Bakery Forecasting
Provides feature engineering capabilities for both training and prediction
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional
from datetime import datetime
import structlog
import holidays
from shared.ml.enhanced_features import AdvancedFeatureEngineer
logger = structlog.get_logger()
class EnhancedBakeryDataProcessor:
"""
Shared data processor for bakery forecasting.
Focuses on prediction feature preparation without training-specific dependencies.
"""
def __init__(self, region: str = 'MD'):
"""
Initialize the data processor.
Args:
region: Spanish region code for holidays (MD=Madrid, PV=Basque, etc.)
"""
self.scalers = {}
self.feature_engineer = AdvancedFeatureEngineer()
self.region = region
self.spain_holidays = holidays.Spain(prov=region)
def get_scalers(self) -> Dict[str, Any]:
"""Return the scalers/normalization parameters for use during prediction"""
return self.scalers.copy()
@staticmethod
def _extract_numeric_from_dict(value: Any) -> Optional[float]:
"""
Robust extraction of numeric values from complex data structures.
"""
if isinstance(value, (int, float)) and not isinstance(value, bool):
return float(value)
if isinstance(value, dict):
for key in ['value', 'data', 'result', 'amount', 'count', 'number', 'val']:
if key in value:
extracted = value[key]
if isinstance(extracted, dict):
return EnhancedBakeryDataProcessor._extract_numeric_from_dict(extracted)
elif isinstance(extracted, (int, float)) and not isinstance(extracted, bool):
return float(extracted)
for v in value.values():
if isinstance(v, (int, float)) and not isinstance(v, bool):
return float(v)
elif isinstance(v, dict):
result = EnhancedBakeryDataProcessor._extract_numeric_from_dict(v)
if result is not None:
return result
if isinstance(value, str):
try:
return float(value)
except (ValueError, TypeError):
pass
return None
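# Illustrative behavior (hypothetical inputs):
#     _extract_numeric_from_dict(7)                        -> 7.0
#     _extract_numeric_from_dict("2.5")                    -> 2.5
#     _extract_numeric_from_dict({"value": 3})             -> 3.0
#     _extract_numeric_from_dict({"data": {"count": 12}})  -> 12.0
#     _extract_numeric_from_dict(True)                     -> None (bools are excluded)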
async def prepare_prediction_features(self,
future_dates: pd.DatetimeIndex,
weather_forecast: pd.DataFrame = None,
traffic_forecast: pd.DataFrame = None,
poi_features: Dict[str, Any] = None,
historical_data: pd.DataFrame = None) -> pd.DataFrame:
"""
Create features for future predictions.
Args:
future_dates: Future dates to predict
weather_forecast: Weather forecast data
traffic_forecast: Traffic forecast data (optional, not commonly forecasted)
poi_features: POI features (location-based, static)
historical_data: Historical data for creating lagged and rolling features
Returns:
DataFrame with features for prediction
"""
try:
# Create base future dataframe
future_df = pd.DataFrame({'ds': future_dates})
# Add temporal features
future_df = self._add_temporal_features(
future_df.rename(columns={'ds': 'date'})
).rename(columns={'date': 'ds'})
# Add weather features
if weather_forecast is not None and not weather_forecast.empty:
weather_features = weather_forecast.copy()
if 'date' in weather_features.columns:
weather_features = weather_features.rename(columns={'date': 'ds'})
future_df = future_df.merge(weather_features, on='ds', how='left')
# Add traffic features
if traffic_forecast is not None and not traffic_forecast.empty:
traffic_features = traffic_forecast.copy()
if 'date' in traffic_features.columns:
traffic_features = traffic_features.rename(columns={'date': 'ds'})
future_df = future_df.merge(traffic_features, on='ds', how='left')
# Engineer basic features
future_df = self._engineer_features(future_df.rename(columns={'ds': 'date'}))
# Add advanced features if historical data is provided
if historical_data is not None and not historical_data.empty:
combined_df = pd.concat([
historical_data.rename(columns={'ds': 'date'}),
future_df
], ignore_index=True).sort_values('date')
combined_df = self._add_advanced_features(combined_df)
future_df = combined_df[combined_df['date'].isin(future_df['date'])].copy()
else:
logger.warning("No historical data provided, lagged features will be NaN")
future_df = self._add_advanced_features(future_df)
# Add POI features (static, location-based)
if poi_features:
future_df = self._add_poi_features(future_df, poi_features)
future_df = future_df.rename(columns={'date': 'ds'})
# Handle missing values
future_df = self._handle_missing_values_future(future_df)
return future_df
except Exception as e:
logger.error("Error creating prediction features", error=str(e))
return pd.DataFrame({'ds': future_dates})
def _add_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add comprehensive temporal features"""
df = df.copy()
if 'date' not in df.columns:
raise ValueError("DataFrame must have a 'date' column")
df['date'] = pd.to_datetime(df['date'])
# Basic temporal features
df['day_of_week'] = df['date'].dt.dayofweek
df['day_of_month'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['quarter'] = df['date'].dt.quarter
df['week_of_year'] = df['date'].dt.isocalendar().week
# Bakery-specific features
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_monday'] = (df['day_of_week'] == 0).astype(int)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)
# Season mapping
df['season'] = df['month'].apply(self._get_season)
df['is_summer'] = (df['season'] == 3).astype(int)
df['is_winter'] = (df['season'] == 1).astype(int)
# Holiday indicators
df['is_holiday'] = df['date'].apply(self._is_spanish_holiday).astype(int)
df['is_school_holiday'] = df['date'].apply(self._is_school_holiday).astype(int)
df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
df['is_month_end'] = (df['day_of_month'] >= 28).astype(int)
# Payday patterns
df['is_payday_period'] = ((df['day_of_month'] <= 5) | (df['day_of_month'] >= 25)).astype(int)
return df
def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Engineer additional features"""
df = df.copy()
# Weather-based features
if 'temperature' in df.columns:
df['temperature'] = pd.to_numeric(df['temperature'], errors='coerce').fillna(15.0)
df['temp_squared'] = df['temperature'] ** 2
df['is_hot_day'] = (df['temperature'] > 25).astype(int)
df['is_cold_day'] = (df['temperature'] < 10).astype(int)
df['is_pleasant_day'] = ((df['temperature'] >= 18) & (df['temperature'] <= 25)).astype(int)
df['temp_category'] = pd.cut(df['temperature'],
bins=[-np.inf, 5, 15, 25, np.inf],
labels=[0, 1, 2, 3]).astype(int)
if 'precipitation' in df.columns:
df['precipitation'] = pd.to_numeric(df['precipitation'], errors='coerce').fillna(0.0)
df['is_rainy_day'] = (df['precipitation'] > 0.1).astype(int)
df['is_heavy_rain'] = (df['precipitation'] > 10).astype(int)
df['rain_intensity'] = pd.cut(df['precipitation'],
bins=[-0.1, 0, 2, 10, np.inf],
labels=[0, 1, 2, 3]).astype(int)
# Traffic-based features
if 'traffic_volume' in df.columns:
df['traffic_volume'] = pd.to_numeric(df['traffic_volume'], errors='coerce').fillna(100.0)
q75 = df['traffic_volume'].quantile(0.75)
q25 = df['traffic_volume'].quantile(0.25)
df['high_traffic'] = (df['traffic_volume'] > q75).astype(int)
df['low_traffic'] = (df['traffic_volume'] < q25).astype(int)
traffic_std = df['traffic_volume'].std()
traffic_mean = df['traffic_volume'].mean()
if traffic_std > 0 and not pd.isna(traffic_std):
df['traffic_normalized'] = (df['traffic_volume'] - traffic_mean) / traffic_std
self.scalers['traffic_mean'] = float(traffic_mean)
self.scalers['traffic_std'] = float(traffic_std)
else:
df['traffic_normalized'] = 0.0
self.scalers['traffic_mean'] = 100.0
self.scalers['traffic_std'] = 50.0
df['traffic_normalized'] = df['traffic_normalized'].fillna(0.0)
# Interaction features
if 'is_weekend' in df.columns and 'temperature' in df.columns:
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
df['weekend_pleasant_weather'] = df['is_weekend'] * df.get('is_pleasant_day', 0)
if 'is_rainy_day' in df.columns and 'traffic_volume' in df.columns:
df['rain_traffic_interaction'] = df['is_rainy_day'] * df['traffic_volume']
if 'is_holiday' in df.columns and 'temperature' in df.columns:
df['holiday_temp_interaction'] = df['is_holiday'] * df['temperature']
if 'season' in df.columns and 'temperature' in df.columns:
df['season_temp_interaction'] = df['season'] * df['temperature']
# Day-of-week specific features
if 'day_of_week' in df.columns:
df['is_working_day'] = (~df['day_of_week'].isin([5, 6])).astype(int)
df['is_peak_bakery_day'] = df['day_of_week'].isin([4, 5, 6]).astype(int)
# Month-specific features
if 'month' in df.columns:
df['is_high_demand_month'] = df['month'].isin([6, 7, 8, 12]).astype(int)
df['is_warm_season'] = df['month'].isin([4, 5, 6, 7, 8, 9]).astype(int)
# Special day: Payday
if 'is_payday_period' in df.columns:
df['is_payday'] = df['is_payday_period']
return df
def _add_advanced_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""Add advanced features using AdvancedFeatureEngineer"""
df = df.copy()
logger.info("Adding advanced features (lagged, rolling, cyclical, trends)",
input_rows=len(df),
input_columns=len(df.columns))
self.feature_engineer = AdvancedFeatureEngineer()
df = self.feature_engineer.create_all_features(
df,
date_column='date',
include_lags=True,
include_rolling=True,
include_interactions=True,
include_cyclical=True
)
df = self.feature_engineer.fill_na_values(df, strategy='forward_backward')
created_features = self.feature_engineer.get_feature_columns()
logger.info(f"Added {len(created_features)} advanced features")
return df
def _add_poi_features(self, df: pd.DataFrame, poi_features: Dict[str, Any]) -> pd.DataFrame:
"""Add POI features (static, location-based)"""
if not poi_features:
logger.warning("No POI features to add")
return df
logger.info(f"Adding {len(poi_features)} POI features to dataframe")
for feature_name, feature_value in poi_features.items():
if isinstance(feature_value, bool):
feature_value = 1 if feature_value else 0
df[feature_name] = feature_value
return df
def _handle_missing_values_future(self, df: pd.DataFrame) -> pd.DataFrame:
"""Handle missing values in future prediction data"""
numeric_columns = df.select_dtypes(include=[np.number]).columns
madrid_defaults = {
'temperature': 15.0,
'precipitation': 0.0,
'humidity': 60.0,
'wind_speed': 5.0,
'traffic_volume': 100.0,
'pedestrian_count': 50.0,
'pressure': 1013.0
}
for col in numeric_columns:
if df[col].isna().any():
default_value = 0
for key, value in madrid_defaults.items():
if key in col.lower():
default_value = value
break
df[col] = df[col].fillna(default_value)
return df
def _get_season(self, month: int) -> int:
"""Get season from month (1-4 for Winter, Spring, Summer, Autumn)"""
if month in [12, 1, 2]:
return 1 # Winter
elif month in [3, 4, 5]:
return 2 # Spring
elif month in [6, 7, 8]:
return 3 # Summer
else:
return 4 # Autumn
def _is_spanish_holiday(self, date: datetime) -> bool:
"""Check if a date is a Spanish holiday"""
try:
if isinstance(date, datetime):
date = date.date()
elif isinstance(date, pd.Timestamp):
date = date.date()
return date in self.spain_holidays
except Exception as e:
logger.warning(f"Error checking holiday status for {date}: {e}")
month_day = (date.month, date.day)
basic_holidays = [
(1, 1), (1, 6), (5, 1), (8, 15), (10, 12),
(11, 1), (12, 6), (12, 8), (12, 25)
]
return month_day in basic_holidays
def _is_school_holiday(self, date: datetime) -> bool:
"""Check if a date is during school holidays in Spain"""
try:
from datetime import timedelta
import holidays as hol
if isinstance(date, datetime):
check_date = date.date()
elif isinstance(date, pd.Timestamp):
check_date = date.date()
else:
check_date = date
month = check_date.month
day = check_date.day
# Summer holidays (July 1 - August 31)
if month in [7, 8]:
return True
# Christmas holidays (December 23 - January 7)
if (month == 12 and day >= 23) or (month == 1 and day <= 7):
return True
# Easter/Spring break (Semana Santa)
year = check_date.year
spain_hol = hol.Spain(years=year, prov=self.region)
for holiday_date, holiday_name in spain_hol.items():
if 'viernes santo' in holiday_name.lower() or 'easter' in holiday_name.lower():
easter_start = holiday_date - timedelta(days=7)
easter_end = holiday_date + timedelta(days=7)
if easter_start <= check_date <= easter_end:
return True
return False
except Exception as e:
logger.warning(f"Error checking school holiday for {date}: {e}")
month = date.month
day = date.day
return (month in [7, 8] or
(month == 12 and day >= 23) or
(month == 1 and day <= 7) or
(month == 4 and 1 <= day <= 15))
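
For orientation, a minimal end-to-end sketch of the prediction path above, on synthetic data (names, dates, and values are illustrative, not from this commit):

import asyncio
import numpy as np
import pandas as pd
from shared.ml.data_processor import EnhancedBakeryDataProcessor

async def main():
    processor = EnhancedBakeryDataProcessor(region="MD")

    # 60 days of synthetic sales history; 'ds'/'quantity' follow the processor's contract
    history = pd.DataFrame({
        "ds": pd.date_range("2025-10-01", periods=60, freq="D"),
        "quantity": np.random.default_rng(0).poisson(100, 60).astype(float),
    })

    future_dates = pd.date_range("2025-11-30", periods=7, freq="D")
    weather = pd.DataFrame({
        "ds": future_dates,
        "temperature": 12.0,   # scalar broadcasts to all 7 rows
        "precipitation": 0.0,
    })

    features = await processor.prepare_prediction_features(
        future_dates=future_dates,
        weather_forecast=weather,
        historical_data=history,
    )
    print(features.shape)

asyncio.run(main())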

347
shared/ml/enhanced_features.py Normal file

@@ -0,0 +1,347 @@
"""
Enhanced Feature Engineering for Hybrid Prophet + XGBoost Models
Adds lagged features, rolling statistics, and advanced interactions
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional
import structlog
from shared.ml.feature_calculator import HistoricalFeatureCalculator
logger = structlog.get_logger()
class AdvancedFeatureEngineer:
"""
Advanced feature engineering for hybrid forecasting models.
Adds lagged features, rolling statistics, and complex interactions.
"""
def __init__(self):
self.feature_columns = []
self.feature_calculator = HistoricalFeatureCalculator()
def add_lagged_features(self, df: pd.DataFrame, lag_days: List[int] = None) -> pd.DataFrame:
"""
Add lagged demand features for capturing recent trends.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
lag_days: List of lag periods (default: [1, 7, 14])
Returns:
DataFrame with added lagged features
"""
if lag_days is None:
lag_days = [1, 7, 14]
# Use shared calculator for consistent lag calculation
df = self.feature_calculator.calculate_lag_features(
df,
lag_days=lag_days,
mode='training'
)
# Update feature columns list
for lag in lag_days:
col_name = f'lag_{lag}_day'
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
logger.info(f"Added {len(lag_days)} lagged features (using shared calculator)", lags=lag_days)
return df
def add_rolling_features(
self,
df: pd.DataFrame,
windows: List[int] = None,
features: List[str] = None
) -> pd.DataFrame:
"""
Add rolling statistics (mean, std, max, min).
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with 'quantity' column
windows: List of window sizes (default: [7, 14, 30])
features: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
Returns:
DataFrame with rolling features
"""
if windows is None:
windows = [7, 14, 30]
if features is None:
features = ['mean', 'std', 'max', 'min']
# Use shared calculator for consistent rolling calculation
df = self.feature_calculator.calculate_rolling_features(
df,
windows=windows,
statistics=features,
mode='training'
)
# Update feature columns list
for window in windows:
for feature in features:
col_name = f'rolling_{feature}_{window}d'
if col_name not in self.feature_columns:
self.feature_columns.append(col_name)
logger.info(f"Added rolling features (using shared calculator)", windows=windows, features=features)
return df
def add_day_of_week_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add enhanced day-of-week features.
Args:
df: DataFrame with date column
date_column: Name of date column
Returns:
DataFrame with day-of-week features
"""
df = df.copy()
# Day of week (0=Monday, 6=Sunday)
df['day_of_week'] = df[date_column].dt.dayofweek
# Is weekend
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
# Is Friday (often higher demand due to weekend prep)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)
# Is Monday (often lower demand after weekend)
df['is_monday'] = (df['day_of_week'] == 0).astype(int)
# Add to feature list
for col in ['day_of_week', 'is_weekend', 'is_friday', 'is_monday']:
if col not in self.feature_columns:
self.feature_columns.append(col)
return df
def add_calendar_enhanced_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add enhanced calendar features beyond basic temporal features.
Args:
df: DataFrame with date column
date_column: Name of date column
Returns:
DataFrame with enhanced calendar features
"""
df = df.copy()
# Month and quarter (if not already present)
if 'month' not in df.columns:
df['month'] = df[date_column].dt.month
if 'quarter' not in df.columns:
df['quarter'] = df[date_column].dt.quarter
# Day of month
df['day_of_month'] = df[date_column].dt.day
# Is month start/end
df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
df['is_month_end'] = (df[date_column].dt.is_month_end).astype(int)
# Week of year
df['week_of_year'] = df[date_column].dt.isocalendar().week
# Payday indicators (15th and last day of month - high bakery traffic)
df['is_payday'] = ((df['day_of_month'] == 15) | df[date_column].dt.is_month_end).astype(int)
# Add to feature list
for col in ['month', 'quarter', 'day_of_month', 'is_month_start', 'is_month_end',
'week_of_year', 'is_payday']:
if col not in self.feature_columns:
self.feature_columns.append(col)
return df
def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Add interaction features between variables.
Args:
df: DataFrame with base features
Returns:
DataFrame with interaction features
"""
df = df.copy()
# Weekend × Temperature (people buy more cold drinks in hot weekends)
if 'is_weekend' in df.columns and 'temperature' in df.columns:
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
self.feature_columns.append('weekend_temp_interaction')
# Rain × Weekend (bad weather reduces weekend traffic)
if 'is_weekend' in df.columns and 'precipitation' in df.columns:
df['rain_weekend_interaction'] = df['is_weekend'] * (df['precipitation'] > 0).astype(int)
self.feature_columns.append('rain_weekend_interaction')
# Friday × Traffic (high Friday traffic means weekend prep buying)
if 'is_friday' in df.columns and 'traffic_volume' in df.columns:
df['friday_traffic_interaction'] = df['is_friday'] * df['traffic_volume']
self.feature_columns.append('friday_traffic_interaction')
# Month × Temperature (seasonal temperature patterns)
if 'month' in df.columns and 'temperature' in df.columns:
df['month_temp_interaction'] = df['month'] * df['temperature']
self.feature_columns.append('month_temp_interaction')
# Payday × Weekend (big shopping days)
if 'is_payday' in df.columns and 'is_weekend' in df.columns:
df['payday_weekend_interaction'] = df['is_payday'] * df['is_weekend']
self.feature_columns.append('payday_weekend_interaction')
logger.info(f"Added {len([c for c in self.feature_columns if 'interaction' in c])} interaction features")
return df
def add_trend_features(self, df: pd.DataFrame, date_column: str = 'date') -> pd.DataFrame:
"""
Add trend-based features.
Uses shared feature calculator for consistency with prediction service.
Args:
df: DataFrame with date and quantity
date_column: Name of date column
Returns:
DataFrame with trend features
"""
# Use shared calculator for consistent trend calculation
df = self.feature_calculator.calculate_trend_features(
df,
mode='training'
)
# Update feature columns list
for feature_name in ['days_since_start', 'momentum_1_7', 'trend_7_30', 'velocity_week']:
if feature_name in df.columns and feature_name not in self.feature_columns:
self.feature_columns.append(feature_name)
logger.debug("Added trend features (using shared calculator)")
return df
def add_cyclical_encoding(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Add cyclical encoding for periodic features (day_of_week, month).
Helps models understand that Monday follows Sunday, December follows January.
Args:
df: DataFrame with day_of_week and month columns
Returns:
DataFrame with cyclical features
"""
df = df.copy()
# Day of week cyclical encoding
if 'day_of_week' in df.columns:
df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
self.feature_columns.extend(['day_of_week_sin', 'day_of_week_cos'])
# Month cyclical encoding
if 'month' in df.columns:
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
self.feature_columns.extend(['month_sin', 'month_cos'])
logger.info("Added cyclical encoding for temporal features")
return df
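# Worked example (illustrative): numerically Sunday (6) and Monday (0) are far
# apart, but on the circle they are adjacent:
#     Sunday:  sin(2*pi*6/7) ~ -0.782, cos(2*pi*6/7) ~ 0.623
#     Monday:  sin(2*pi*0/7) =  0.000, cos(2*pi*0/7) = 1.000
# The Euclidean distance between consecutive days is constant around the circle.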
def create_all_features(
self,
df: pd.DataFrame,
date_column: str = 'date',
include_lags: bool = True,
include_rolling: bool = True,
include_interactions: bool = True,
include_cyclical: bool = True
) -> pd.DataFrame:
"""
Create all enhanced features in one go.
Args:
df: DataFrame with base data
date_column: Name of date column
include_lags: Whether to include lagged features
include_rolling: Whether to include rolling statistics
include_interactions: Whether to include interaction features
include_cyclical: Whether to include cyclical encoding
Returns:
DataFrame with all enhanced features
"""
logger.info("Creating comprehensive feature set for hybrid model")
# Reset feature list
self.feature_columns = []
# Day of week and calendar features (always needed)
df = self.add_day_of_week_features(df, date_column)
df = self.add_calendar_enhanced_features(df, date_column)
# Optional features
if include_lags:
df = self.add_lagged_features(df)
if include_rolling:
df = self.add_rolling_features(df)
if include_interactions:
df = self.add_interaction_features(df)
if include_cyclical:
df = self.add_cyclical_encoding(df)
# Trend features (depends on lags and rolling)
if include_lags or include_rolling:
df = self.add_trend_features(df, date_column)
logger.info(f"Created {len(self.feature_columns)} enhanced features for hybrid model")
return df
def get_feature_columns(self) -> List[str]:
"""Get list of all created feature column names."""
return self.feature_columns.copy()
def fill_na_values(self, df: pd.DataFrame, strategy: str = 'forward_backward') -> pd.DataFrame:
"""
Fill NA values in lagged and rolling features.
Args:
df: DataFrame with potential NA values
strategy: 'forward_backward', 'zero', 'mean'
Returns:
DataFrame with filled NA values
"""
df = df.copy()
if strategy == 'forward_backward':
# Forward fill first (use previous values)
df = df.ffill()
# Backward fill remaining (beginning of series)
df = df.bfill()
elif strategy == 'zero':
df = df.fillna(0)
elif strategy == 'mean':
df = df.fillna(df.mean())
return df
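
A minimal training-side sketch of the engineer on a synthetic frame (column names follow the contracts above; everything else is illustrative):

import numpy as np
import pandas as pd
from shared.ml.enhanced_features import AdvancedFeatureEngineer

df = pd.DataFrame({
    "date": pd.date_range("2025-01-01", periods=90, freq="D"),
    "quantity": np.random.default_rng(1).poisson(80, 90).astype(float),
    "temperature": 15.0,
    "precipitation": 0.0,
})

engineer = AdvancedFeatureEngineer()
enriched = engineer.create_all_features(df, date_column="date")
enriched = engineer.fill_na_values(enriched)   # forward/backward fill lag NaNs
print(f"{len(engineer.get_feature_columns())} engineered feature columns")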

588
shared/ml/feature_calculator.py Normal file

@@ -0,0 +1,588 @@
"""
Shared Feature Calculator for Training and Prediction Services
This module provides unified feature calculation logic to ensure consistency
between model training and inference (prediction), preventing train/serve skew.
Key principles:
- Same lag calculation logic in training and prediction
- Same rolling window statistics in training and prediction
- Same trend feature calculations in training and prediction
- Graceful handling of sparse/missing data with consistent fallbacks
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Union, Tuple
from datetime import datetime
import structlog
logger = structlog.get_logger()
class HistoricalFeatureCalculator:
"""
Unified historical feature calculator for both training and prediction.
This class ensures that features are calculated identically whether
during model training or during inference, preventing train/serve skew.
"""
def __init__(self):
"""Initialize the feature calculator."""
self.feature_columns = []
def calculate_lag_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
lag_days: List[int] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate lagged sales features consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
lag_days: List of lag periods (default: [1, 7, 14])
mode: 'training' returns DataFrame with lag columns, 'prediction' returns dict of features
Returns:
DataFrame with lag columns (training mode) or dict of lag features (prediction mode)
"""
if lag_days is None:
lag_days = [1, 7, 14]
if mode == 'training':
return self._calculate_lag_features_training(sales_data, lag_days)
else:
return self._calculate_lag_features_prediction(sales_data, lag_days)
def _calculate_lag_features_training(
self,
df: pd.DataFrame,
lag_days: List[int]
) -> pd.DataFrame:
"""
Calculate lag features for training (operates on DataFrame).
Args:
df: DataFrame with 'quantity' column
lag_days: List of lag periods
Returns:
DataFrame with added lag columns
"""
df = df.copy()
# Calculate overall statistics for fallback (consistent with prediction)
overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0
for lag in lag_days:
col_name = f'lag_{lag}_day'
# Use pandas shift
df[col_name] = df['quantity'].shift(lag)
# Fill NaN values using same logic as prediction mode
# For missing lags, use cascading fallback: previous lag -> first observed value -> mean
if lag == 1:
# For lag_1, fill the leading NaN with the first observed value or the mean
df[col_name] = df[col_name].fillna(df['quantity'].iloc[0] if len(df) > 0 else overall_mean)
elif lag == 7:
# For lag_7, fill with lag_1 if available, else first observed value, else mean
mask = df[col_name].isna()
if 'lag_1_day' in df.columns:
df.loc[mask, col_name] = df.loc[mask, 'lag_1_day']
else:
df.loc[mask, col_name] = df['quantity'].iloc[0] if len(df) > 0 else overall_mean
elif lag == 14:
# For lag_14, fill with lag_7 if available, else lag_1, else first observed value, else mean
mask = df[col_name].isna()
if 'lag_7_day' in df.columns:
df.loc[mask, col_name] = df.loc[mask, 'lag_7_day']
elif 'lag_1_day' in df.columns:
df.loc[mask, col_name] = df.loc[mask, 'lag_1_day']
else:
df.loc[mask, col_name] = df['quantity'].iloc[0] if len(df) > 0 else overall_mean
# Fill any remaining NaN with mean
df[col_name] = df[col_name].fillna(overall_mean)
self.feature_columns.append(col_name)
logger.debug(f"Added {len(lag_days)} lagged features (training mode)", lags=lag_days)
return df
def _calculate_lag_features_prediction(
self,
historical_sales: pd.Series,
lag_days: List[int]
) -> Dict[str, float]:
"""
Calculate lag features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
lag_days: List of lag periods
Returns:
Dictionary of lag features
"""
features = {}
if len(historical_sales) == 0:
# Return default values if no data
for lag in lag_days:
features[f'lag_{lag}_day'] = 0.0
return features
# Calculate overall statistics for fallback
overall_mean = float(historical_sales.mean())
overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
# Calculate lag_1_day
if 1 in lag_days:
if len(historical_sales) >= 1:
features['lag_1_day'] = float(historical_sales.iloc[-1])
else:
features['lag_1_day'] = overall_mean
# Calculate lag_7_day
if 7 in lag_days:
if len(historical_sales) >= 7:
features['lag_7_day'] = float(historical_sales.iloc[-7])
else:
# Fallback to last value if insufficient data
features['lag_7_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean
# Calculate lag_14_day
if 14 in lag_days:
if len(historical_sales) >= 14:
features['lag_14_day'] = float(historical_sales.iloc[-14])
else:
# Cascading fallback: lag_7 -> lag_1 -> last value -> mean
if len(historical_sales) >= 7:
features['lag_14_day'] = float(historical_sales.iloc[-7])
else:
features['lag_14_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) > 0 else overall_mean
logger.debug("Calculated lag features (prediction mode)", features=features)
return features
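# Example (illustrative): with 10 days of history the prediction path yields
#     sales = pd.Series([5.0] * 10, index=pd.date_range("2025-01-01", periods=10))
#     calculate_lag_features(sales, mode='prediction')
#     -> {'lag_1_day': 5.0, 'lag_7_day': 5.0, 'lag_14_day': 5.0}
# lag_14_day falls back to the 7-day lag, mirroring the training-mode fill above.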
def calculate_rolling_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
windows: List[int] = None,
statistics: List[str] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate rolling window statistics consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training) with 'quantity' column
windows: List of window sizes in days (default: [7, 14, 30])
statistics: List of statistics to calculate (default: ['mean', 'std', 'max', 'min'])
mode: 'training' returns DataFrame, 'prediction' returns dict
Returns:
DataFrame with rolling columns (training mode) or dict of rolling features (prediction mode)
"""
if windows is None:
windows = [7, 14, 30]
if statistics is None:
statistics = ['mean', 'std', 'max', 'min']
if mode == 'training':
return self._calculate_rolling_features_training(sales_data, windows, statistics)
else:
return self._calculate_rolling_features_prediction(sales_data, windows, statistics)
def _calculate_rolling_features_training(
self,
df: pd.DataFrame,
windows: List[int],
statistics: List[str]
) -> pd.DataFrame:
"""
Calculate rolling features for training (operates on DataFrame).
Args:
df: DataFrame with 'quantity' column
windows: List of window sizes
statistics: List of statistics to calculate
Returns:
DataFrame with added rolling columns
"""
df = df.copy()
# Calculate overall statistics for fallback
overall_mean = float(df['quantity'].mean()) if len(df) > 0 else 0.0
overall_std = float(df['quantity'].std()) if len(df) > 1 else 0.0
overall_max = float(df['quantity'].max()) if len(df) > 0 else 0.0
overall_min = float(df['quantity'].min()) if len(df) > 0 else 0.0
fallback_values = {
'mean': overall_mean,
'std': overall_std,
'max': overall_max,
'min': overall_min
}
for window in windows:
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
# Calculate rolling statistic with full window required (consistent with prediction)
# Use min_periods=window to match prediction behavior
if stat == 'mean':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).mean()
elif stat == 'std':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).std()
elif stat == 'max':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).max()
elif stat == 'min':
df[col_name] = df['quantity'].rolling(window=window, min_periods=window).min()
# Fill NaN values using cascading fallback (consistent with prediction)
# Use smaller window values if available, otherwise use overall statistics
mask = df[col_name].isna()
if window == 14 and f'rolling_{stat}_7d' in df.columns:
# Use 7-day window for 14-day NaN
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
elif window == 30 and f'rolling_{stat}_14d' in df.columns:
# Use 14-day window for 30-day NaN
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_14d']
elif window == 30 and f'rolling_{stat}_7d' in df.columns:
# Use 7-day window for 30-day NaN if 14-day not available
df.loc[mask, col_name] = df.loc[mask, f'rolling_{stat}_7d']
# Fill any remaining NaN with overall statistics
df[col_name] = df[col_name].fillna(fallback_values[stat])
self.feature_columns.append(col_name)
logger.debug(f"Added rolling features (training mode)", windows=windows, statistics=statistics)
return df
def _calculate_rolling_features_prediction(
self,
historical_sales: pd.Series,
windows: List[int],
statistics: List[str]
) -> Dict[str, float]:
"""
Calculate rolling features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
windows: List of window sizes
statistics: List of statistics to calculate
Returns:
Dictionary of rolling features
"""
features = {}
if len(historical_sales) == 0:
# Return default values if no data
for window in windows:
for stat in statistics:
features[f'rolling_{stat}_{window}d'] = 0.0
return features
# Calculate overall statistics for fallback
overall_mean = float(historical_sales.mean())
overall_std = float(historical_sales.std()) if len(historical_sales) > 1 else 0.0
overall_max = float(historical_sales.max())
overall_min = float(historical_sales.min())
fallback_values = {
'mean': overall_mean,
'std': overall_std,
'max': overall_max,
'min': overall_min
}
# Calculate for each window
for window in windows:
if len(historical_sales) >= window:
# Have enough data for full window
window_data = historical_sales.iloc[-window:]
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
if stat == 'mean':
features[col_name] = float(window_data.mean())
elif stat == 'std':
features[col_name] = float(window_data.std()) if len(window_data) > 1 else 0.0
elif stat == 'max':
features[col_name] = float(window_data.max())
elif stat == 'min':
features[col_name] = float(window_data.min())
else:
# Insufficient data - use cascading fallback
for stat in statistics:
col_name = f'rolling_{stat}_{window}d'
# Try to use smaller window if available
if window == 14 and f'rolling_{stat}_7d' in features:
features[col_name] = features[f'rolling_{stat}_7d']
elif window == 30 and f'rolling_{stat}_14d' in features:
features[col_name] = features[f'rolling_{stat}_14d']
elif window == 30 and f'rolling_{stat}_7d' in features:
features[col_name] = features[f'rolling_{stat}_7d']
else:
# Use overall statistics
features[col_name] = fallback_values[stat]
logger.debug("Calculated rolling features (prediction mode)", num_features=len(features))
return features
def calculate_trend_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
reference_date: Optional[datetime] = None,
lag_features: Optional[Dict[str, float]] = None,
rolling_features: Optional[Dict[str, float]] = None,
mode: str = 'training'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate trend-based features consistently for training and prediction.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training)
reference_date: Reference date for calculations (prediction mode)
lag_features: Pre-calculated lag features (prediction mode)
rolling_features: Pre-calculated rolling features (prediction mode)
mode: 'training' returns DataFrame, 'prediction' returns dict
Returns:
DataFrame with trend columns (training mode) or dict of trend features (prediction mode)
"""
if mode == 'training':
return self._calculate_trend_features_training(sales_data)
else:
return self._calculate_trend_features_prediction(
sales_data,
reference_date,
lag_features,
rolling_features
)
def _calculate_trend_features_training(
self,
df: pd.DataFrame,
date_column: str = 'date'
) -> pd.DataFrame:
"""
Calculate trend features for training (operates on DataFrame).
Args:
df: DataFrame with date and lag/rolling features
date_column: Name of date column
Returns:
DataFrame with added trend columns
"""
df = df.copy()
# Days since start
df['days_since_start'] = (df[date_column] - df[date_column].min()).dt.days
# Momentum (difference between lag_1 and lag_7)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['momentum_1_7'] = df['lag_1_day'] - df['lag_7_day']
self.feature_columns.append('momentum_1_7')
else:
df['momentum_1_7'] = 0.0
self.feature_columns.append('momentum_1_7')
# Trend (difference between 7-day and 30-day rolling means)
if 'rolling_mean_7d' in df.columns and 'rolling_mean_30d' in df.columns:
df['trend_7_30'] = df['rolling_mean_7d'] - df['rolling_mean_30d']
self.feature_columns.append('trend_7_30')
else:
df['trend_7_30'] = 0.0
self.feature_columns.append('trend_7_30')
# Velocity (rate of change over week)
if 'lag_1_day' in df.columns and 'lag_7_day' in df.columns:
df['velocity_week'] = (df['lag_1_day'] - df['lag_7_day']) / 7.0
self.feature_columns.append('velocity_week')
else:
df['velocity_week'] = 0.0
self.feature_columns.append('velocity_week')
self.feature_columns.append('days_since_start')
logger.debug("Added trend features (training mode)")
return df
def _calculate_trend_features_prediction(
self,
historical_sales: pd.Series,
reference_date: datetime,
lag_features: Dict[str, float],
rolling_features: Dict[str, float]
) -> Dict[str, float]:
"""
Calculate trend features for prediction (operates on Series, returns dict).
Args:
historical_sales: Series of sales quantities indexed by date
reference_date: The date we're forecasting for
lag_features: Pre-calculated lag features
rolling_features: Pre-calculated rolling features
Returns:
Dictionary of trend features
"""
features = {}
if len(historical_sales) == 0:
return {
'days_since_start': 0,
'momentum_1_7': 0.0,
'trend_7_30': 0.0,
'velocity_week': 0.0
}
# Days since first sale
features['days_since_start'] = (reference_date - historical_sales.index[0]).days
# Momentum (difference between lag_1 and lag_7)
if 'lag_1_day' in lag_features and 'lag_7_day' in lag_features:
if len(historical_sales) >= 7:
features['momentum_1_7'] = lag_features['lag_1_day'] - lag_features['lag_7_day']
else:
features['momentum_1_7'] = 0.0 # Insufficient data
else:
features['momentum_1_7'] = 0.0
# Trend (difference between 7-day and 30-day rolling means)
if 'rolling_mean_7d' in rolling_features and 'rolling_mean_30d' in rolling_features:
if len(historical_sales) >= 30:
features['trend_7_30'] = rolling_features['rolling_mean_7d'] - rolling_features['rolling_mean_30d']
else:
features['trend_7_30'] = 0.0 # Insufficient data
else:
features['trend_7_30'] = 0.0
# Velocity (rate of change over week)
if 'lag_1_day' in lag_features and 'lag_7_day' in lag_features:
if len(historical_sales) >= 7:
recent_value = lag_features['lag_1_day']
past_value = lag_features['lag_7_day']
features['velocity_week'] = float((recent_value - past_value) / 7.0)
else:
features['velocity_week'] = 0.0 # Insufficient data
else:
features['velocity_week'] = 0.0
logger.debug("Calculated trend features (prediction mode)", features=features)
return features
def calculate_data_freshness_metrics(
self,
historical_sales: pd.Series,
forecast_date: datetime
) -> Dict[str, Union[int, float]]:
"""
Calculate data freshness and availability metrics.
This is used by prediction service to assess data quality and adjust confidence.
Not used in training mode.
Args:
historical_sales: Series of sales quantities indexed by date
forecast_date: The date we're forecasting for
Returns:
Dictionary with freshness metrics
"""
if len(historical_sales) == 0:
return {
'days_since_last_sale': 999, # Very large number indicating no data
'historical_data_availability_score': 0.0
}
last_available_date = historical_sales.index.max()
days_since_last_sale = (forecast_date - last_available_date).days
# Calculate data availability score (0-1 scale, 1 being recent data)
max_considered_days = 180 # Consider data older than 6 months as very stale
availability_score = max(0.0, 1.0 - (days_since_last_sale / max_considered_days))
return {
'days_since_last_sale': days_since_last_sale,
'historical_data_availability_score': availability_score
}
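# Worked example (illustrative): a last sale 45 days before the forecast date gives
#     days_since_last_sale = 45
#     historical_data_availability_score = max(0.0, 1.0 - 45 / 180) = 0.75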
def calculate_all_features(
self,
sales_data: Union[pd.Series, pd.DataFrame],
reference_date: Optional[datetime] = None,
mode: str = 'training',
date_column: str = 'date'
) -> Union[pd.DataFrame, Dict[str, float]]:
"""
Calculate all historical features in one call.
Args:
sales_data: Sales data as Series (prediction) or DataFrame (training)
reference_date: Reference date for predictions (prediction mode only)
mode: 'training' or 'prediction'
date_column: Name of date column (training mode only)
Returns:
DataFrame with all features (training) or dict of all features (prediction)
"""
if mode == 'training':
df = sales_data.copy()
# Calculate lag features
df = self.calculate_lag_features(df, mode='training')
# Calculate rolling features
df = self.calculate_rolling_features(df, mode='training')
# Calculate trend features
df = self.calculate_trend_features(df, mode='training')
logger.info(f"Calculated all features (training mode)", feature_count=len(self.feature_columns))
return df
else: # prediction mode
if reference_date is None:
raise ValueError("reference_date is required for prediction mode")
features = {}
# Calculate lag features
lag_features = self.calculate_lag_features(sales_data, mode='prediction')
features.update(lag_features)
# Calculate rolling features
rolling_features = self.calculate_rolling_features(sales_data, mode='prediction')
features.update(rolling_features)
# Calculate trend features
trend_features = self.calculate_trend_features(
sales_data,
reference_date=reference_date,
lag_features=lag_features,
rolling_features=rolling_features,
mode='prediction'
)
features.update(trend_features)
# Calculate data freshness metrics
freshness_metrics = self.calculate_data_freshness_metrics(sales_data, reference_date)
features.update(freshness_metrics)
logger.info(f"Calculated all features (prediction mode)", feature_count=len(features))
return features
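
To make the dual-mode contract concrete, a small prediction-mode sketch (synthetic series; dates and values are illustrative):

from datetime import datetime

import pandas as pd
from shared.ml.feature_calculator import HistoricalFeatureCalculator

sales = pd.Series(
    [90.0, 110.0, 95.0, 105.0, 100.0, 120.0, 80.0, 100.0],
    index=pd.date_range("2025-11-01", periods=8, freq="D"),
)

calc = HistoricalFeatureCalculator()
features = calc.calculate_all_features(
    sales,
    reference_date=datetime(2025, 11, 10),
    mode="prediction",
)
# lag_1_day == 100.0 (yesterday's sale), days_since_last_sale == 2
print(features["lag_1_day"], features["days_since_last_sale"])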


@@ -0,0 +1,127 @@
"""
City normalization utilities for converting free-text city names to normalized city IDs.
This module provides functions to normalize city names from tenant registration
(which are free-text strings) to standardized city_id values used by the
school calendar and location context systems.
"""
from typing import Optional
import logging
logger = logging.getLogger(__name__)
# Mapping of common city name variations to normalized city IDs
CITY_NAME_TO_ID_MAP = {
# Madrid variations
"Madrid": "madrid",
"madrid": "madrid",
"MADRID": "madrid",
# Barcelona variations
"Barcelona": "barcelona",
"barcelona": "barcelona",
"BARCELONA": "barcelona",
# Valencia variations
"Valencia": "valencia",
"valencia": "valencia",
"VALENCIA": "valencia",
# Seville variations
"Sevilla": "sevilla",
"sevilla": "sevilla",
"Seville": "sevilla",
"seville": "sevilla",
# Bilbao variations
"Bilbao": "bilbao",
"bilbao": "bilbao",
# Add more cities as needed
}
def normalize_city_id(city_name: Optional[str]) -> Optional[str]:
"""
Convert a free-text city name to a normalized city_id.
This function handles various capitalizations and spellings of city names,
converting them to standardized lowercase identifiers used by the
location context and school calendar systems.
Args:
city_name: Free-text city name from tenant registration (e.g., "Madrid", "MADRID")
Returns:
Normalized city_id (e.g., "madrid") or None if city_name is None
Falls back to lowercase city_name if not in mapping
Examples:
>>> normalize_city_id("Madrid")
'madrid'
>>> normalize_city_id("BARCELONA")
'barcelona'
>>> normalize_city_id("Unknown City")
'unknown city'
>>> normalize_city_id(None)
None
"""
if city_name is None:
return None
# Strip whitespace
city_name = city_name.strip()
if not city_name:
logger.warning("Empty city name provided to normalize_city_id")
return None
# Check if we have an explicit mapping
if city_name in CITY_NAME_TO_ID_MAP:
return CITY_NAME_TO_ID_MAP[city_name]
# Fallback: convert to lowercase for consistency
normalized = city_name.lower()
logger.info(
f"City name '{city_name}' not in explicit mapping, using lowercase fallback: '{normalized}'"
)
return normalized
def is_city_supported(city_id: str) -> bool:
"""
Check if a city has school calendars configured.
Currently only Madrid has school calendars in the system.
This function can be updated as more cities are added.
Args:
city_id: Normalized city_id (e.g., "madrid")
Returns:
True if the city has school calendars configured, False otherwise
Examples:
>>> is_city_supported("madrid")
True
>>> is_city_supported("barcelona")
False
"""
# Currently only Madrid has school calendars configured
supported_cities = {"madrid"}
return city_id in supported_cities
def get_supported_cities() -> list[str]:
"""
Get list of city IDs that have school calendars configured.
Returns:
List of supported city_id values
Examples:
>>> get_supported_cities()
['madrid']
"""
return ["madrid"]