# services/forecasting/app/services/prediction_service.py - FIXED SEASON FEATURE
"""
Prediction service for loading models and generating predictions
FIXED: Added missing 'season' feature that matches training service exactly
"""
import structlog
from typing import Dict, List, Any, Optional
import asyncio
import pickle
import json
from datetime import datetime, date
import numpy as np
import pandas as pd
import httpx
from pathlib import Path
import os
import joblib
from app.core.config import settings
from shared.monitoring.metrics import MetricsCollector
from shared.database.base import create_database_manager
from shared.clients import get_sales_client
logger = structlog.get_logger()
metrics = MetricsCollector("forecasting-service")
class PredictionService:
"""
Service for loading ML models and generating predictions with dependency injection
Interfaces with trained Prophet models from the training service
"""
def __init__(self, database_manager=None):
self.database_manager = database_manager or create_database_manager(settings.DATABASE_URL, "forecasting-service")
self.model_cache = {}
self.cache_ttl = 3600 # 1 hour cache
# Initialize sales client for fetching historical data
self.sales_client = get_sales_client(settings, "forecasting")
async def validate_prediction_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
"""Validate prediction request"""
try:
required_fields = ["inventory_product_id", "model_id", "features"]
missing_fields = [field for field in required_fields if field not in request]
if missing_fields:
return {
"is_valid": False,
"errors": [f"Missing required fields: {missing_fields}"],
"validation_passed": False
}
return {
"is_valid": True,
"errors": [],
"validation_passed": True,
"validated_fields": list(request.keys())
}
except Exception as e:
logger.error("Validation error", error=str(e))
return {
"is_valid": False,
"errors": [str(e)],
"validation_passed": False
}
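# Example: a request missing 'features' validates to
# {"is_valid": False, "errors": ["Missing required fields: features"],
# "validation_passed": False}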
async def predict(self, model_id: str, model_path: str, features: Dict[str, Any],
confidence_level: float = 0.8) -> Dict[str, float]:
"""Generate prediction using trained model"""
start_time = datetime.now()
try:
logger.info("Generating prediction",
model_id=model_id,
features_count=len(features))
# Load model
model = await self._load_model(model_id, model_path)
if not model:
raise ValueError(f"Model {model_id} not found or failed to load")
# CRITICAL FIX: Fetch historical sales data and calculate historical features
# This populates lag, rolling, and trend features for better predictions
# Using 90 days for better trend analysis and more robust rolling statistics
if 'tenant_id' in features and 'inventory_product_id' in features and 'date' in features:
try:
forecast_date = pd.to_datetime(features['date'])
historical_sales = await self._fetch_historical_sales(
tenant_id=features['tenant_id'],
inventory_product_id=features['inventory_product_id'],
forecast_date=forecast_date,
days_back=90 # 90 days gives more robust rolling and trend statistics
)
# Calculate historical features and merge into features dict
historical_features = self._calculate_historical_features(
historical_sales, forecast_date
)
features.update(historical_features)
logger.info("Historical features enriched",
lag_1_day=historical_features.get('lag_1_day'),
rolling_mean_7d=historical_features.get('rolling_mean_7d'))
except Exception as e:
logger.warning("Failed to enrich with historical features, using defaults",
error=str(e))
# Features dict will use defaults (0.0) from _prepare_prophet_features
# Prepare features for Prophet model
prophet_df = self._prepare_prophet_features(features)
# Generate prediction
forecast = model.predict(prophet_df)
# Extract prediction values
prediction_value = float(forecast['yhat'].iloc[0])
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Calculate confidence interval
confidence_interval = upper_bound - lower_bound
result = {
"prediction": max(0, prediction_value), # Ensure non-negative
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_interval": confidence_interval,
"confidence_level": confidence_level
}
# Record metrics
processing_time = (datetime.now() - start_time).total_seconds()
# Record metrics with proper registration and error handling
try:
# Register metrics if not already registered
if "prediction_processing_time" not in metrics._histograms:
metrics.register_histogram(
"prediction_processing_time",
"Time taken to process predictions",
labels=['service', 'model_type']
)
if "predictions_served_total" not in metrics._counters:
try:
metrics.register_counter(
"predictions_served_total",
"Total number of predictions served",
labels=['service', 'status']
)
except Exception as reg_error:
# Metric might already exist in global registry
logger.debug("Counter already exists in registry", error=str(reg_error))
# Now record the metrics
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
except Exception as metrics_error:
# Log metrics error but don't fail the prediction
logger.warning("Failed to record metrics", error=str(metrics_error))
logger.info("Prediction generated successfully",
model_id=model_id,
prediction=result["prediction"],
processing_time=processing_time)
return result
except Exception as e:
logger.error("Error generating prediction",
error=str(e),
model_id=model_id)
try:
if "prediction_errors_total" not in metrics._counters:
metrics.register_counter(
"prediction_errors_total",
"Total number of prediction errors",
labels=['service', 'error_type']
)
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception:
pass # Don't fail on metrics errors
raise
async def predict_with_weather_forecast(
self,
model_id: str,
model_path: str,
features: Dict[str, Any],
tenant_id: str,
days: int = 7,
confidence_level: float = 0.8
) -> List[Dict[str, float]]:
"""
Generate predictions enriched with real weather forecast data
This method:
1. Loads the trained ML model
2. Fetches real weather forecast from external service
3. Enriches prediction features with actual forecast data
4. Generates weather-aware predictions
Args:
model_id: ID of the trained model
model_path: Path to model file
features: Base features for prediction
tenant_id: Tenant ID for weather forecast
days: Number of days to forecast
confidence_level: Confidence level for predictions
Returns:
List of predictions with weather-aware adjustments
"""
from app.services.data_client import data_client
start_time = datetime.now()
try:
logger.info("Generating weather-aware predictions",
model_id=model_id,
days=days)
# Step 1: Load ML model
model = await self._load_model(model_id, model_path)
if not model:
raise ValueError(f"Model {model_id} not found")
# Step 2: Fetch real weather forecast
latitude = features.get('latitude', 40.4168)
longitude = features.get('longitude', -3.7038)
weather_forecast = await data_client.fetch_weather_forecast(
tenant_id=tenant_id,
days=days,
latitude=latitude,
longitude=longitude
)
logger.info(f"Fetched weather forecast for {len(weather_forecast)} days",
tenant_id=tenant_id)
# Step 3: Generate predictions for each day with weather data
predictions = []
for day_offset in range(days):
# Get weather for this specific day
day_weather = weather_forecast[day_offset] if day_offset < len(weather_forecast) else {}
# Advance the forecast date for this day, then enrich features with
# the actual weather forecast (otherwise every day of the horizon
# would be predicted for the same 'ds' timestamp)
day_date = pd.to_datetime(features['date']) + pd.Timedelta(days=day_offset)
enriched_features = features.copy()
enriched_features['date'] = day_date.strftime("%Y-%m-%d")
enriched_features.update({
'temperature': day_weather.get('temperature', features.get('temperature', 20.0)),
'precipitation': day_weather.get('precipitation', features.get('precipitation', 0.0)),
'humidity': day_weather.get('humidity', features.get('humidity', 60.0)),
'wind_speed': day_weather.get('wind_speed', features.get('wind_speed', 10.0)),
'pressure': day_weather.get('pressure', features.get('pressure', 1013.0)),
'weather_description': day_weather.get('description', 'Clear')
})
# Prepare Prophet dataframe with weather features
prophet_df = self._prepare_prophet_features(enriched_features)
# Generate prediction for this day
forecast = model.predict(prophet_df)
prediction_value = float(forecast['yhat'].iloc[0])
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Apply weather-based adjustments (business rules)
adjusted_prediction = self._apply_weather_adjustments(
prediction_value,
day_weather,
features.get('product_category', 'general')
)
predictions.append({
"date": enriched_features['date'],
"prediction": max(0, adjusted_prediction),
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_level": confidence_level,
"weather": {
"temperature": enriched_features['temperature'],
"precipitation": enriched_features['precipitation'],
"description": enriched_features['weather_description']
}
})
processing_time = (datetime.now() - start_time).total_seconds()
logger.info("Weather-aware predictions generated",
model_id=model_id,
days=len(predictions),
processing_time=processing_time)
return predictions
except Exception as e:
logger.error("Error generating weather-aware predictions",
error=str(e),
model_id=model_id)
raise
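# Each element of the returned list has the shape:
# {"date": "YYYY-MM-DD", "prediction": float, "lower_bound": float,
# "upper_bound": float, "confidence_level": float,
# "weather": {"temperature": ..., "precipitation": ..., "description": ...}}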
def _apply_weather_adjustments(
self,
base_prediction: float,
weather: Dict[str, Any],
product_category: str
) -> float:
"""
Apply business rules based on weather conditions
Adjusts predictions based on real weather forecast
"""
adjusted = base_prediction
temp = weather.get('temperature', 20.0)
precip = weather.get('precipitation', 0.0)
# Temperature-based adjustments
if product_category == 'ice_cream':
if temp > 30:
adjusted *= 1.4 # +40% for very hot days
elif temp > 25:
adjusted *= 1.2 # +20% for hot days
elif temp < 15:
adjusted *= 0.7 # -30% for cold days
elif product_category == 'bread':
if temp > 30:
adjusted *= 0.9 # -10% for very hot days
elif temp < 10:
adjusted *= 1.1 # +10% for cold days
elif product_category == 'coffee':
if temp < 15:
adjusted *= 1.2 # +20% for cold days
elif precip > 5:
adjusted *= 1.15 # +15% for rainy days
# Precipitation-based adjustments
if precip > 10: # Heavy rain
if product_category in ['pastry', 'coffee']:
adjusted *= 1.2 # People stay indoors, buy comfort food
return adjusted
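# Worked example: coffee at 12°C with 12mm of rain gets *1.2 for the
# cold-day rule, then *1.2 again for heavy rain on a comfort category,
# so a base prediction of 100 becomes 100 * 1.2 * 1.2 = 144.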
async def _load_model(self, model_id: str, model_path: str):
"""Load model from file with improved validation and error handling"""
# Enhanced model file validation
if not await self._validate_model_file(model_path):
logger.error(f"Model file not valid: {model_path}")
return None
# Check cache first
if model_id in self.model_cache:
cached_model, cached_time = self.model_cache[model_id]
# total_seconds() is required here: timedelta.seconds wraps at 24h,
# which would make day-old cache entries look fresh
if (datetime.now() - cached_time).total_seconds() < self.cache_ttl:
return cached_model
try:
if os.path.exists(model_path):
# Try multiple loading methods for compatibility
model = await self._load_model_safely(model_path)
if model is None:
logger.error(f"Failed to load model from: {model_path}")
return None
# Cache the model
self.model_cache[model_id] = (model, datetime.now())
logger.info(f"Model loaded successfully: {model_path}")
return model
else:
logger.error(f"Model file not found: {model_path}")
return None
except Exception as e:
logger.error(f"Error loading model: {e}")
return None
async def _load_model_safely(self, model_path: str):
"""Safely load model with multiple fallback methods"""
# Method 1: Try joblib first (recommended for sklearn/Prophet models)
try:
logger.debug(f"Attempting to load model with joblib: {model_path}")
model = joblib.load(model_path)
logger.info(f"Model loaded successfully with joblib")
return model
except Exception as e:
logger.warning(f"Joblib loading failed: {e}")
# Method 2: Try pickle as fallback
try:
logger.debug(f"Attempting to load model with pickle: {model_path}")
with open(model_path, 'rb') as f:
model = pickle.load(f)
logger.info(f"Model loaded successfully with pickle")
return model
except Exception as e:
logger.warning(f"Pickle loading failed: {e}")
# Method 3: Try pandas pickle (for Prophet models saved with pandas)
try:
logger.debug(f"Attempting to load model with pandas: {model_path}")
import pandas as pd
model = pd.read_pickle(model_path)
logger.info(f"Model loaded successfully with pandas")
return model
except Exception as e:
logger.warning(f"Pandas loading failed: {e}")
logger.error(f"All loading methods failed for: {model_path}")
return None
async def _validate_model_file(self, model_path: str) -> bool:
"""Enhanced model file validation"""
try:
if not os.path.exists(model_path):
logger.error(f"Model file not found: {model_path}")
return False
# Check file size (should be > 1KB for a trained model)
file_size = os.path.getsize(model_path)
if file_size < 1024:
logger.warning(f"Model file too small ({file_size} bytes): {model_path}")
return False
# More comprehensive file format detection
try:
with open(model_path, 'rb') as f:
header = f.read(16) # Read more bytes for better detection
# Check for various pickle/joblib signatures
valid_signatures = [
b']\x93PICKLE', # Joblib
b'\x80\x03', # Pickle protocol 3
b'\x80\x04', # Pickle protocol 4
b'\x80\x05', # Pickle protocol 5
b'}\x94', # Newer joblib format
b'}\x93', # Alternative joblib format
]
is_valid_format = any(header.startswith(sig) for sig in valid_signatures)
if not is_valid_format:
# Log header for debugging but don't fail validation
logger.warning(f"Unrecognized file header: {header[:8]} for {model_path}")
logger.info("Proceeding with loading attempt despite unrecognized header")
# Return True to allow loading attempt - some valid files may have different headers
return True
return True
except Exception as e:
logger.error(f"Error reading model file header: {e}")
return False
except Exception as e:
logger.error(f"Model validation error: {e}")
return False
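# Note: pickle streams written with protocol >= 2 start with the PROTO
# opcode 0x80 plus the protocol number, hence the b'\x80\x03'..b'\x80\x05'
# signatures above; joblib headers vary by version, which is why an
# unrecognized header only logs a warning instead of failing validation.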
async def _fetch_historical_sales(
self,
tenant_id: str,
inventory_product_id: str,
forecast_date: datetime,
days_back: int = 90
) -> pd.Series:
"""
Fetch historical sales data for calculating lagged and rolling features.
Args:
tenant_id: Tenant UUID
inventory_product_id: Product UUID
forecast_date: The date we're forecasting for
days_back: Number of days of history to fetch (default 90 for better trend analysis)
Returns:
pandas Series with sales quantities indexed by date
"""
try:
# Calculate date range
end_date = forecast_date - pd.Timedelta(days=1) # Day before forecast
start_date = end_date - pd.Timedelta(days=days_back)
logger.debug("Fetching historical sales for feature calculation",
tenant_id=tenant_id,
product_id=inventory_product_id,
start_date=start_date.date(),
end_date=end_date.date(),
days_back=days_back)
# Fetch sales data from sales service
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
product_id=inventory_product_id,
aggregation="daily"
)
if not sales_data:
logger.warning("No historical sales data found",
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
# Convert to pandas Series indexed by date
df = pd.DataFrame(sales_data)
df['sale_date'] = pd.to_datetime(df['sale_date'])
df = df.set_index('sale_date')
# Extract quantity column (could be 'quantity' or 'total_quantity')
if 'quantity' in df.columns:
series = df['quantity']
elif 'total_quantity' in df.columns:
series = df['total_quantity']
else:
logger.warning("Sales data missing quantity field",
columns=list(df.columns))
return pd.Series(dtype=float)
logger.debug("Historical sales fetched successfully",
records=len(series),
date_range=f"{series.index.min()} to {series.index.max()}")
return series.sort_index()
except Exception as e:
logger.error("Error fetching historical sales",
error=str(e),
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
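# The sales client is assumed to return daily rows shaped roughly like
# [{"sale_date": "2025-08-01", "quantity": 42.0}, ...]; only 'sale_date'
# plus a 'quantity' or 'total_quantity' column are consumed above.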
def _calculate_historical_features(
self,
historical_sales: pd.Series,
forecast_date: datetime
) -> Dict[str, float]:
"""
Calculate lagged, rolling, and trend features from historical sales data.
Args:
historical_sales: Series of sales quantities indexed by date
forecast_date: The date we're forecasting for
Returns:
Dictionary of calculated features
"""
features = {}
try:
if len(historical_sales) == 0:
logger.warning("No historical data available, using default values")
# Return all features with default values (0.0)
return {
# Lagged features
'lag_1_day': 0.0,
'lag_7_day': 0.0,
'lag_14_day': 0.0,
# Rolling statistics (7-day window)
'rolling_mean_7d': 0.0,
'rolling_std_7d': 0.0,
'rolling_max_7d': 0.0,
'rolling_min_7d': 0.0,
# Rolling statistics (14-day window)
'rolling_mean_14d': 0.0,
'rolling_std_14d': 0.0,
'rolling_max_14d': 0.0,
'rolling_min_14d': 0.0,
# Rolling statistics (30-day window)
'rolling_mean_30d': 0.0,
'rolling_std_30d': 0.0,
'rolling_max_30d': 0.0,
'rolling_min_30d': 0.0,
# Trend features
'days_since_start': 0,
'momentum_1_7': 0.0,
'trend_7_30': 0.0,
'velocity_week': 0.0,
}
# Calculate lagged features
features['lag_1_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) >= 1 else 0.0
features['lag_7_day'] = float(historical_sales.iloc[-7]) if len(historical_sales) >= 7 else features['lag_1_day']
features['lag_14_day'] = float(historical_sales.iloc[-14]) if len(historical_sales) >= 14 else features['lag_7_day']
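# NOTE: these positional lags assume one row per calendar day with no
# gaps; if the series has missing days, iloc[-7] may not be exactly
# seven calendar days before the forecast date.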
# Calculate rolling statistics (7-day window)
if len(historical_sales) >= 7:
window_7d = historical_sales.iloc[-7:]
features['rolling_mean_7d'] = float(window_7d.mean())
features['rolling_std_7d'] = float(window_7d.std())
features['rolling_max_7d'] = float(window_7d.max())
features['rolling_min_7d'] = float(window_7d.min())
else:
features['rolling_mean_7d'] = features['lag_1_day']
features['rolling_std_7d'] = 0.0
features['rolling_max_7d'] = features['lag_1_day']
features['rolling_min_7d'] = features['lag_1_day']
# Calculate rolling statistics (14-day window)
if len(historical_sales) >= 14:
window_14d = historical_sales.iloc[-14:]
features['rolling_mean_14d'] = float(window_14d.mean())
features['rolling_std_14d'] = float(window_14d.std())
features['rolling_max_14d'] = float(window_14d.max())
features['rolling_min_14d'] = float(window_14d.min())
else:
features['rolling_mean_14d'] = features['rolling_mean_7d']
features['rolling_std_14d'] = features['rolling_std_7d']
features['rolling_max_14d'] = features['rolling_max_7d']
features['rolling_min_14d'] = features['rolling_min_7d']
# Calculate rolling statistics (30-day window)
if len(historical_sales) >= 30:
window_30d = historical_sales.iloc[-30:]
features['rolling_mean_30d'] = float(window_30d.mean())
features['rolling_std_30d'] = float(window_30d.std())
features['rolling_max_30d'] = float(window_30d.max())
features['rolling_min_30d'] = float(window_30d.min())
else:
features['rolling_mean_30d'] = features['rolling_mean_14d']
features['rolling_std_30d'] = features['rolling_std_14d']
features['rolling_max_30d'] = features['rolling_max_14d']
features['rolling_min_30d'] = features['rolling_min_14d']
# Calculate trend features
if len(historical_sales) > 0:
# Days since first sale
features['days_since_start'] = (forecast_date - historical_sales.index[0]).days
# Momentum (difference between recent lag_1_day and lag_7_day)
if len(historical_sales) >= 7:
features['momentum_1_7'] = features['lag_1_day'] - features['lag_7_day']
else:
features['momentum_1_7'] = 0.0
# Trend (difference between recent 7-day and 30-day averages)
if len(historical_sales) >= 30:
features['trend_7_30'] = features['rolling_mean_7d'] - features['rolling_mean_30d']
else:
features['trend_7_30'] = 0.0
# Velocity (rate of change over the last week)
if len(historical_sales) >= 7:
week_change = historical_sales.iloc[-1] - historical_sales.iloc[-7]
features['velocity_week'] = float(week_change / 7.0)
else:
features['velocity_week'] = 0.0
else:
features['days_since_start'] = 0
features['momentum_1_7'] = 0.0
features['trend_7_30'] = 0.0
features['velocity_week'] = 0.0
logger.debug("Historical features calculated",
lag_1_day=features['lag_1_day'],
rolling_mean_7d=features['rolling_mean_7d'],
rolling_mean_30d=features['rolling_mean_30d'],
momentum=features['momentum_1_7'])
return features
except Exception as e:
logger.error("Error calculating historical features",
error=str(e))
# Return default values on error
return {k: 0.0 for k in [
'lag_1_day', 'lag_7_day', 'lag_14_day',
'rolling_mean_7d', 'rolling_std_7d', 'rolling_max_7d', 'rolling_min_7d',
'rolling_mean_14d', 'rolling_std_14d', 'rolling_max_14d', 'rolling_min_14d',
'rolling_mean_30d', 'rolling_std_30d', 'rolling_max_30d', 'rolling_min_30d',
'momentum_1_7', 'trend_7_30', 'velocity_week'
]} | {'days_since_start': 0}
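# Worked example: yesterday's sales (lag_1_day) of 30 against sales a
# week earlier (lag_7_day) of 24 gives momentum_1_7 = 30 - 24 = 6; a
# 7-day mean of 28 against a 30-day mean of 25 gives trend_7_30 = 3,
# i.e. demand is running above its monthly baseline.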
def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
"""Convert features to Prophet-compatible DataFrame - COMPLETE FEATURE MATCHING"""
try:
# Create base DataFrame with required 'ds' column
df = pd.DataFrame({
'ds': [pd.to_datetime(features['date'])]
})
# ✅ FIX: Add ALL traffic features that training service uses
# Core traffic features
df['traffic_volume'] = float(features.get('traffic_volume', 100.0))
df['pedestrian_count'] = float(features.get('pedestrian_count', 50.0))
df['congestion_level'] = float(features.get('congestion_level', 1.0))
df['average_speed'] = float(features.get('average_speed', 30.0)) # ← MISSING FEATURE!
# Weather features
df['temperature'] = float(features.get('temperature', 15.0))
df['precipitation'] = float(features.get('precipitation', 0.0))
df['humidity'] = float(features.get('humidity', 60.0))
df['wind_speed'] = float(features.get('wind_speed', 5.0))
df['pressure'] = float(features.get('pressure', 1013.0))
df['temp_category'] = self._get_temp_category(df['temperature'].iloc[0])
# Extract date information for temporal features
forecast_date = pd.to_datetime(features['date'])
day_of_week = forecast_date.weekday() # 0=Monday, 6=Sunday
# ✅ FIX: Add ALL temporal features (must match training exactly!)
df['day_of_week'] = int(day_of_week)
df['day_of_month'] = int(forecast_date.day)
df['month'] = int(forecast_date.month)
df['quarter'] = int(forecast_date.quarter)
df['week_of_year'] = int(forecast_date.isocalendar().week)
# ✅ FIX: Add the missing 'season' feature that matches training exactly
df['season'] = self._get_season(forecast_date.month)
# Bakery-specific temporal features
df['is_weekend'] = int(day_of_week >= 5)
df['is_monday'] = int(day_of_week == 0)
df['is_tuesday'] = int(day_of_week == 1)
df['is_wednesday'] = int(day_of_week == 2)
df['is_thursday'] = int(day_of_week == 3)
df['is_friday'] = int(day_of_week == 4)
df['is_saturday'] = int(day_of_week == 5)
df['is_sunday'] = int(day_of_week == 6)
df['is_working_day'] = int(day_of_week < 5) # Working days (Mon-Fri)
# Season-based features (match training service)
df['is_spring'] = int(df['season'].iloc[0] == 2)
df['is_summer'] = int(df['season'].iloc[0] == 3)
df['is_autumn'] = int(df['season'].iloc[0] == 4)
df['is_winter'] = int(df['season'].iloc[0] == 1)
# ✅ PERFORMANCE FIX: Build all features at once to avoid DataFrame fragmentation
# Extract values once to avoid repeated iloc calls
temperature = df['temperature'].iloc[0]
humidity = df['humidity'].iloc[0]
pressure = df['pressure'].iloc[0]
wind_speed = df['wind_speed'].iloc[0]
precipitation = df['precipitation'].iloc[0]
traffic = df['traffic_volume'].iloc[0]
pedestrians = df['pedestrian_count'].iloc[0]
avg_speed = df['average_speed'].iloc[0]
congestion = df['congestion_level'].iloc[0]
season = df['season'].iloc[0]
is_weekend = df['is_weekend'].iloc[0]
# Build all new features as a dictionary
new_features = {
# Holiday features
'is_holiday': int(features.get('is_holiday', False)),
'is_school_holiday': int(features.get('is_school_holiday', False)),
# Month-based features
'is_january': int(forecast_date.month == 1),
'is_february': int(forecast_date.month == 2),
'is_march': int(forecast_date.month == 3),
'is_april': int(forecast_date.month == 4),
'is_may': int(forecast_date.month == 5),
'is_june': int(forecast_date.month == 6),
'is_july': int(forecast_date.month == 7),
'is_august': int(forecast_date.month == 8),
'is_september': int(forecast_date.month == 9),
'is_october': int(forecast_date.month == 10),
'is_november': int(forecast_date.month == 11),
'is_december': int(forecast_date.month == 12),
# Special day features
'is_month_start': int(forecast_date.day <= 3),
'is_month_end': int(forecast_date.day >= 28),
'is_payday_period': int((forecast_date.day <= 5) or (forecast_date.day >= 25)),
# CRITICAL FIX: Add is_payday feature to match training service
# Training defines: is_payday = (day == 15 OR is_month_end)
'is_payday': int((forecast_date.day == 15) or self._is_end_of_month(forecast_date)),
# Weather-based derived features
'temp_squared': temperature ** 2,
'is_cold_day': int(temperature < 10),
'is_hot_day': int(temperature > 25),
'is_pleasant_day': int(10 <= temperature <= 25),
# Humidity features
'humidity_squared': humidity ** 2,
'is_high_humidity': int(humidity > 70),
'is_low_humidity': int(humidity < 40),
# Pressure features
'pressure_squared': pressure ** 2,
'is_high_pressure': int(pressure > 1020),
'is_low_pressure': int(pressure < 1000),
# Wind features
'wind_squared': wind_speed ** 2,
'is_windy': int(wind_speed > 15),
'is_calm': int(wind_speed < 5),
# Precipitation features
'precip_squared': precipitation ** 2,
'precip_log': float(np.log1p(precipitation)),
'is_rainy_day': int(precipitation > 0.1),
'is_very_rainy_day': int(precipitation > 5.0),
'is_heavy_rain': int(precipitation > 10),
'rain_intensity': self._get_rain_intensity(precipitation),
# Traffic-based features
'high_traffic': int(traffic > 150) if traffic > 0 else 0,
'low_traffic': int(traffic < 50) if traffic > 0 else 0,
# Fix: Use same normalization as training (when std=0, normalized=0.0)
# Training uses constant 100.0 values, so std=0 and normalized=0.0
'traffic_normalized': 0.0, # Match training behavior for consistent predictions
'traffic_squared': traffic ** 2,
'traffic_log': float(np.log1p(traffic)),
# Pedestrian features
'high_pedestrian_count': int(pedestrians > 100),
'low_pedestrian_count': int(pedestrians < 25),
'pedestrian_normalized': float((pedestrians - 50) / 25),
'pedestrian_squared': pedestrians ** 2,
'pedestrian_log': float(np.log1p(pedestrians)),
# Speed features
'high_speed': int(avg_speed > 40),
'low_speed': int(avg_speed < 20),
'speed_normalized': float((avg_speed - 30) / 10),
'speed_squared': avg_speed ** 2,
'speed_log': float(np.log1p(avg_speed)),
# Congestion features
'high_congestion': int(congestion > 3),
'low_congestion': int(congestion < 2),
'congestion_squared': congestion ** 2,
# Day features
'is_peak_bakery_day': int(day_of_week in [4, 5, 6]),
'is_high_demand_month': int(forecast_date.month in [6, 7, 8, 12]),
'is_warm_season': int(forecast_date.month in [4, 5, 6, 7, 8, 9]),
# CRITICAL FIX: Cyclical encoding features (MATCH TRAINING)
# These encode day_of_week and month as sin/cos for cyclical patterns
'day_of_week_sin': float(np.sin(2 * np.pi * day_of_week / 7)),
'day_of_week_cos': float(np.cos(2 * np.pi * day_of_week / 7)),
'month_sin': float(np.sin(2 * np.pi * forecast_date.month / 12)),
'month_cos': float(np.cos(2 * np.pi * forecast_date.month / 12)),
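# e.g. Monday (day_of_week=0) encodes to sin=0.0, cos=1.0; the
# sin/cos pair keeps Sunday (6) and Monday (0) numerically adjacent
# instead of six units apart.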
# CRITICAL FIX: Historical features (lagged, rolling, trend)
# These will be populated from historical sales data
# Default to 0.0 here, will be updated if historical data is provided
'lag_1_day': float(features.get('lag_1_day', 0.0)),
'lag_7_day': float(features.get('lag_7_day', 0.0)),
'lag_14_day': float(features.get('lag_14_day', 0.0)),
'rolling_mean_7d': float(features.get('rolling_mean_7d', 0.0)),
'rolling_std_7d': float(features.get('rolling_std_7d', 0.0)),
'rolling_max_7d': float(features.get('rolling_max_7d', 0.0)),
'rolling_min_7d': float(features.get('rolling_min_7d', 0.0)),
'rolling_mean_14d': float(features.get('rolling_mean_14d', 0.0)),
'rolling_std_14d': float(features.get('rolling_std_14d', 0.0)),
'rolling_max_14d': float(features.get('rolling_max_14d', 0.0)),
'rolling_min_14d': float(features.get('rolling_min_14d', 0.0)),
'rolling_mean_30d': float(features.get('rolling_mean_30d', 0.0)),
'rolling_std_30d': float(features.get('rolling_std_30d', 0.0)),
'rolling_max_30d': float(features.get('rolling_max_30d', 0.0)),
'rolling_min_30d': float(features.get('rolling_min_30d', 0.0)),
'days_since_start': int(features.get('days_since_start', 0)),
'momentum_1_7': float(features.get('momentum_1_7', 0.0)),
'trend_7_30': float(features.get('trend_7_30', 0.0)),
'velocity_week': float(features.get('velocity_week', 0.0)),
}
# Calculate interaction features
is_holiday = new_features['is_holiday']
is_pleasant = new_features['is_pleasant_day']
is_rainy = new_features['is_rainy_day']
is_payday = new_features['is_payday']
interaction_features = {
# Weekend interactions
'weekend_temp_interaction': is_weekend * temperature,
'weekend_pleasant_weather': is_weekend * is_pleasant,
'weekend_traffic_interaction': is_weekend * traffic,
# Holiday interactions
'holiday_temp_interaction': is_holiday * temperature,
'holiday_traffic_interaction': is_holiday * traffic,
# CRITICAL FIX: Add payday_weekend_interaction to match training service
'payday_weekend_interaction': is_payday * is_weekend,
# Season interactions
'season_temp_interaction': season * temperature,
'season_traffic_interaction': season * traffic,
# Rain-traffic interactions
'rain_traffic_interaction': is_rainy * traffic,
'rain_speed_interaction': is_rainy * avg_speed,
# CRITICAL FIX: Add missing interaction features from training
'rain_weekend_interaction': is_rainy * is_weekend,
'friday_traffic_interaction': int(day_of_week == 4) * traffic,
# Day-weather interactions
'day_temp_interaction': day_of_week * temperature,
'month_temp_interaction': forecast_date.month * temperature,
# Traffic-speed interactions
'traffic_speed_interaction': traffic * avg_speed,
'pedestrian_speed_interaction': pedestrians * avg_speed,
# Congestion interactions
'congestion_temp_interaction': congestion * temperature,
'congestion_weekend_interaction': congestion * is_weekend
}
# Combine all features
all_new_features = {**new_features, **interaction_features}
# Add all features at once using pd.concat to avoid fragmentation
new_feature_df = pd.DataFrame([all_new_features])
df = pd.concat([df, new_feature_df], axis=1)
logger.debug("Complete Prophet features prepared",
feature_count=len(df.columns),
date=features['date'],
season=df['season'].iloc[0],
traffic_volume=df['traffic_volume'].iloc[0],
average_speed=df['average_speed'].iloc[0],
pedestrian_count=df['pedestrian_count'].iloc[0])
return df
except Exception as e:
logger.error("Error preparing Prophet features", error=str(e))
raise
def _get_season(self, month: int) -> int:
"""Get season from month (1-4 for Winter, Spring, Summer, Autumn) - MATCH TRAINING"""
if month in [12, 1, 2]:
return 1 # Winter
elif month in [3, 4, 5]:
return 2 # Spring
elif month in [6, 7, 8]:
return 3 # Summer
else:
return 4 # Autumn
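# Example: _get_season(11) == 4 (November -> Autumn) and
# _get_season(2) == 1 (February -> Winter).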
def _is_school_holiday(self, date: datetime) -> bool:
"""Check if a date is during school holidays - MATCH TRAINING"""
month = date.month
# Approximate Spanish school holiday periods
if month in [7, 8]: # Summer holidays
return True
if month == 12 and date.day >= 20: # Christmas holidays
return True
if month == 1 and date.day <= 10: # Christmas holidays continued
return True
if month == 4 and date.day <= 15: # Easter holidays (approximate)
return True
return False
def _get_temp_category(self, temperature: float) -> int:
"""Get temperature category (0-3) - MATCH TRAINING"""
if temperature <= 5:
return 0 # Very cold
elif temperature <= 15:
return 1 # Cold
elif temperature <= 25:
return 2 # Mild
else:
return 3 # Hot
def _get_rain_intensity(self, precipitation: float) -> int:
"""Get rain intensity category (0-3) - MATCH TRAINING"""
if precipitation <= 0:
return 0 # No rain
elif precipitation <= 2:
return 1 # Light rain
elif precipitation <= 10:
return 2 # Moderate rain
else:
return 3 # Heavy rain
def _is_end_of_month(self, date: datetime) -> bool:
"""
Check if date is the last day of the month - MATCH TRAINING SERVICE
Training uses: df[date_column].dt.is_month_end
"""
import calendar
# Get the last day of the month
last_day = calendar.monthrange(date.year, date.month)[1]
return date.day == last_day
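# Example: calendar.monthrange(2024, 2)[1] == 29, so 2024-02-29 counts
# as month end, matching pandas' dt.is_month_end used in training.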