# services/forecasting/app/services/prediction_service.py - FIXED SEASON FEATURE
"""
Prediction service for loading models and generating predictions
FIXED: Added missing 'season' feature that matches training service exactly
"""
import structlog
from typing import Dict, List, Any, Optional
import asyncio
import pickle
import json
from datetime import datetime, date
import numpy as np
import pandas as pd
import httpx
from pathlib import Path
import os
import joblib
import io
from app.core.config import settings
from shared.monitoring.metrics import MetricsCollector
from shared.database.base import create_database_manager
from shared.clients import get_sales_client
logger = structlog.get_logger()
metrics = MetricsCollector("forecasting-service")
class PredictionService:
"""
Service for loading ML models and generating predictions with dependency injection
Interfaces with trained Prophet models from the training service
"""
def __init__(self, database_manager=None):
self.database_manager = database_manager or create_database_manager(settings.DATABASE_URL, "forecasting-service")
self.model_cache = {}
self.cache_ttl = 3600 # 1 hour cache
# Initialize sales client for fetching historical data
self.sales_client = get_sales_client(settings, "forecasting")
async def validate_prediction_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
"""Validate prediction request"""
try:
required_fields = ["inventory_product_id", "model_id", "features"]
missing_fields = [field for field in required_fields if field not in request]
if missing_fields:
return {
"is_valid": False,
"errors": [f"Missing required fields: {missing_fields}"],
"validation_passed": False
}
return {
"is_valid": True,
"errors": [],
"validation_passed": True,
"validated_fields": list(request.keys())
}
except Exception as e:
logger.error("Validation error", error=str(e))
return {
"is_valid": False,
"errors": [str(e)],
"validation_passed": False
}
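    # Illustrative behavior (hypothetical request): a payload missing "features"
    # yields:
    #     {"is_valid": False,
    #      "errors": ["Missing required fields: ['features']"],
    #      "validation_passed": False}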
async def predict(self, model_id: str, model_path: str, features: Dict[str, Any],
confidence_level: float = 0.8) -> Dict[str, float]:
"""Generate prediction using trained model"""
start_time = datetime.now()
try:
logger.info("Generating prediction",
model_id=model_id,
features_count=len(features))
# Load model
model = await self._load_model(model_id, model_path)
if not model:
raise ValueError(f"Model {model_id} not found or failed to load")
# CRITICAL FIX: Fetch historical sales data and calculate historical features
# This populates lag, rolling, and trend features for better predictions
# Using 90 days for better trend analysis and more robust rolling statistics
if 'tenant_id' in features and 'inventory_product_id' in features and 'date' in features:
try:
forecast_date = pd.to_datetime(features['date'])
historical_sales = await self._fetch_historical_sales(
tenant_id=features['tenant_id'],
inventory_product_id=features['inventory_product_id'],
forecast_date=forecast_date,
days_back=90 # Changed from 30 to 90 for better historical context
)
# Calculate historical features and merge into features dict
historical_features = self._calculate_historical_features(
historical_sales, forecast_date
)
features.update(historical_features)
logger.info("Historical features enriched",
lag_1_day=historical_features.get('lag_1_day'),
rolling_mean_7d=historical_features.get('rolling_mean_7d'))
except Exception as e:
logger.warning("Failed to enrich with historical features, using defaults",
error=str(e))
# Features dict will use defaults (0.0) from _prepare_prophet_features
            # CRITICAL FIX: Fetch POI (Point of Interest) features from the external service
            # Prophet models trained with POI features REQUIRE them during prediction
            # This prevents "Regressor 'poi_retail_total_count' missing" errors
            if 'tenant_id' in features:
                # Build the default POI feature set once; it is used as a fallback both
                # when no POI data exists for the tenant and when the fetch fails.
                # Feature names cover everything the POI detection service generates
                # (format: poi_{category}_{feature_name}); categories match
                # POI_CATEGORIES in services/external/app/core/poi_config.py.
                poi_categories = [
                    'schools', 'offices', 'gyms_sports', 'residential', 'tourism',
                    'competitors', 'transport_hubs', 'coworking', 'retail'
                ]
                default_poi_features = {}
                for category in poi_categories:
                    default_poi_features.update({
                        f'poi_{category}_proximity_score': 0.0,
                        f'poi_{category}_weighted_proximity_score': 0.0,
                        f'poi_{category}_count_0_100m': 0,
                        f'poi_{category}_count_100_300m': 0,
                        f'poi_{category}_count_300_500m': 0,
                        f'poi_{category}_count_500_1000m': 0,
                        f'poi_{category}_total_count': 0,
                        f'poi_{category}_distance_to_nearest_m': 9999.0,
                        f'poi_{category}_has_within_100m': 0,
                        f'poi_{category}_has_within_300m': 0,
                        f'poi_{category}_has_within_500m': 0,
                    })
                try:
                    from shared.clients.external_client import ExternalServiceClient
                    external_client = ExternalServiceClient(settings, "forecasting-service")
                    poi_data = await external_client.get_poi_context(features['tenant_id'])
                    if poi_data and 'ml_features' in poi_data:
                        # Add all POI ML features to prediction features
                        poi_features = poi_data['ml_features']
                        features.update(poi_features)
                        logger.info("POI features enriched",
                                    tenant_id=features['tenant_id'],
                                    poi_feature_count=len(poi_features))
                    else:
                        logger.warning("No POI data available for tenant, using default POI features",
                                       tenant_id=features['tenant_id'])
                        features.update(default_poi_features)
                        logger.info("Using default POI features",
                                    tenant_id=features['tenant_id'],
                                    default_feature_count=len(default_poi_features))
                except Exception as e:
                    # On error, still provide default POI features to prevent prediction failures
                    logger.error("Failed to fetch POI features, using defaults",
                                 error=str(e),
                                 tenant_id=features.get('tenant_id'))
                    features.update(default_poi_features)
# Prepare features for Prophet model
prophet_df = self._prepare_prophet_features(features)
# CRITICAL FIX: Validate that model's required regressors are present
# Warn if using default values for features the model was trained with
if hasattr(model, 'extra_regressors'):
model_regressors = set(model.extra_regressors.keys()) if model.extra_regressors else set()
provided_features = set(prophet_df.columns) - {'ds'}
# Check for missing regressors
missing_regressors = model_regressors - provided_features
if missing_regressors:
logger.warning(
"Model trained with regressors that are missing in prediction",
model_id=model_id,
missing_regressors=list(missing_regressors)[:10], # Log first 10
total_missing=len(missing_regressors)
)
# Check for default-valued critical features
critical_features = {
'traffic_volume', 'temperature', 'precipitation',
'lag_1_day', 'rolling_mean_7d'
}
using_defaults = []
for feature in critical_features:
if feature in model_regressors:
value = features.get(feature, 0)
# Check if using default/fallback values
if (feature == 'traffic_volume' and value == 100.0) or \
(feature == 'temperature' and value == 15.0) or \
(feature in ['lag_1_day', 'rolling_mean_7d'] and value == 0.0):
using_defaults.append(feature)
if using_defaults:
logger.warning(
"Using default values for critical model features",
model_id=model_id,
features_with_defaults=using_defaults
)
# Generate prediction
forecast = model.predict(prophet_df)
# Extract prediction values
prediction_value = float(forecast['yhat'].iloc[0])
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Calculate confidence interval
confidence_interval = upper_bound - lower_bound
# Adjust confidence based on data freshness if historical features were calculated
adjusted_confidence_level = confidence_level
data_availability_score = features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# For data availability score < 0.5 (more than 90 days old), reduce confidence
adjusted_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
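                # Illustrative arithmetic (hypothetical values): with
                # confidence_level=0.8 and data_availability_score=0.3,
                # adjusted_confidence_level = max(0.6, 0.8 * 0.3) = 0.6 and
                # adjustment_factor = 1.0 + 0.5 * 0.7 = 1.35, i.e. the interval
                # below widens by 35% around the point prediction.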
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=adjusted_confidence_level,
data_availability_score=data_availability_score,
original_interval=confidence_interval,
adjusted_interval=adjusted_upper_bound - adjusted_lower_bound)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
confidence_interval = upper_bound - lower_bound
result = {
"prediction": max(0, prediction_value), # Ensure non-negative
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_interval": confidence_interval,
"confidence_level": adjusted_confidence_level,
"data_freshness_score": data_availability_score # Include data freshness in result
}
# Record metrics
processing_time = (datetime.now() - start_time).total_seconds()
# Record metrics with proper registration and error handling
try:
# Register metrics if not already registered
if "prediction_processing_time" not in metrics._histograms:
metrics.register_histogram(
"prediction_processing_time",
"Time taken to process predictions",
labels=['service', 'model_type']
)
if "predictions_served_total" not in metrics._counters:
try:
metrics.register_counter(
"predictions_served_total",
"Total number of predictions served",
labels=['service', 'status']
)
except Exception as reg_error:
# Metric might already exist in global registry
logger.debug("Counter already exists in registry", error=str(reg_error))
# Now record the metrics - try with expected labels, fallback if needed
try:
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
except Exception as label_error:
# If specific labels fail, try without labels to avoid breaking predictions
logger.warning("Failed to record metrics with labels, trying without", error=str(label_error))
try:
metrics.observe_histogram("prediction_processing_time", processing_time)
metrics.increment_counter("predictions_served_total")
except Exception as no_label_error:
logger.warning("Failed to record metrics even without labels", error=str(no_label_error))
except Exception as metrics_error:
# Log metrics error but don't fail the prediction
logger.warning("Failed to register or record metrics", error=str(metrics_error))
logger.info("Prediction generated successfully",
model_id=model_id,
prediction=result["prediction"],
processing_time=processing_time)
return result
except Exception as e:
logger.error("Error generating prediction",
error=str(e),
model_id=model_id)
# Record error metrics with robust error handling
try:
if "prediction_errors_total" not in metrics._counters:
metrics.register_counter(
"prediction_errors_total",
"Total number of prediction errors",
labels=['service', 'error_type']
)
# Try with labels first, then without if that fails
try:
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception as label_error:
logger.debug("Failed to record error metrics with labels", error=str(label_error))
try:
metrics.increment_counter("prediction_errors_total")
except Exception as no_label_error:
logger.warning("Failed to record error metrics even without labels", error=str(no_label_error))
except Exception as registration_error:
logger.warning("Failed to register error metrics", error=str(registration_error))
raise
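    # Illustrative usage (model ID, path, and identifiers are hypothetical):
    #
    #     service = PredictionService()
    #     result = await service.predict(
    #         model_id="prophet-2026-01",
    #         model_path="minio://ml-models/tenant-123/prophet-2026-01.joblib",
    #         features={"date": "2026-01-20", "tenant_id": "tenant-123",
    #                   "inventory_product_id": "product-456"},
    #     )
    #     # result -> {"prediction": ..., "lower_bound": ..., "upper_bound": ...,
    #     #            "confidence_interval": ..., "confidence_level": ...,
    #     #            "data_freshness_score": ...}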
async def predict_with_weather_forecast(
self,
model_id: str,
model_path: str,
features: Dict[str, Any],
tenant_id: str,
days: int = 7,
confidence_level: float = 0.8
) -> List[Dict[str, float]]:
"""
Generate predictions enriched with real weather forecast data
This method:
1. Loads the trained ML model
2. Fetches real weather forecast from external service
3. Enriches prediction features with actual forecast data
4. Generates weather-aware predictions
Args:
model_id: ID of the trained model
model_path: Path to model file
features: Base features for prediction
tenant_id: Tenant ID for weather forecast
days: Number of days to forecast
confidence_level: Confidence level for predictions
Returns:
List of predictions with weather-aware adjustments
"""
from app.services.data_client import data_client
start_time = datetime.now()
try:
logger.info("Generating weather-aware predictions",
model_id=model_id,
days=days)
# Step 1: Load ML model
model = await self._load_model(model_id, model_path)
if not model:
raise ValueError(f"Model {model_id} not found")
# Step 2: Fetch real weather forecast
latitude = features.get('latitude', 40.4168)
longitude = features.get('longitude', -3.7038)
weather_forecast = await data_client.fetch_weather_forecast(
tenant_id=tenant_id,
days=days,
latitude=latitude,
longitude=longitude
)
logger.info(f"Fetched weather forecast for {len(weather_forecast)} days",
tenant_id=tenant_id)
# Step 3: Generate predictions for each day with weather data
predictions = []
for day_offset in range(days):
# Get weather for this specific day
day_weather = weather_forecast[day_offset] if day_offset < len(weather_forecast) else {}
# Enrich features with actual weather forecast
                enriched_features = features.copy()
                # Advance the forecast date for each day in the horizon so that
                # temporal features and historical lookups match the day being
                # predicted (otherwise every day would reuse the base date)
                if 'date' in features:
                    enriched_features['date'] = (
                        pd.to_datetime(features['date']) + pd.Timedelta(days=day_offset)
                    ).strftime("%Y-%m-%d")
enriched_features.update({
'temperature': day_weather.get('temperature', features.get('temperature', 20.0)),
'precipitation': day_weather.get('precipitation', features.get('precipitation', 0.0)),
'humidity': day_weather.get('humidity', features.get('humidity', 60.0)),
'wind_speed': day_weather.get('wind_speed', features.get('wind_speed', 10.0)),
'pressure': day_weather.get('pressure', features.get('pressure', 1013.0)),
'weather_description': day_weather.get('description', 'Clear')
})
# CRITICAL FIX: Fetch historical sales data and calculate historical features
# This populates lag, rolling, and trend features for better predictions
# Using 90 days for better trend analysis and more robust rolling statistics
if 'tenant_id' in enriched_features and 'inventory_product_id' in enriched_features and 'date' in enriched_features:
try:
forecast_date = pd.to_datetime(enriched_features['date'])
historical_sales = await self._fetch_historical_sales(
tenant_id=enriched_features['tenant_id'],
inventory_product_id=enriched_features['inventory_product_id'],
forecast_date=forecast_date,
days_back=90 # Changed from 30 to 90 for better historical context
)
# Calculate historical features and merge into features dict
historical_features = self._calculate_historical_features(
historical_sales, forecast_date
)
enriched_features.update(historical_features)
logger.info("Historical features enriched",
lag_1_day=historical_features.get('lag_1_day'),
rolling_mean_7d=historical_features.get('rolling_mean_7d'))
except Exception as e:
logger.warning("Failed to enrich with historical features, using defaults",
error=str(e))
# Features dict will use defaults (0.0) from _prepare_prophet_features
# Prepare Prophet dataframe with weather features
prophet_df = self._prepare_prophet_features(enriched_features)
# Generate prediction for this day
forecast = model.predict(prophet_df)
prediction_value = float(forecast['yhat'].iloc[0])
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Calculate confidence adjustment based on data freshness
current_confidence_level = confidence_level
data_availability_score = enriched_features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Adjust confidence based on data freshness if historical features were calculated
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# For data availability score < 0.5 (more than 90 days old), reduce confidence
current_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted weather prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=current_confidence_level,
data_availability_score=data_availability_score)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
# Apply weather-based adjustments (business rules)
adjusted_prediction = self._apply_weather_adjustments(
prediction_value,
day_weather,
features.get('product_category', 'general')
)
predictions.append({
"date": enriched_features['date'],
"prediction": max(0, adjusted_prediction),
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_level": current_confidence_level,
"data_freshness_score": data_availability_score, # Include data freshness in result
"weather": {
"temperature": enriched_features['temperature'],
"precipitation": enriched_features['precipitation'],
"description": enriched_features['weather_description']
}
})
processing_time = (datetime.now() - start_time).total_seconds()
logger.info("Weather-aware predictions generated",
model_id=model_id,
days=len(predictions),
processing_time=processing_time)
return predictions
except Exception as e:
logger.error("Error generating weather-aware predictions",
error=str(e),
model_id=model_id)
raise
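    # Illustrative usage (identifiers are hypothetical); latitude/longitude
    # default to Madrid when not supplied in features:
    #
    #     predictions = await service.predict_with_weather_forecast(
    #         model_id="prophet-2026-01",
    #         model_path="minio://ml-models/tenant-123/prophet-2026-01.joblib",
    #         features={"date": "2026-01-20", "tenant_id": "tenant-123",
    #                   "inventory_product_id": "product-456"},
    #         tenant_id="tenant-123",
    #         days=7,
    #     )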
def _apply_weather_adjustments(
self,
base_prediction: float,
weather: Dict[str, Any],
product_category: str
) -> float:
"""
Apply business rules based on weather conditions
Adjusts predictions based on real weather forecast
"""
adjusted = base_prediction
temp = weather.get('temperature', 20.0)
precip = weather.get('precipitation', 0.0)
# Temperature-based adjustments
if product_category == 'ice_cream':
if temp > 30:
adjusted *= 1.4 # +40% for very hot days
elif temp > 25:
adjusted *= 1.2 # +20% for hot days
elif temp < 15:
adjusted *= 0.7 # -30% for cold days
elif product_category == 'bread':
if temp > 30:
adjusted *= 0.9 # -10% for very hot days
elif temp < 10:
adjusted *= 1.1 # +10% for cold days
elif product_category == 'coffee':
if temp < 15:
adjusted *= 1.2 # +20% for cold days
elif precip > 5:
adjusted *= 1.15 # +15% for rainy days
# Precipitation-based adjustments
if precip > 10: # Heavy rain
if product_category in ['pastry', 'coffee']:
adjusted *= 1.2 # People stay indoors, buy comfort food
return adjusted
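    # Illustrative arithmetic (hypothetical base values): 100 ice-cream units at
    # 31 °C become 100 * 1.4 = 140; 100 coffee units at 12 °C with 12 mm of rain
    # become 100 * 1.2 * 1.2 = 144, since the cold-day and heavy-rain rules stack.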
async def _load_model(self, model_id: str, model_path: str):
"""Load model from MinIO with improved validation and error handling"""
# Check cache first
if model_id in self.model_cache:
cached_model, cached_time = self.model_cache[model_id]
            if (datetime.now() - cached_time).total_seconds() < self.cache_ttl:
logger.debug(f"Model loaded from cache: {model_id}")
return cached_model
# Validate MinIO path format
if not await self._validate_model_file(model_path):
logger.error(f"Model path not valid: {model_path}")
return None
try:
# Load from MinIO
model = await self._load_model_safely(model_path)
if model is None:
logger.error(f"Failed to load model from MinIO: {model_path}")
return None
# Cache the model
self.model_cache[model_id] = (model, datetime.now())
logger.info(f"Model loaded successfully from MinIO: {model_path}")
return model
except Exception as e:
logger.error(f"Error loading model from MinIO: {e}")
return None
async def _load_model_safely(self, model_path: str):
"""Load model from MinIO storage (clean implementation - MinIO only)"""
try:
# Parse MinIO path: minio://bucket_name/object_path
_, bucket_and_path = model_path.split("://", 1)
bucket_name, object_name = bucket_and_path.split("/", 1)
logger.debug(f"Loading model from MinIO: {bucket_name}/{object_name}")
# Use MinIO client
from shared.clients.minio_client import minio_client
# Download model data
model_data = minio_client.get_object(bucket_name, object_name)
if not model_data:
logger.error(f"Failed to download model from MinIO: {model_path}")
return None
# Try joblib first (using BytesIO since joblib.load reads from file-like objects)
try:
buffer = io.BytesIO(model_data)
model = joblib.load(buffer)
logger.info(f"Model loaded successfully from MinIO with joblib")
return model
except Exception as e:
logger.warning(f"Joblib loading from MinIO failed: {e}")
# Try pickle as fallback
try:
model = pickle.loads(model_data)
logger.info(f"Model loaded successfully from MinIO with pickle")
return model
except Exception as e:
logger.warning(f"Pickle loading from MinIO failed: {e}")
logger.error(f"All loading methods failed for MinIO object: {model_path}")
return None
except Exception as e:
logger.error(f"Failed to load model from MinIO: {model_path}, error: {e}")
return None
async def _validate_model_file(self, model_path: str) -> bool:
"""Validate MinIO model path and check object exists"""
try:
# Validate MinIO path format
if not model_path.startswith("minio://"):
logger.error(f"Invalid model path format (expected minio://): {model_path}")
return False
# Parse MinIO path
try:
_, bucket_and_path = model_path.split("://", 1)
bucket_name, object_name = bucket_and_path.split("/", 1)
except ValueError:
logger.error(f"Cannot parse MinIO path: {model_path}")
return False
# Check if object exists in MinIO
from shared.clients.minio_client import minio_client
if not minio_client.object_exists(bucket_name, object_name):
logger.error(f"Model object not found in MinIO: {bucket_name}/{object_name}")
return False
# Check object metadata for size validation
metadata = minio_client.get_object_metadata(bucket_name, object_name)
if metadata:
file_size = metadata.get("size", 0)
if file_size < 1024:
logger.warning(f"Model object too small ({file_size} bytes): {model_path}")
return False
logger.debug(f"Model validated in MinIO: {bucket_name}/{object_name}, size={file_size}")
return True
except Exception as e:
logger.error(f"Model validation error: {e}")
return False
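    # Example of the expected path format (bucket and object names are
    # illustrative): "minio://ml-models/tenant-123/prophet.joblib" parses into
    # bucket_name="ml-models" and object_name="tenant-123/prophet.joblib".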
async def _fetch_historical_sales(
self,
tenant_id: str,
inventory_product_id: str,
forecast_date: datetime,
days_back: int = 90
) -> pd.Series:
"""
Fetch historical sales data for calculating lagged and rolling features.
Enhanced to handle cases where recent data is not available by extending
the search for the most recent data if needed.
Args:
tenant_id: Tenant UUID
inventory_product_id: Product UUID
forecast_date: The date we're forecasting for
days_back: Number of days of history to fetch (default 90 for better trend analysis)
Returns:
pandas Series with sales quantities indexed by date
"""
try:
# Calculate initial date range for recent data
end_date = forecast_date - pd.Timedelta(days=1) # Day before forecast
start_date = end_date - pd.Timedelta(days=days_back)
logger.debug("Fetching historical sales for feature calculation",
tenant_id=tenant_id,
product_id=inventory_product_id,
start_date=start_date.date(),
end_date=end_date.date(),
days_back=days_back)
# First, try to fetch sales data from the recent period
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
product_id=inventory_product_id,
aggregation="daily"
)
# If no recent data found, search for the most recent available data
if not sales_data:
logger.info("No recent sales data found, expanding search to find most recent data",
tenant_id=tenant_id,
product_id=inventory_product_id)
# Search for available data in larger time windows (up to 2 years back)
search_windows = [365, 730] # 1 year, 2 years
for window_days in search_windows:
extended_start_date = forecast_date - pd.Timedelta(days=window_days)
logger.debug("Expanding search window for historical data",
start_date=extended_start_date.date(),
end_date=end_date.date(),
window_days=window_days)
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=extended_start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
product_id=inventory_product_id,
aggregation="daily"
)
if sales_data:
logger.info("Found historical data in expanded search window",
tenant_id=tenant_id,
product_id=inventory_product_id,
                                data_start=sales_data[0].get('sale_date', 'unknown'),
                                data_end=sales_data[-1].get('sale_date', 'unknown'),
window_days=window_days)
break
if not sales_data:
logger.warning("No historical sales data found in any search window",
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
# Convert to pandas DataFrame and check if it has the expected structure
df = pd.DataFrame(sales_data)
# Check if the expected 'sale_date' column exists
if df.empty:
logger.warning("No historical sales data returned from API")
return pd.Series(dtype=float)
# Check for available columns and find date column
available_columns = list(df.columns)
logger.debug(f"Available sales data columns: {available_columns}")
# Check for alternative date column names
date_columns = ['sale_date', 'date', 'forecast_date', 'datetime', 'timestamp']
date_column = None
for col in date_columns:
if col in df.columns:
date_column = col
break
if date_column is None:
logger.error(f"Sales data missing expected date column. Available columns: {available_columns}")
logger.debug(f"Sample of sales data: {df.head()}")
return pd.Series(dtype=float)
df['sale_date'] = pd.to_datetime(df[date_column])
df = df.set_index('sale_date')
# Extract quantity column (could be 'quantity' or 'total_quantity')
if 'quantity' in df.columns:
series = df['quantity']
elif 'total_quantity' in df.columns:
series = df['total_quantity']
else:
logger.warning("Sales data missing quantity field",
columns=list(df.columns))
return pd.Series(dtype=float)
logger.debug("Historical sales fetched successfully",
records=len(series),
date_range=f"{series.index.min()} to {series.index.max()}")
return series.sort_index()
except Exception as e:
logger.error("Error fetching historical sales",
error=str(e),
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
def _calculate_historical_features(
self,
historical_sales: pd.Series,
forecast_date: datetime
) -> Dict[str, float]:
"""
Calculate lagged, rolling, and trend features from historical sales data.
Enhanced to handle cases where recent data is not available by using
available historical data with appropriate temporal adjustments.
Now uses shared feature calculator for consistency with training service.
Args:
historical_sales: Series of sales quantities indexed by date
forecast_date: The date we're forecasting for
Returns:
Dictionary of calculated features
"""
try:
# Use shared feature calculator for consistency
from shared.ml.feature_calculator import HistoricalFeatureCalculator
calculator = HistoricalFeatureCalculator()
# Calculate all features using shared calculator
features = calculator.calculate_all_features(
sales_data=historical_sales,
reference_date=forecast_date,
mode='prediction'
)
logger.debug("Historical features calculated (using shared calculator)",
lag_1_day=features.get('lag_1_day', 0.0),
rolling_mean_7d=features.get('rolling_mean_7d', 0.0),
rolling_mean_30d=features.get('rolling_mean_30d', 0.0),
momentum=features.get('momentum_1_7', 0.0),
days_since_last_sale=features.get('days_since_last_sale', 0),
data_availability_score=features.get('historical_data_availability_score', 0.0))
return features
except Exception as e:
logger.error("Error calculating historical features",
error=str(e))
# Return default values on error
return {k: 0.0 for k in [
'lag_1_day', 'lag_7_day', 'lag_14_day',
'rolling_mean_7d', 'rolling_std_7d', 'rolling_max_7d', 'rolling_min_7d',
'rolling_mean_14d', 'rolling_std_14d', 'rolling_max_14d', 'rolling_min_14d',
'rolling_mean_30d', 'rolling_std_30d', 'rolling_max_30d', 'rolling_min_30d',
'momentum_1_7', 'trend_7_30', 'velocity_week',
'days_since_last_sale', 'historical_data_availability_score'
]}
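    # Minimal sketch of the feature semantics (an assumption for documentation;
    # not the shared calculator's actual implementation):
    #
    #     recent = historical_sales[historical_sales.index < forecast_date]
    #     lag_1_day = float(recent.iloc[-1]) if len(recent) else 0.0
    #     rolling_mean_7d = float(recent.tail(7).mean()) if len(recent) else 0.0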
def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
"""Convert features to Prophet-compatible DataFrame - COMPLETE FEATURE MATCHING"""
try:
# Create base DataFrame with required 'ds' column
df = pd.DataFrame({
'ds': [pd.to_datetime(features['date'])]
})
# ✅ FIX: Add ALL traffic features that training service uses
# Core traffic features
df['traffic_volume'] = float(features.get('traffic_volume', 100.0))
df['pedestrian_count'] = float(features.get('pedestrian_count', 50.0))
df['congestion_level'] = float(features.get('congestion_level', 1.0))
df['average_speed'] = float(features.get('average_speed', 30.0)) # ← MISSING FEATURE!
# Weather features
df['temperature'] = float(features.get('temperature', 15.0))
df['precipitation'] = float(features.get('precipitation', 0.0))
df['humidity'] = float(features.get('humidity', 60.0))
df['wind_speed'] = float(features.get('wind_speed', 5.0))
df['pressure'] = float(features.get('pressure', 1013.0))
df['temp_category'] = self._get_temp_category(df['temperature'].iloc[0])
# Extract date information for temporal features
forecast_date = pd.to_datetime(features['date'])
day_of_week = forecast_date.weekday() # 0=Monday, 6=Sunday
# ✅ FIX: Add ALL temporal features (must match training exactly!)
df['day_of_week'] = int(day_of_week)
df['day_of_month'] = int(forecast_date.day)
df['month'] = int(forecast_date.month)
df['quarter'] = int(forecast_date.quarter)
df['week_of_year'] = int(forecast_date.isocalendar().week)
# ✅ FIX: Add the missing 'season' feature that matches training exactly
df['season'] = self._get_season(forecast_date.month)
# Bakery-specific temporal features
df['is_weekend'] = int(day_of_week >= 5)
df['is_monday'] = int(day_of_week == 0)
df['is_tuesday'] = int(day_of_week == 1)
df['is_wednesday'] = int(day_of_week == 2)
df['is_thursday'] = int(day_of_week == 3)
df['is_friday'] = int(day_of_week == 4)
df['is_saturday'] = int(day_of_week == 5)
df['is_sunday'] = int(day_of_week == 6)
df['is_working_day'] = int(day_of_week < 5) # Working days (Mon-Fri)
# Season-based features (match training service)
df['is_spring'] = int(df['season'].iloc[0] == 2)
df['is_summer'] = int(df['season'].iloc[0] == 3)
df['is_autumn'] = int(df['season'].iloc[0] == 4)
df['is_winter'] = int(df['season'].iloc[0] == 1)
# ✅ PERFORMANCE FIX: Build all features at once to avoid DataFrame fragmentation
# Extract values once to avoid repeated iloc calls
temperature = df['temperature'].iloc[0]
humidity = df['humidity'].iloc[0]
pressure = df['pressure'].iloc[0]
wind_speed = df['wind_speed'].iloc[0]
precipitation = df['precipitation'].iloc[0]
traffic = df['traffic_volume'].iloc[0]
pedestrians = df['pedestrian_count'].iloc[0]
avg_speed = df['average_speed'].iloc[0]
congestion = df['congestion_level'].iloc[0]
season = df['season'].iloc[0]
is_weekend = df['is_weekend'].iloc[0]
# Build all new features as a dictionary
new_features = {
# Holiday features
'is_holiday': int(features.get('is_holiday', False)),
'is_school_holiday': int(features.get('is_school_holiday', False)),
# Month-based features
'is_january': int(forecast_date.month == 1),
'is_february': int(forecast_date.month == 2),
'is_march': int(forecast_date.month == 3),
'is_april': int(forecast_date.month == 4),
'is_may': int(forecast_date.month == 5),
'is_june': int(forecast_date.month == 6),
'is_july': int(forecast_date.month == 7),
'is_august': int(forecast_date.month == 8),
'is_september': int(forecast_date.month == 9),
'is_october': int(forecast_date.month == 10),
'is_november': int(forecast_date.month == 11),
'is_december': int(forecast_date.month == 12),
# Special day features
'is_month_start': int(forecast_date.day <= 3),
'is_month_end': int(forecast_date.day >= 28),
'is_payday_period': int((forecast_date.day <= 5) or (forecast_date.day >= 25)),
# CRITICAL FIX: Add is_payday feature to match training service
# Training defines: is_payday = (day == 15 OR day == 28 OR is_month_end)
# Spain commonly pays on 28th, 15th, or last day of month
'is_payday': int((forecast_date.day == 15) or (forecast_date.day == 28) or self._is_end_of_month(forecast_date)),
# Weather-based derived features
'temp_squared': temperature ** 2,
'is_cold_day': int(temperature < 10),
'is_hot_day': int(temperature > 25),
'is_pleasant_day': int(10 <= temperature <= 25),
# Humidity features
'humidity_squared': humidity ** 2,
'is_high_humidity': int(humidity > 70),
'is_low_humidity': int(humidity < 40),
# Pressure features
'pressure_squared': pressure ** 2,
'is_high_pressure': int(pressure > 1020),
'is_low_pressure': int(pressure < 1000),
# Wind features
'wind_squared': wind_speed ** 2,
'is_windy': int(wind_speed > 15),
'is_calm': int(wind_speed < 5),
# Precipitation features
'precip_squared': precipitation ** 2,
'precip_log': float(np.log1p(precipitation)),
'is_rainy_day': int(precipitation > 0.1),
'is_very_rainy_day': int(precipitation > 5.0),
'is_heavy_rain': int(precipitation > 10),
'rain_intensity': self._get_rain_intensity(precipitation),
# Traffic-based features
'high_traffic': int(traffic > 150) if traffic > 0 else 0,
'low_traffic': int(traffic < 50) if traffic > 0 else 0,
# Fix: Use same normalization as training (when std=0, normalized=0.0)
# Training uses constant 100.0 values, so std=0 and normalized=0.0
'traffic_normalized': 0.0, # Match training behavior for consistent predictions
'traffic_squared': traffic ** 2,
'traffic_log': float(np.log1p(traffic)),
# Pedestrian features
'high_pedestrian_count': int(pedestrians > 100),
'low_pedestrian_count': int(pedestrians < 25),
'pedestrian_normalized': float((pedestrians - 50) / 25),
'pedestrian_squared': pedestrians ** 2,
'pedestrian_log': float(np.log1p(pedestrians)),
# Speed features
'high_speed': int(avg_speed > 40),
'low_speed': int(avg_speed < 20),
'speed_normalized': float((avg_speed - 30) / 10),
'speed_squared': avg_speed ** 2,
'speed_log': float(np.log1p(avg_speed)),
# Congestion features
'high_congestion': int(congestion > 3),
'low_congestion': int(congestion < 2),
'congestion_squared': congestion ** 2,
# Day features
'is_peak_bakery_day': int(day_of_week in [4, 5, 6]),
'is_high_demand_month': int(forecast_date.month in [6, 7, 8, 12]),
'is_warm_season': int(forecast_date.month in [4, 5, 6, 7, 8, 9]),
# CRITICAL FIX: Cyclical encoding features (MATCH TRAINING)
# These encode day_of_week and month as sin/cos for cyclical patterns
'day_of_week_sin': float(np.sin(2 * np.pi * day_of_week / 7)),
'day_of_week_cos': float(np.cos(2 * np.pi * day_of_week / 7)),
'month_sin': float(np.sin(2 * np.pi * forecast_date.month / 12)),
'month_cos': float(np.cos(2 * np.pi * forecast_date.month / 12)),
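                # For example, Monday (day_of_week=0) encodes to sin=0.0, cos=1.0
                # and June (month=6) to sin=0.0, cos=-1.0, keeping adjacent
                # days/months close on the unit circle.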
# CRITICAL FIX: Historical features (lagged, rolling, trend)
# These will be populated from historical sales data
# Default to 0.0 here, will be updated if historical data is provided
'lag_1_day': float(features.get('lag_1_day', 0.0)),
'lag_7_day': float(features.get('lag_7_day', 0.0)),
'lag_14_day': float(features.get('lag_14_day', 0.0)),
'rolling_mean_7d': float(features.get('rolling_mean_7d', 0.0)),
'rolling_std_7d': float(features.get('rolling_std_7d', 0.0)),
'rolling_max_7d': float(features.get('rolling_max_7d', 0.0)),
'rolling_min_7d': float(features.get('rolling_min_7d', 0.0)),
'rolling_mean_14d': float(features.get('rolling_mean_14d', 0.0)),
'rolling_std_14d': float(features.get('rolling_std_14d', 0.0)),
'rolling_max_14d': float(features.get('rolling_max_14d', 0.0)),
'rolling_min_14d': float(features.get('rolling_min_14d', 0.0)),
'rolling_mean_30d': float(features.get('rolling_mean_30d', 0.0)),
'rolling_std_30d': float(features.get('rolling_std_30d', 0.0)),
'rolling_max_30d': float(features.get('rolling_max_30d', 0.0)),
'rolling_min_30d': float(features.get('rolling_min_30d', 0.0)),
'days_since_start': int(features.get('days_since_start', 0)),
'momentum_1_7': float(features.get('momentum_1_7', 0.0)),
'trend_7_30': float(features.get('trend_7_30', 0.0)),
'velocity_week': float(features.get('velocity_week', 0.0)),
# Data freshness metrics to help model understand data recency
'days_since_last_sale': int(features.get('days_since_last_sale', 0)),
'historical_data_availability_score': float(features.get('historical_data_availability_score', 0.0)),
}
# Calculate interaction features
is_holiday = new_features['is_holiday']
is_pleasant = new_features['is_pleasant_day']
is_rainy = new_features['is_rainy_day']
is_payday = new_features['is_payday']
interaction_features = {
# Weekend interactions
'weekend_temp_interaction': is_weekend * temperature,
'weekend_pleasant_weather': is_weekend * is_pleasant,
'weekend_traffic_interaction': is_weekend * traffic,
# Holiday interactions
'holiday_temp_interaction': is_holiday * temperature,
'holiday_traffic_interaction': is_holiday * traffic,
# CRITICAL FIX: Add payday_weekend_interaction to match training service
'payday_weekend_interaction': is_payday * is_weekend,
# Season interactions
'season_temp_interaction': season * temperature,
'season_traffic_interaction': season * traffic,
# Rain-traffic interactions
'rain_traffic_interaction': is_rainy * traffic,
'rain_speed_interaction': is_rainy * avg_speed,
# CRITICAL FIX: Add missing interaction features from training
'rain_weekend_interaction': is_rainy * is_weekend,
'friday_traffic_interaction': int(day_of_week == 4) * traffic,
# Day-weather interactions
'day_temp_interaction': day_of_week * temperature,
'month_temp_interaction': forecast_date.month * temperature,
# Traffic-speed interactions
'traffic_speed_interaction': traffic * avg_speed,
'pedestrian_speed_interaction': pedestrians * avg_speed,
# Congestion interactions
'congestion_temp_interaction': congestion * temperature,
'congestion_weekend_interaction': congestion * is_weekend
}
# CRITICAL FIX: Extract POI (Point of Interest) features from the features dict
# POI features start with 'poi_' prefix and must be included for models trained with them
# This prevents "Regressor 'poi_retail_total_count' missing" errors
poi_features = {}
for key, value in features.items():
if key.startswith('poi_'):
# Ensure POI features are numeric (float or int)
try:
poi_features[key] = float(value) if isinstance(value, (int, float, str)) else 0.0
except (ValueError, TypeError):
poi_features[key] = 0.0
# Combine all features
all_new_features = {**new_features, **interaction_features, **poi_features}
# Add all features at once using pd.concat to avoid fragmentation
new_feature_df = pd.DataFrame([all_new_features])
df = pd.concat([df, new_feature_df], axis=1)
logger.debug("Complete Prophet features prepared",
feature_count=len(df.columns),
date=features['date'],
season=df['season'].iloc[0],
traffic_volume=df['traffic_volume'].iloc[0],
average_speed=df['average_speed'].iloc[0],
pedestrian_count=df['pedestrian_count'].iloc[0],
poi_feature_count=len(poi_features))
return df
except Exception as e:
logger.error("Error preparing Prophet features", error=str(e))
raise
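    # The prepared frame has a single row: the 'ds' timestamp plus one column per
    # regressor (traffic, weather, temporal, interaction, historical and poi_*
    # features), mirroring the regressor set used by the training service.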
def _get_season(self, month: int) -> int:
"""Get season from month (1-4 for Winter, Spring, Summer, Autumn) - MATCH TRAINING"""
if month in [12, 1, 2]:
return 1 # Winter
elif month in [3, 4, 5]:
return 2 # Spring
elif month in [6, 7, 8]:
return 3 # Summer
else:
return 4 # Autumn
def _is_school_holiday(self, date: datetime) -> bool:
"""Check if a date is during school holidays - MATCH TRAINING"""
month = date.month
# Approximate Spanish school holiday periods
if month in [7, 8]: # Summer holidays
return True
if month == 12 and date.day >= 20: # Christmas holidays
return True
if month == 1 and date.day <= 10: # Christmas holidays continued
return True
if month == 4 and date.day <= 15: # Easter holidays (approximate)
return True
return False
def _get_temp_category(self, temperature: float) -> int:
"""Get temperature category (0-3) - MATCH TRAINING"""
if temperature <= 5:
return 0 # Very cold
elif temperature <= 15:
return 1 # Cold
elif temperature <= 25:
return 2 # Mild
else:
return 3 # Hot
def _get_rain_intensity(self, precipitation: float) -> int:
"""Get rain intensity category (0-3) - MATCH TRAINING"""
if precipitation <= 0:
return 0 # No rain
elif precipitation <= 2:
return 1 # Light rain
elif precipitation <= 10:
return 2 # Moderate rain
else:
return 3 # Heavy rain
def _is_end_of_month(self, date: datetime) -> bool:
"""
Check if date is the last day of the month - MATCH TRAINING SERVICE
Training uses: df[date_column].dt.is_month_end
"""
import calendar
# Get the last day of the month
last_day = calendar.monthrange(date.year, date.month)[1]
return date.day == last_day
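    # e.g. _is_end_of_month(datetime(2026, 2, 28)) is True because 2026 is not a
    # leap year, while _is_end_of_month(datetime(2026, 2, 27)) is False.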