improve features

This commit is contained in:
Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

View File

@@ -56,21 +56,17 @@ class BakeryForecaster:
from app.services.poi_feature_service import POIFeatureService
self.poi_feature_service = POIFeatureService()
# Initialize enhanced data processor from shared module
if use_enhanced_features:
# Import enhanced data processor from training service
import sys
import os
# Add training service to path
training_path = os.path.join(os.path.dirname(__file__), '../../../training')
if training_path not in sys.path:
sys.path.insert(0, training_path)
try:
from app.ml.data_processor import EnhancedBakeryDataProcessor
self.data_processor = EnhancedBakeryDataProcessor(database_manager)
logger.info("Enhanced features enabled for forecasting")
from shared.ml.data_processor import EnhancedBakeryDataProcessor
self.data_processor = EnhancedBakeryDataProcessor(region='MD')
logger.info("Enhanced features enabled using shared data processor")
except ImportError as e:
logger.warning(f"Could not import EnhancedBakeryDataProcessor: {e}, falling back to basic features")
logger.warning(
f"Could not import EnhancedBakeryDataProcessor from shared module: {e}. "
"Falling back to basic features."
)
self.use_enhanced_features = False
self.data_processor = None
else:
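Side note on the import change above: the per-call sys.path manipulation was dropped in favor of a top-level shared package. A minimal sketch of one way to make that package importable, assuming shared/ sits at the repository root (the relative path is an assumption, not part of this commit):

# Hypothetical: make the top-level `shared` package importable once, at module load.
import os
import sys

REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from shared.ml.data_processor import EnhancedBakeryDataProcessor  # noqa: E402

processor = EnhancedBakeryDataProcessor(region='MD')  # constructor call as used above

In practice, installing shared as an editable package (pip install -e) avoids the path manipulation entirely.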

View File

@@ -1056,13 +1056,13 @@ class EnhancedForecastingService:
- External service is unavailable
"""
try:
# Get tenant's calendar ID
calendar_id = await self.data_client.get_tenant_calendar(tenant_id)
# Get tenant's calendar information
calendar_info = await self.data_client.fetch_tenant_calendar(tenant_id)
if calendar_id:
if calendar_info:
# Check school holiday via external service
is_school_holiday = await self.data_client.check_school_holiday(
calendar_id=calendar_id,
calendar_id=calendar_info["calendar_id"],
check_date=date_obj.isoformat(),
tenant_id=tenant_id
)
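Note on the new return shape: fetch_tenant_calendar now returns a mapping rather than a bare ID, so indexing calendar_info["calendar_id"] can raise KeyError if the key is absent. A defensive variant of the same call site, inside the same async method (the .get fallback is an assumption, not part of this commit):

calendar_info = await self.data_client.fetch_tenant_calendar(tenant_id)
calendar_id = (calendar_info or {}).get("calendar_id")
if calendar_id:
    is_school_holiday = await self.data_client.check_school_holiday(
        calendar_id=calendar_id,
        check_date=date_obj.isoformat(),
        tenant_id=tenant_id,
    )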

View File

@@ -206,13 +206,39 @@ class PredictionService:
# Calculate confidence interval
confidence_interval = upper_bound - lower_bound
# Adjust confidence based on data freshness if historical features were calculated
adjusted_confidence_level = confidence_level
data_availability_score = features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# A score below 0.5 indicates the most recent data is more than 90 days old; scale confidence down
adjusted_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=adjusted_confidence_level,
data_availability_score=data_availability_score,
original_interval=confidence_interval,
adjusted_interval=adjusted_upper_bound - adjusted_lower_bound)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
confidence_interval = upper_bound - lower_bound
result = {
"prediction": max(0, prediction_value), # Ensure non-negative
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_interval": confidence_interval,
"confidence_level": confidence_level
"confidence_level": adjusted_confidence_level,
"data_freshness_score": data_availability_score # Include data freshness in result
}
# Record metrics
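Worked example of the freshness adjustment above (numbers are illustrative): with confidence_level = 0.95 and data_availability_score = 0.4, the adjusted confidence is max(0.6, 0.95 × 0.4) = 0.6 and the interval widens by a factor of 1.0 + 0.5 × (1.0 − 0.4) = 1.3, i.e. 30% wider. A standalone sketch of the same logic:

def adjust_for_freshness(prediction, lower, upper, confidence_level, score):
    # Mirrors the in-service adjustment above; standalone for illustration only.
    if score >= 0.5:
        return lower, upper, confidence_level
    adjusted_confidence = max(0.6, confidence_level * score)
    factor = 1.0 + 0.5 * (1.0 - score)  # up to 50% wider interval
    new_lower = max(0.0, prediction - (prediction - lower) * factor)
    new_upper = prediction + (upper - prediction) * factor
    return new_lower, new_upper, adjusted_confidence

print(adjust_for_freshness(100.0, 80.0, 120.0, 0.95, 0.4))  # (74.0, 126.0, 0.6)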
@@ -222,35 +248,45 @@ class PredictionService:
# Register metrics if not already registered
if "prediction_processing_time" not in metrics._histograms:
metrics.register_histogram(
"prediction_processing_time",
"Time taken to process predictions",
"prediction_processing_time",
"Time taken to process predictions",
labels=['service', 'model_type']
)
if "predictions_served_total" not in metrics._counters:
try:
metrics.register_counter(
"predictions_served_total",
"Total number of predictions served",
"predictions_served_total",
"Total number of predictions served",
labels=['service', 'status']
)
except Exception as reg_error:
# Metric might already exist in global registry
logger.debug("Counter already exists in registry", error=str(reg_error))
# Now record the metrics
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
# Now record the metrics: try with the expected labels, fall back if needed
try:
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
except Exception as label_error:
# If specific labels fail, try without labels to avoid breaking predictions
logger.warning("Failed to record metrics with labels, trying without", error=str(label_error))
try:
metrics.observe_histogram("prediction_processing_time", processing_time)
metrics.increment_counter("predictions_served_total")
except Exception as no_label_error:
logger.warning("Failed to record metrics even without labels", error=str(no_label_error))
except Exception as metrics_error:
# Log metrics error but don't fail the prediction
logger.warning("Failed to record metrics", error=str(metrics_error))
logger.warning("Failed to register or record metrics", error=str(metrics_error))
logger.info("Prediction generated successfully",
model_id=model_id,
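The register-then-record-with-fallback pattern above repeats for each metric; it could be factored into a helper. A sketch using the same metrics methods seen in this service (the helper itself is hypothetical):

def safe_increment(metrics, name, description, label_names=None, label_values=None):
    # Hypothetical helper: register on first use, record with labels,
    # degrade to unlabeled recording, and never fail the request path.
    try:
        if name not in metrics._counters:
            metrics.register_counter(name, description, labels=label_names or [])
        try:
            metrics.increment_counter(name, labels=label_values or {})
        except Exception:
            metrics.increment_counter(name)
    except Exception as exc:
        logger.warning("Metric recording skipped", metric=name, error=str(exc))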
@@ -260,22 +296,32 @@ class PredictionService:
return result
except Exception as e:
logger.error("Error generating prediction",
error=str(e),
logger.error("Error generating prediction",
error=str(e),
model_id=model_id)
# Record error metrics with robust error handling
try:
if "prediction_errors_total" not in metrics._counters:
metrics.register_counter(
"prediction_errors_total",
"Total number of prediction errors",
"prediction_errors_total",
"Total number of prediction errors",
labels=['service', 'error_type']
)
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception:
pass # Don't fail on metrics errors
# Try with labels first, then without if that fails
try:
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception as label_error:
logger.debug("Failed to record error metrics with labels", error=str(label_error))
try:
metrics.increment_counter("prediction_errors_total")
except Exception as no_label_error:
logger.warning("Failed to record error metrics even without labels", error=str(no_label_error))
except Exception as registration_error:
logger.warning("Failed to register error metrics", error=str(registration_error))
raise
async def predict_with_weather_forecast(
@@ -353,6 +399,33 @@ class PredictionService:
'weather_description': day_weather.get('description', 'Clear')
})
# CRITICAL FIX: Fetch historical sales data and calculate historical features
# This populates lag, rolling, and trend features for better predictions
# Using 90 days for better trend analysis and more robust rolling statistics
if 'tenant_id' in enriched_features and 'inventory_product_id' in enriched_features and 'date' in enriched_features:
try:
forecast_date = pd.to_datetime(enriched_features['date'])
historical_sales = await self._fetch_historical_sales(
tenant_id=enriched_features['tenant_id'],
inventory_product_id=enriched_features['inventory_product_id'],
forecast_date=forecast_date,
days_back=90 # Changed from 30 to 90 for better historical context
)
# Calculate historical features and merge into features dict
historical_features = self._calculate_historical_features(
historical_sales, forecast_date
)
enriched_features.update(historical_features)
logger.info("Historical features enriched",
lag_1_day=historical_features.get('lag_1_day'),
rolling_mean_7d=historical_features.get('rolling_mean_7d'))
except Exception as e:
logger.warning("Failed to enrich with historical features, using defaults",
error=str(e))
# Features dict will use defaults (0.0) from _prepare_prophet_features
# Prepare Prophet dataframe with weather features
prophet_df = self._prepare_prophet_features(enriched_features)
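For reference, the lag/rolling features being merged here can be computed directly from a daily-indexed pandas Series. A minimal, self-contained sketch (not the shared calculator the service actually uses):

import pandas as pd

def basic_lag_features(sales: pd.Series) -> dict:
    # Illustration only; HistoricalFeatureCalculator is the source of truth.
    if sales.empty:
        return {"lag_1_day": 0.0, "lag_7_day": 0.0, "rolling_mean_7d": 0.0}
    lag_1 = float(sales.iloc[-1])
    lag_7 = float(sales.iloc[-7]) if len(sales) >= 7 else lag_1
    return {
        "lag_1_day": lag_1,
        "lag_7_day": lag_7,
        "rolling_mean_7d": float(sales.iloc[-7:].mean()),
    }

idx = pd.date_range("2025-10-01", periods=10, freq="D")
print(basic_lag_features(pd.Series(range(10), index=idx, dtype=float)))
# {'lag_1_day': 9.0, 'lag_7_day': 3.0, 'rolling_mean_7d': 6.0}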
@@ -363,6 +436,29 @@ class PredictionService:
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Calculate confidence adjustment based on data freshness
current_confidence_level = confidence_level
data_availability_score = enriched_features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Adjust confidence based on data freshness if historical features were calculated
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# A score below 0.5 indicates the most recent data is more than 90 days old; scale confidence down
current_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted weather prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=current_confidence_level,
data_availability_score=data_availability_score)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
# Apply weather-based adjustments (business rules)
adjusted_prediction = self._apply_weather_adjustments(
prediction_value,
@@ -375,7 +471,8 @@ class PredictionService:
"prediction": max(0, adjusted_prediction),
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_level": confidence_level,
"confidence_level": current_confidence_level,
"data_freshness_score": data_availability_score, # Include data freshness in result
"weather": {
"temperature": enriched_features['temperature'],
"precipitation": enriched_features['precipitation'],
@@ -567,6 +664,8 @@ class PredictionService:
) -> pd.Series:
"""
Fetch historical sales data for calculating lagged and rolling features.
Enhanced to handle cases where recent data is not available by extending
the search for the most recent data if needed.
Args:
tenant_id: Tenant UUID
@@ -578,7 +677,7 @@ class PredictionService:
pandas Series with sales quantities indexed by date
"""
try:
# Calculate date range
# Calculate initial date range for recent data
end_date = forecast_date - pd.Timedelta(days=1) # Day before forecast
start_date = end_date - pd.Timedelta(days=days_back)
@@ -589,7 +688,7 @@ class PredictionService:
end_date=end_date.date(),
days_back=days_back)
# Fetch sales data from sales service
# First, try to fetch sales data from the recent period
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=start_date.strftime("%Y-%m-%d"),
@@ -598,15 +697,72 @@ class PredictionService:
aggregation="daily"
)
# If no recent data found, search for the most recent available data
if not sales_data:
logger.warning("No historical sales data found",
logger.info("No recent sales data found, expanding search to find most recent data",
tenant_id=tenant_id,
product_id=inventory_product_id)
# Search for available data in larger time windows (up to 2 years back)
search_windows = [365, 730] # 1 year, 2 years
for window_days in search_windows:
extended_start_date = forecast_date - pd.Timedelta(days=window_days)
logger.debug("Expanding search window for historical data",
start_date=extended_start_date.date(),
end_date=end_date.date(),
window_days=window_days)
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=extended_start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
product_id=inventory_product_id,
aggregation="daily"
)
if sales_data:
logger.info("Found historical data in expanded search window",
tenant_id=tenant_id,
product_id=inventory_product_id,
data_start=sales_data[0].get('sale_date', 'unknown'),
data_end=sales_data[-1].get('sale_date', 'unknown'),
window_days=window_days)
break
if not sales_data:
logger.warning("No historical sales data found in any search window",
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
# Convert to pandas Series indexed by date
# Convert to pandas DataFrame and check if it has the expected structure
df = pd.DataFrame(sales_data)
df['sale_date'] = pd.to_datetime(df['sale_date'])
# Bail out early if the API returned no rows
if df.empty:
logger.warning("No historical sales data returned from API")
return pd.Series(dtype=float)
# Check for available columns and find date column
available_columns = list(df.columns)
logger.debug(f"Available sales data columns: {available_columns}")
# Check for alternative date column names
date_columns = ['sale_date', 'date', 'forecast_date', 'datetime', 'timestamp']
date_column = None
for col in date_columns:
if col in df.columns:
date_column = col
break
if date_column is None:
logger.error(f"Sales data missing expected date column. Available columns: {available_columns}")
logger.debug(f"Sample of sales data: {df.head()}")
return pd.Series(dtype=float)
df['sale_date'] = pd.to_datetime(df[date_column])
df = df.set_index('sale_date')
# Extract quantity column (could be 'quantity' or 'total_quantity')
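The widening retry above generalizes to a small loop. A sketch assuming the same sales_client.get_sales_data signature used in this method and pandas imported as pd (the helper name is made up):

async def fetch_with_expanding_windows(sales_client, tenant_id, product_id,
                                       forecast_date, end_date,
                                       windows=(90, 365, 730)):
    # Try progressively larger lookback windows until any data is found.
    for days in windows:
        start = (forecast_date - pd.Timedelta(days=days)).strftime("%Y-%m-%d")
        data = await sales_client.get_sales_data(
            tenant_id=tenant_id,
            start_date=start,
            end_date=end_date.strftime("%Y-%m-%d"),
            product_id=product_id,
            aggregation="daily",
        )
        if data:
            return data, days
    return [], None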
@@ -639,6 +795,10 @@ class PredictionService:
) -> Dict[str, float]:
"""
Calculate lagged, rolling, and trend features from historical sales data.
Enhanced to handle cases where recent data is not available by using
available historical data with appropriate temporal adjustments.
Now uses the shared feature calculator for consistency with the training service.
Args:
historical_sales: Series of sales quantities indexed by date
@@ -647,117 +807,26 @@ class PredictionService:
Returns:
Dictionary of calculated features
"""
features = {}
try:
if len(historical_sales) == 0:
logger.warning("No historical data available, using default values")
# Return all features with default values (0.0)
return {
# Lagged features
'lag_1_day': 0.0,
'lag_7_day': 0.0,
'lag_14_day': 0.0,
# Rolling statistics (7-day window)
'rolling_mean_7d': 0.0,
'rolling_std_7d': 0.0,
'rolling_max_7d': 0.0,
'rolling_min_7d': 0.0,
# Rolling statistics (14-day window)
'rolling_mean_14d': 0.0,
'rolling_std_14d': 0.0,
'rolling_max_14d': 0.0,
'rolling_min_14d': 0.0,
# Rolling statistics (30-day window)
'rolling_mean_30d': 0.0,
'rolling_std_30d': 0.0,
'rolling_max_30d': 0.0,
'rolling_min_30d': 0.0,
# Trend features
'days_since_start': 0,
'momentum_1_7': 0.0,
'trend_7_30': 0.0,
'velocity_week': 0.0,
}
# Use shared feature calculator for consistency
from shared.ml.feature_calculator import HistoricalFeatureCalculator
# Calculate lagged features
features['lag_1_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) >= 1 else 0.0
features['lag_7_day'] = float(historical_sales.iloc[-7]) if len(historical_sales) >= 7 else features['lag_1_day']
features['lag_14_day'] = float(historical_sales.iloc[-14]) if len(historical_sales) >= 14 else features['lag_7_day']
calculator = HistoricalFeatureCalculator()
# Calculate rolling statistics (7-day window)
if len(historical_sales) >= 7:
window_7d = historical_sales.iloc[-7:]
features['rolling_mean_7d'] = float(window_7d.mean())
features['rolling_std_7d'] = float(window_7d.std())
features['rolling_max_7d'] = float(window_7d.max())
features['rolling_min_7d'] = float(window_7d.min())
else:
features['rolling_mean_7d'] = features['lag_1_day']
features['rolling_std_7d'] = 0.0
features['rolling_max_7d'] = features['lag_1_day']
features['rolling_min_7d'] = features['lag_1_day']
# Calculate all features using shared calculator
features = calculator.calculate_all_features(
sales_data=historical_sales,
reference_date=forecast_date,
mode='prediction'
)
# Calculate rolling statistics (14-day window)
if len(historical_sales) >= 14:
window_14d = historical_sales.iloc[-14:]
features['rolling_mean_14d'] = float(window_14d.mean())
features['rolling_std_14d'] = float(window_14d.std())
features['rolling_max_14d'] = float(window_14d.max())
features['rolling_min_14d'] = float(window_14d.min())
else:
features['rolling_mean_14d'] = features['rolling_mean_7d']
features['rolling_std_14d'] = features['rolling_std_7d']
features['rolling_max_14d'] = features['rolling_max_7d']
features['rolling_min_14d'] = features['rolling_min_7d']
# Calculate rolling statistics (30-day window)
if len(historical_sales) >= 30:
window_30d = historical_sales.iloc[-30:]
features['rolling_mean_30d'] = float(window_30d.mean())
features['rolling_std_30d'] = float(window_30d.std())
features['rolling_max_30d'] = float(window_30d.max())
features['rolling_min_30d'] = float(window_30d.min())
else:
features['rolling_mean_30d'] = features['rolling_mean_14d']
features['rolling_std_30d'] = features['rolling_std_14d']
features['rolling_max_30d'] = features['rolling_max_14d']
features['rolling_min_30d'] = features['rolling_min_14d']
# Calculate trend features
if len(historical_sales) > 0:
# Days since first sale
features['days_since_start'] = (forecast_date - historical_sales.index[0]).days
# Momentum (difference between recent lag_1_day and lag_7_day)
if len(historical_sales) >= 7:
features['momentum_1_7'] = features['lag_1_day'] - features['lag_7_day']
else:
features['momentum_1_7'] = 0.0
# Trend (difference between recent 7-day and 30-day averages)
if len(historical_sales) >= 30:
features['trend_7_30'] = features['rolling_mean_7d'] - features['rolling_mean_30d']
else:
features['trend_7_30'] = 0.0
# Velocity (rate of change over the last week)
if len(historical_sales) >= 7:
week_change = historical_sales.iloc[-1] - historical_sales.iloc[-7]
features['velocity_week'] = float(week_change / 7.0)
else:
features['velocity_week'] = 0.0
else:
features['days_since_start'] = 0
features['momentum_1_7'] = 0.0
features['trend_7_30'] = 0.0
features['velocity_week'] = 0.0
logger.debug("Historical features calculated",
lag_1_day=features['lag_1_day'],
rolling_mean_7d=features['rolling_mean_7d'],
rolling_mean_30d=features['rolling_mean_30d'],
momentum=features['momentum_1_7'])
logger.debug("Historical features calculated (using shared calculator)",
lag_1_day=features.get('lag_1_day', 0.0),
rolling_mean_7d=features.get('rolling_mean_7d', 0.0),
rolling_mean_30d=features.get('rolling_mean_30d', 0.0),
momentum=features.get('momentum_1_7', 0.0),
days_since_last_sale=features.get('days_since_last_sale', 0),
data_availability_score=features.get('historical_data_availability_score', 0.0))
return features
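Illustrative use of the shared calculator with the exact call shape shown above (the synthetic series and dates are made up):

import pandas as pd
from shared.ml.feature_calculator import HistoricalFeatureCalculator

sales = pd.Series(
    [12.0, 15.0, 9.0, 14.0],
    index=pd.to_datetime(["2025-11-08", "2025-11-09", "2025-11-10", "2025-11-11"]),
)
calculator = HistoricalFeatureCalculator()
features = calculator.calculate_all_features(
    sales_data=sales,
    reference_date=pd.Timestamp("2025-11-14"),
    mode="prediction",
)
print(features.get("lag_1_day"),
      features.get("historical_data_availability_score"))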
@@ -770,8 +839,9 @@ class PredictionService:
'rolling_mean_7d', 'rolling_std_7d', 'rolling_max_7d', 'rolling_min_7d',
'rolling_mean_14d', 'rolling_std_14d', 'rolling_max_14d', 'rolling_min_14d',
'rolling_mean_30d', 'rolling_std_30d', 'rolling_max_30d', 'rolling_min_30d',
'momentum_1_7', 'trend_7_30', 'velocity_week'
]} | {'days_since_start': 0}
'momentum_1_7', 'trend_7_30', 'velocity_week',
'days_since_last_sale', 'historical_data_availability_score'
]}
def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
"""Convert features to Prophet-compatible DataFrame - COMPLETE FEATURE MATCHING"""
@@ -962,6 +1032,9 @@ class PredictionService:
'momentum_1_7': float(features.get('momentum_1_7', 0.0)),
'trend_7_30': float(features.get('trend_7_30', 0.0)),
'velocity_week': float(features.get('velocity_week', 0.0)),
# Data freshness metrics to help model understand data recency
'days_since_last_sale': int(features.get('days_since_last_sale', 0)),
'historical_data_availability_score': float(features.get('historical_data_availability_score', 0.0)),
}
# Calculate interaction features
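For the two new columns to influence predictions, the model must have been trained with matching regressors; Prophet ignores unknown columns at predict time. A hedged sketch of the training side (column names come from this commit; everything else is illustrative, and this diff does not show the training code):

from prophet import Prophet

model = Prophet()
# Regressors must be registered before fit(); names must match the columns
# produced by _prepare_prophet_features at prediction time.
model.add_regressor("days_since_last_sale")
model.add_regressor("historical_data_availability_score")
# model.fit(training_df)  # training_df needs ds, y, and both regressor columns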