Improve training code
@@ -1,7 +1,7 @@
# services/training/app/ml/data_processor.py
"""
-Data Processor for Training Service
-Handles data preparation and feature engineering for ML training
+Enhanced Data Processor for Training Service
+Handles data preparation, date alignment, cleaning, and feature engineering for ML training
"""

import pandas as pd
@@ -12,17 +12,20 @@ import logging
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from app.services.date_alignment_service import DateAlignmentService, DateRange, DataSourceType

logger = logging.getLogger(__name__)


class BakeryDataProcessor:
    """
    Enhanced data processor for bakery forecasting training service.
-    Handles data cleaning, feature engineering, and preparation for ML models.
+    Integrates date alignment, data cleaning, feature engineering, and preparation for ML models.
    """

    def __init__(self):
        self.scalers = {}   # Store scalers for each feature
        self.imputers = {}  # Store imputers for missing value handling
        self.date_alignment_service = DateAlignmentService()

    async def prepare_training_data(self,
                                    sales_data: pd.DataFrame,
@@ -30,7 +33,7 @@ class BakeryDataProcessor:
                                    traffic_data: pd.DataFrame,
                                    product_name: str) -> pd.DataFrame:
        """
-        Prepare comprehensive training data for a specific product.
+        Prepare comprehensive training data for a specific product with date alignment.

        Args:
            sales_data: Historical sales data for the product
@@ -44,26 +47,29 @@
        try:
            logger.info(f"Preparing training data for product: {product_name}")

-            # Convert and validate sales data
+            # Step 1: Convert and validate sales data
            sales_clean = await self._process_sales_data(sales_data, product_name)

-            # Aggregate to daily level
+            # Step 2: Apply date alignment if we have date constraints
+            sales_clean = await self._apply_date_alignment(sales_clean, weather_data, traffic_data)
+
+            # Step 3: Aggregate to daily level
            daily_sales = await self._aggregate_daily_sales(sales_clean)

-            # Add temporal features
+            # Step 4: Add temporal features
            daily_sales = self._add_temporal_features(daily_sales)

-            # Merge external data sources
+            # Step 5: Merge external data sources
            daily_sales = self._merge_weather_features(daily_sales, weather_data)
            daily_sales = self._merge_traffic_features(daily_sales, traffic_data)

-            # Engineer additional features
+            # Step 6: Engineer additional features
            daily_sales = self._engineer_features(daily_sales)

-            # Handle missing values
+            # Step 7: Handle missing values
            daily_sales = self._handle_missing_values(daily_sales)

-            # Prepare for Prophet (rename columns and validate)
+            # Step 8: Prepare for Prophet (rename columns and validate)
            prophet_data = self._prepare_prophet_format(daily_sales)

            logger.info(f"Prepared {len(prophet_data)} data points for {product_name}")
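# --- Illustrative usage sketch (not part of this commit) ---
# How the eight-step pipeline above might be driven. The sample frames and the
# "Croissant" name are made up; keyword names for the weather/traffic arguments
# are assumed from the variables used in the method body.
import asyncio
import pandas as pd

sales = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=90, freq="D"),
    "product_name": ["Croissant"] * 90,
    "quantity": [20 + (i % 7) for i in range(90)],
})
processor = BakeryDataProcessor()
prophet_df = asyncio.run(
    processor.prepare_training_data(
        sales_data=sales,
        weather_data=pd.DataFrame(),   # empty -> Madrid defaults are filled in
        traffic_data=pd.DataFrame(),   # empty -> default traffic values
        product_name="Croissant",
    )
)
print(prophet_df[["ds", "y"]].head())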
@@ -78,7 +84,7 @@ class BakeryDataProcessor:
                                         weather_forecast: pd.DataFrame = None,
                                         traffic_forecast: pd.DataFrame = None) -> pd.DataFrame:
        """
-        Create features for future predictions.
+        Create features for future predictions with proper date handling.

        Args:
            future_dates: Future dates to predict
@@ -118,20 +124,7 @@ class BakeryDataProcessor:
            future_df = future_df.rename(columns={'date': 'ds'})

-            # Handle missing values in future data
-            numeric_columns = future_df.select_dtypes(include=[np.number]).columns
-            for col in numeric_columns:
-                if future_df[col].isna().any():
-                    # Use reasonable defaults for Madrid
-                    if col == 'temperature':
-                        future_df[col] = future_df[col].fillna(15.0)   # Default Madrid temp
-                    elif col == 'precipitation':
-                        future_df[col] = future_df[col].fillna(0.0)    # Default no rain
-                    elif col == 'humidity':
-                        future_df[col] = future_df[col].fillna(60.0)   # Default humidity
-                    elif col == 'traffic_volume':
-                        future_df[col] = future_df[col].fillna(100.0)  # Default traffic
-                    else:
-                        future_df[col] = future_df[col].fillna(future_df[col].median())
+            future_df = self._handle_missing_values_future(future_df)

            return future_df

@@ -140,8 +133,48 @@ class BakeryDataProcessor:
            # Return minimal features if error
            return pd.DataFrame({'ds': future_dates})

    async def _apply_date_alignment(self,
                                    sales_data: pd.DataFrame,
                                    weather_data: pd.DataFrame,
                                    traffic_data: pd.DataFrame) -> pd.DataFrame:
        """
        Apply date alignment constraints to ensure data consistency across sources.
        """
        try:
            if sales_data.empty:
                return sales_data

            # Create date range from sales data
            sales_dates = pd.to_datetime(sales_data['date'])
            sales_date_range = DateRange(
                start=sales_dates.min(),
                end=sales_dates.max(),
                source=DataSourceType.BAKERY_SALES
            )

            # Get aligned date range considering all constraints
            aligned_range = self.date_alignment_service.validate_and_align_dates(
                user_sales_range=sales_date_range
            )

            # Filter sales data to aligned range
            mask = (sales_dates >= aligned_range.start) & (sales_dates <= aligned_range.end)
            filtered_sales = sales_data[mask].copy()

            logger.info(f"Date alignment: {len(sales_data)} → {len(filtered_sales)} records")
            logger.info(f"Aligned date range: {aligned_range.start.date()} to {aligned_range.end.date()}")

            if aligned_range.constraints:
                logger.info(f"Applied constraints: {aligned_range.constraints}")

            return filtered_sales

        except Exception as e:
            logger.warning(f"Date alignment failed, using original data: {str(e)}")
            return sales_data

    async def _process_sales_data(self, sales_data: pd.DataFrame, product_name: str) -> pd.DataFrame:
-        """Process and clean sales data"""
+        """Process and clean sales data with enhanced validation"""
        sales_clean = sales_data.copy()

        # Ensure date column exists and is datetime
@@ -150,9 +183,22 @@ class BakeryDataProcessor:

        sales_clean['date'] = pd.to_datetime(sales_clean['date'])

-        # Ensure quantity column exists and is numeric
-        if 'quantity' not in sales_clean.columns:
-            raise ValueError("Sales data must have a 'quantity' column")
+        # Handle different quantity column names
+        quantity_columns = ['quantity', 'quantity_sold', 'sales', 'units_sold']
+        quantity_col = None
+
+        for col in quantity_columns:
+            if col in sales_clean.columns:
+                quantity_col = col
+                break
+
+        if quantity_col is None:
+            raise ValueError(f"Sales data must have one of these columns: {quantity_columns}")
+
+        # Standardize to 'quantity'
+        if quantity_col != 'quantity':
+            sales_clean['quantity'] = sales_clean[quantity_col]
+            logger.info(f"Mapped '{quantity_col}' to 'quantity' column")

        sales_clean['quantity'] = pd.to_numeric(sales_clean['quantity'], errors='coerce')

@@ -164,15 +210,23 @@ class BakeryDataProcessor:
        if 'product_name' in sales_clean.columns:
            sales_clean = sales_clean[sales_clean['product_name'] == product_name]

        # Remove duplicate dates (keep the one with highest quantity)
        sales_clean = sales_clean.sort_values(['date', 'quantity'], ascending=[True, False])
        sales_clean = sales_clean.drop_duplicates(subset=['date'], keep='first')

        return sales_clean

    async def _aggregate_daily_sales(self, sales_data: pd.DataFrame) -> pd.DataFrame:
-        """Aggregate sales to daily level"""
+        """Aggregate sales to daily level with improved date handling"""
        if sales_data.empty:
            return pd.DataFrame(columns=['date', 'quantity'])

        # Group by date and sum quantities
        daily_sales = sales_data.groupby('date').agg({
            'quantity': 'sum'
        }).reset_index()

-        # Ensure we have data for all dates in the range
+        # Ensure we have data for all dates in the range (fill gaps with 0)
        date_range = pd.date_range(
            start=daily_sales['date'].min(),
            end=daily_sales['date'].max(),
@@ -186,7 +240,7 @@ class BakeryDataProcessor:
        return daily_sales

    def _add_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Add temporal features like day of week, month, etc."""
+        """Add comprehensive temporal features for bakery demand patterns"""
        df = df.copy()

        # Ensure we have a date column
@@ -195,37 +249,43 @@ class BakeryDataProcessor:

        df['date'] = pd.to_datetime(df['date'])

-        # Day of week (0=Monday, 6=Sunday)
-        df['day_of_week'] = df['date'].dt.dayofweek
-        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
-
-        # Month and season
+        # Basic temporal features
+        df['day_of_week'] = df['date'].dt.dayofweek  # 0=Monday, 6=Sunday
+        df['day_of_month'] = df['date'].dt.day
        df['month'] = df['date'].dt.month
-        df['season'] = df['month'].apply(self._get_season)
-
-        # Week of year
+        df['quarter'] = df['date'].dt.quarter
        df['week_of_year'] = df['date'].dt.isocalendar().week

-        # Quarter
-        df['quarter'] = df['date'].dt.quarter
+        # Bakery-specific features
+        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
+        df['is_monday'] = (df['day_of_week'] == 0).astype(int)  # Monday often has different patterns
+        df['is_friday'] = (df['day_of_week'] == 4).astype(int)  # Friday often busy

-        # Holiday indicators (basic Spanish holidays)
+        # Season mapping for Madrid
+        df['season'] = df['month'].apply(self._get_season)
+        df['is_summer'] = (df['season'] == 3).astype(int)  # Summer seasonality
+        df['is_winter'] = (df['season'] == 1).astype(int)  # Winter seasonality
+
+        # Holiday and special day indicators
        df['is_holiday'] = df['date'].apply(self._is_spanish_holiday).astype(int)

+        # School calendar effects (approximate)
        df['is_school_holiday'] = df['date'].apply(self._is_school_holiday).astype(int)
+        df['is_month_start'] = (df['day_of_month'] <= 3).astype(int)
+        df['is_month_end'] = (df['day_of_month'] >= 28).astype(int)
+
+        # Payday patterns (common in Spain: end/beginning of month)
+        df['is_payday_period'] = ((df['day_of_month'] <= 5) | (df['day_of_month'] >= 25)).astype(int)

        return df

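# --- Illustrative sketch (not part of this commit) ---
# The temporal block above only needs a 'date' column; a quick way to inspect
# the generated calendar features on a synthetic frame:
import pandas as pd

_demo = pd.DataFrame({"date": pd.date_range("2024-12-20", periods=7, freq="D")})
_demo = BakeryDataProcessor()._add_temporal_features(_demo)
print(_demo[["date", "day_of_week", "is_weekend", "is_holiday", "is_payday_period"]])
# 2024-12-25 should come out with is_holiday == 1 (Christmas is in the holiday list),
# and every day from the 25th on with is_payday_period == 1.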
def _merge_weather_features(self,
|
||||
daily_sales: pd.DataFrame,
|
||||
weather_data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Merge weather features with sales data"""
|
||||
"""Merge weather features with enhanced handling"""
|
||||
|
||||
if weather_data.empty:
|
||||
# Add default weather columns with neutral values
|
||||
daily_sales['temperature'] = 15.0 # Mild temperature
|
||||
daily_sales['precipitation'] = 0.0 # No rain
|
||||
# Add default weather columns with Madrid-appropriate values
|
||||
daily_sales['temperature'] = 15.0 # Average Madrid temperature
|
||||
daily_sales['precipitation'] = 0.0 # Default no rain
|
||||
daily_sales['humidity'] = 60.0 # Moderate humidity
|
||||
daily_sales['wind_speed'] = 5.0 # Light wind
|
||||
return daily_sales
|
||||
@@ -233,27 +293,27 @@ class BakeryDataProcessor:
|
||||
try:
|
||||
weather_clean = weather_data.copy()
|
||||
|
||||
# Ensure weather data has date column
|
||||
# Standardize date column
|
||||
if 'date' not in weather_clean.columns and 'ds' in weather_clean.columns:
|
||||
weather_clean = weather_clean.rename(columns={'ds': 'date'})
|
||||
|
||||
weather_clean['date'] = pd.to_datetime(weather_clean['date'])
|
||||
|
||||
# Select relevant weather features
|
||||
weather_features = ['date']
|
||||
|
||||
# Add available weather columns with default names
|
||||
# Map weather columns to standard names
|
||||
weather_mapping = {
|
||||
'temperature': ['temperature', 'temp', 'temperatura'],
|
||||
'precipitation': ['precipitation', 'rain', 'lluvia', 'precipitacion'],
|
||||
'humidity': ['humidity', 'humedad'],
|
||||
'wind_speed': ['wind_speed', 'viento', 'wind']
|
||||
'temperature': ['temperature', 'temp', 'temperatura', 'temp_avg', 'temperature_avg'],
|
||||
'precipitation': ['precipitation', 'rain', 'lluvia', 'precipitacion', 'rainfall'],
|
||||
'humidity': ['humidity', 'humedad', 'relative_humidity'],
|
||||
'wind_speed': ['wind_speed', 'viento', 'wind', 'wind_avg'],
|
||||
'pressure': ['pressure', 'presion', 'atmospheric_pressure']
|
||||
}
|
||||
|
||||
weather_features = ['date']
|
||||
|
||||
for standard_name, possible_names in weather_mapping.items():
|
||||
for possible_name in possible_names:
|
||||
if possible_name in weather_clean.columns:
|
||||
weather_clean[standard_name] = weather_clean[possible_name]
|
||||
weather_clean[standard_name] = pd.to_numeric(weather_clean[possible_name], errors='coerce')
|
||||
weather_features.append(standard_name)
|
||||
break
|
||||
|
||||
@@ -263,31 +323,32 @@ class BakeryDataProcessor:
|
||||
# Merge with sales data
|
||||
merged = daily_sales.merge(weather_clean, on='date', how='left')
|
||||
|
||||
# Fill missing weather values with reasonable defaults
|
||||
if 'temperature' in merged.columns:
|
||||
merged['temperature'] = merged['temperature'].fillna(15.0)
|
||||
if 'precipitation' in merged.columns:
|
||||
merged['precipitation'] = merged['precipitation'].fillna(0.0)
|
||||
if 'humidity' in merged.columns:
|
||||
merged['humidity'] = merged['humidity'].fillna(60.0)
|
||||
if 'wind_speed' in merged.columns:
|
||||
merged['wind_speed'] = merged['wind_speed'].fillna(5.0)
|
||||
# Fill missing weather values with Madrid-appropriate defaults
|
||||
weather_defaults = {
|
||||
'temperature': 15.0,
|
||||
'precipitation': 0.0,
|
||||
'humidity': 60.0,
|
||||
'wind_speed': 5.0,
|
||||
'pressure': 1013.0
|
||||
}
|
||||
|
||||
for feature, default_value in weather_defaults.items():
|
||||
if feature in merged.columns:
|
||||
merged[feature] = merged[feature].fillna(default_value)
|
||||
|
||||
return merged
|
||||
|
||||
        except Exception as e:
            logger.warning(f"Error merging weather data: {e}")
            # Add default weather columns if merge fails
-            daily_sales['temperature'] = 15.0
-            daily_sales['precipitation'] = 0.0
-            daily_sales['humidity'] = 60.0
-            daily_sales['wind_speed'] = 5.0
+            # Define the fallback values here as well: weather_defaults is created
+            # inside the try block and may not exist if the failure happened early.
+            weather_defaults = {
+                'temperature': 15.0, 'precipitation': 0.0, 'humidity': 60.0,
+                'wind_speed': 5.0, 'pressure': 1013.0
+            }
+            for feature, default_value in weather_defaults.items():
+                daily_sales[feature] = default_value
            return daily_sales
|
||||
|
||||
def _merge_traffic_features(self,
|
||||
daily_sales: pd.DataFrame,
|
||||
traffic_data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Merge traffic features with sales data"""
|
||||
"""Merge traffic features with enhanced Madrid-specific handling"""
|
||||
|
||||
if traffic_data.empty:
|
||||
# Add default traffic column
|
||||
@@ -297,26 +358,26 @@ class BakeryDataProcessor:
|
||||
try:
|
||||
traffic_clean = traffic_data.copy()
|
||||
|
||||
# Ensure traffic data has date column
|
||||
# Standardize date column
|
||||
if 'date' not in traffic_clean.columns and 'ds' in traffic_clean.columns:
|
||||
traffic_clean = traffic_clean.rename(columns={'ds': 'date'})
|
||||
|
||||
traffic_clean['date'] = pd.to_datetime(traffic_clean['date'])
|
||||
|
||||
# Select relevant traffic features
|
||||
traffic_features = ['date']
|
||||
|
||||
# Map traffic column names
|
||||
# Map traffic columns to standard names
|
||||
traffic_mapping = {
|
||||
'traffic_volume': ['traffic_volume', 'traffic_intensity', 'trafico', 'intensidad'],
|
||||
'pedestrian_count': ['pedestrian_count', 'peatones'],
|
||||
'occupancy_rate': ['occupancy_rate', 'ocupacion']
|
||||
'traffic_volume': ['traffic_volume', 'traffic_intensity', 'trafico', 'intensidad', 'volume'],
|
||||
'pedestrian_count': ['pedestrian_count', 'peatones', 'pedestrians'],
|
||||
'congestion_level': ['congestion_level', 'congestion', 'nivel_congestion'],
|
||||
'average_speed': ['average_speed', 'speed', 'velocidad_media', 'avg_speed']
|
||||
}
|
||||
|
||||
traffic_features = ['date']
|
||||
|
||||
for standard_name, possible_names in traffic_mapping.items():
|
||||
for possible_name in possible_names:
|
||||
if possible_name in traffic_clean.columns:
|
||||
traffic_clean[standard_name] = traffic_clean[possible_name]
|
||||
traffic_clean[standard_name] = pd.to_numeric(traffic_clean[possible_name], errors='coerce')
|
||||
traffic_features.append(standard_name)
|
||||
break
|
||||
|
||||
@@ -326,13 +387,17 @@ class BakeryDataProcessor:
|
||||
# Merge with sales data
|
||||
merged = daily_sales.merge(traffic_clean, on='date', how='left')
|
||||
|
||||
# Fill missing traffic values
|
||||
if 'traffic_volume' in merged.columns:
|
||||
merged['traffic_volume'] = merged['traffic_volume'].fillna(100.0)
|
||||
if 'pedestrian_count' in merged.columns:
|
||||
merged['pedestrian_count'] = merged['pedestrian_count'].fillna(50.0)
|
||||
if 'occupancy_rate' in merged.columns:
|
||||
merged['occupancy_rate'] = merged['occupancy_rate'].fillna(0.5)
|
||||
# Fill missing traffic values with reasonable defaults
|
||||
traffic_defaults = {
|
||||
'traffic_volume': 100.0,
|
||||
'pedestrian_count': 50.0,
|
||||
'congestion_level': 1.0, # Low congestion
|
||||
'average_speed': 30.0 # km/h typical for Madrid
|
||||
}
|
||||
|
||||
for feature, default_value in traffic_defaults.items():
|
||||
if feature in merged.columns:
|
||||
merged[feature] = merged[feature].fillna(default_value)
|
||||
|
||||
return merged
|
||||
|
||||
@@ -343,49 +408,150 @@ class BakeryDataProcessor:
|
||||
return daily_sales
|
||||
|
||||
def _engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Engineer additional features from existing data"""
|
||||
"""Engineer additional features from existing data with bakery-specific insights"""
|
||||
df = df.copy()
|
||||
|
||||
# Weather-based features
|
||||
if 'temperature' in df.columns:
|
||||
df['temp_squared'] = df['temperature'] ** 2
|
||||
df['is_hot_day'] = (df['temperature'] > 25).astype(int)
|
||||
df['is_cold_day'] = (df['temperature'] < 10).astype(int)
|
||||
df['is_hot_day'] = (df['temperature'] > 25).astype(int) # Hot days in Madrid
|
||||
df['is_cold_day'] = (df['temperature'] < 10).astype(int) # Cold days
|
||||
df['is_pleasant_day'] = ((df['temperature'] >= 18) & (df['temperature'] <= 25)).astype(int)
|
||||
|
||||
# Temperature categories for bakery products
|
||||
df['temp_category'] = pd.cut(df['temperature'],
|
||||
bins=[-np.inf, 5, 15, 25, np.inf],
|
||||
labels=[0, 1, 2, 3]).astype(int)
|
||||
|
||||
if 'precipitation' in df.columns:
|
||||
df['is_rainy_day'] = (df['precipitation'] > 0).astype(int)
|
||||
df['heavy_rain'] = (df['precipitation'] > 10).astype(int)
|
||||
df['is_rainy_day'] = (df['precipitation'] > 0.1).astype(int)
|
||||
df['is_heavy_rain'] = (df['precipitation'] > 10).astype(int)
|
||||
df['rain_intensity'] = pd.cut(df['precipitation'],
|
||||
bins=[-0.1, 0, 2, 10, np.inf],
|
||||
labels=[0, 1, 2, 3]).astype(int)
|
||||
|
||||
# Traffic-based features
|
||||
if 'traffic_volume' in df.columns:
|
||||
df['high_traffic'] = (df['traffic_volume'] > df['traffic_volume'].quantile(0.75)).astype(int)
|
||||
df['low_traffic'] = (df['traffic_volume'] < df['traffic_volume'].quantile(0.25)).astype(int)
|
||||
# Calculate traffic quantiles for relative measures
|
||||
q75 = df['traffic_volume'].quantile(0.75)
|
||||
q25 = df['traffic_volume'].quantile(0.25)
|
||||
|
||||
df['high_traffic'] = (df['traffic_volume'] > q75).astype(int)
|
||||
df['low_traffic'] = (df['traffic_volume'] < q25).astype(int)
|
||||
df['traffic_normalized'] = (df['traffic_volume'] - df['traffic_volume'].mean()) / df['traffic_volume'].std()
|
||||
|
||||
# Interaction features
|
||||
# Interaction features - bakery specific
|
||||
if 'is_weekend' in df.columns and 'temperature' in df.columns:
|
||||
df['weekend_temp_interaction'] = df['is_weekend'] * df['temperature']
|
||||
df['weekend_pleasant_weather'] = df['is_weekend'] * df.get('is_pleasant_day', 0)
|
||||
|
||||
if 'is_rainy_day' in df.columns and 'traffic_volume' in df.columns:
|
||||
df['rain_traffic_interaction'] = df['is_rainy_day'] * df['traffic_volume']
|
||||
|
||||
if 'is_holiday' in df.columns and 'temperature' in df.columns:
|
||||
df['holiday_temp_interaction'] = df['is_holiday'] * df['temperature']
|
||||
|
||||
# Seasonal interactions
|
||||
if 'season' in df.columns and 'temperature' in df.columns:
|
||||
df['season_temp_interaction'] = df['season'] * df['temperature']
|
||||
|
||||
# Day-of-week specific features
|
||||
if 'day_of_week' in df.columns:
|
||||
# Working days vs weekends
|
||||
df['is_working_day'] = (~df['day_of_week'].isin([5, 6])).astype(int)
|
||||
|
||||
# Peak bakery days (Friday, Saturday, Sunday often busy)
|
||||
df['is_peak_bakery_day'] = df['day_of_week'].isin([4, 5, 6]).astype(int)
|
||||
|
||||
# Month-specific features for bakery seasonality
|
||||
if 'month' in df.columns:
|
||||
# Tourist season in Madrid (spring/summer)
|
||||
df['is_tourist_season'] = df['month'].isin([4, 5, 6, 7, 8, 9]).astype(int)
|
||||
|
||||
# Christmas season (affects bakery sales significantly)
|
||||
df['is_christmas_season'] = df['month'].isin([11, 12]).astype(int)
|
||||
|
||||
# Back-to-school/work season
|
||||
df['is_back_to_work_season'] = df['month'].isin([1, 9]).astype(int)
|
||||
|
||||
# Lagged features (if we have enough data)
|
||||
if len(df) > 7 and 'quantity' in df.columns:
|
||||
# Rolling averages for trend detection
|
||||
df['sales_7day_avg'] = df['quantity'].rolling(window=7, min_periods=3).mean()
|
||||
df['sales_14day_avg'] = df['quantity'].rolling(window=14, min_periods=7).mean()
|
||||
|
||||
# Day-over-day changes
|
||||
df['sales_change_1day'] = df['quantity'].diff()
|
||||
df['sales_change_7day'] = df['quantity'].diff(7) # Week-over-week
|
||||
|
||||
# Fill NaN values for lagged features
|
||||
df['sales_7day_avg'] = df['sales_7day_avg'].fillna(df['quantity'])
|
||||
df['sales_14day_avg'] = df['sales_14day_avg'].fillna(df['quantity'])
|
||||
df['sales_change_1day'] = df['sales_change_1day'].fillna(0)
|
||||
df['sales_change_7day'] = df['sales_change_7day'].fillna(0)
|
||||
|
||||
return df
|
||||
|
||||
def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Handle missing values in the dataset"""
|
||||
"""Handle missing values in the dataset with improved strategies"""
|
||||
df = df.copy()
|
||||
|
||||
# For numeric columns, use median imputation
|
||||
# For numeric columns, use appropriate imputation strategies
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
||||
|
||||
for col in numeric_columns:
|
||||
if col != 'quantity' and df[col].isna().any():
|
||||
median_value = df[col].median()
|
||||
df[col] = df[col].fillna(median_value)
|
||||
# Use different strategies based on column type
|
||||
if 'temperature' in col:
|
||||
df[col] = df[col].fillna(15.0) # Madrid average
|
||||
elif 'precipitation' in col or 'rain' in col:
|
||||
df[col] = df[col].fillna(0.0) # Default no rain
|
||||
elif 'humidity' in col:
|
||||
df[col] = df[col].fillna(60.0) # Moderate humidity
|
||||
elif 'traffic' in col:
|
||||
df[col] = df[col].fillna(df[col].median()) # Use median for traffic
|
||||
elif 'wind' in col:
|
||||
df[col] = df[col].fillna(5.0) # Light wind
|
||||
elif 'pressure' in col:
|
||||
df[col] = df[col].fillna(1013.0) # Standard atmospheric pressure
|
||||
else:
|
||||
# For other columns, use median or forward fill
|
||||
if df[col].count() > 0:
|
||||
df[col] = df[col].fillna(df[col].median())
|
||||
else:
|
||||
df[col] = df[col].fillna(0)
|
||||
|
||||
return df
|
||||
|
||||
def _handle_missing_values_future(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Handle missing values in future prediction data"""
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns
|
||||
|
||||
madrid_defaults = {
|
||||
'temperature': 15.0,
|
||||
'precipitation': 0.0,
|
||||
'humidity': 60.0,
|
||||
'wind_speed': 5.0,
|
||||
'traffic_volume': 100.0,
|
||||
'pedestrian_count': 50.0,
|
||||
'pressure': 1013.0
|
||||
}
|
||||
|
||||
for col in numeric_columns:
|
||||
if df[col].isna().any():
|
||||
# Find appropriate default value
|
||||
default_value = 0
|
||||
for key, value in madrid_defaults.items():
|
||||
if key in col.lower():
|
||||
default_value = value
|
||||
break
|
||||
|
||||
df[col] = df[col].fillna(default_value)
|
||||
|
||||
return df
|
||||
|
||||
def _prepare_prophet_format(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Prepare data in Prophet format with 'ds' and 'y' columns"""
|
||||
"""Prepare data in Prophet format with enhanced validation"""
|
||||
prophet_df = df.copy()
|
||||
|
||||
# Rename columns for Prophet
|
||||
@@ -395,20 +561,33 @@ class BakeryDataProcessor:
|
||||
if 'quantity' in prophet_df.columns:
|
||||
prophet_df = prophet_df.rename(columns={'quantity': 'y'})
|
||||
|
||||
# Ensure ds is datetime
|
||||
# Ensure ds is datetime and remove timezone info
|
||||
if 'ds' in prophet_df.columns:
|
||||
prophet_df['ds'] = pd.to_datetime(prophet_df['ds'])
|
||||
if prophet_df['ds'].dt.tz is not None:
|
||||
prophet_df['ds'] = prophet_df['ds'].dt.tz_localize(None)
|
||||
|
||||
# Validate required columns
|
||||
if 'ds' not in prophet_df.columns or 'y' not in prophet_df.columns:
|
||||
raise ValueError("Prophet data must have 'ds' and 'y' columns")
|
||||
|
||||
# Remove any rows with missing target values
|
||||
# Clean target values
|
||||
prophet_df = prophet_df.dropna(subset=['y'])
|
||||
prophet_df['y'] = prophet_df['y'].clip(lower=0) # No negative sales
|
||||
|
||||
# Remove any duplicate dates (keep last occurrence)
|
||||
prophet_df = prophet_df.drop_duplicates(subset=['ds'], keep='last')
|
||||
|
||||
# Sort by date
|
||||
prophet_df = prophet_df.sort_values('ds').reset_index(drop=True)
|
||||
|
||||
# Final validation
|
||||
if len(prophet_df) == 0:
|
||||
raise ValueError("No valid data points after cleaning")
|
||||
|
||||
logger.info(f"Prophet data prepared: {len(prophet_df)} rows, "
|
||||
f"date range: {prophet_df['ds'].min()} to {prophet_df['ds'].max()}")
|
||||
|
||||
return prophet_df
|
||||
|
||||
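# --- Illustrative sketch (not part of this commit) ---
# What the Prophet-format step above does to a raw daily frame: rename to ds/y
# (the date->ds rename lives in the elided part of this hunk), drop missing
# targets, clip negatives to zero, de-duplicate dates, and sort chronologically.
import pandas as pd

_raw = pd.DataFrame({
    "date": ["2024-01-02", "2024-01-01", "2024-01-02"],
    "quantity": [5, -1, 7],
})
_prepared = BakeryDataProcessor()._prepare_prophet_format(_raw)
print(_prepared)
# Expected roughly:
#   2024-01-01  y=0   (negative sale clipped to 0)
#   2024-01-02  y=7   (duplicate date, last occurrence kept)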
    def _get_season(self, month: int) -> int:
@@ -429,7 +608,7 @@ class BakeryDataProcessor:
        # Major Spanish holidays that affect bakery sales
        spanish_holidays = [
            (1, 1),    # New Year
-            (1, 6),    # Epiphany
+            (1, 6),    # Epiphany (Reyes)
            (5, 1),    # Labour Day
            (8, 15),   # Assumption
            (10, 12),  # National Day
@@ -437,7 +616,7 @@ class BakeryDataProcessor:
            (12, 6),   # Constitution
            (12, 8),   # Immaculate Conception
            (12, 25),  # Christmas
-            (5, 15),   # San Isidro (Madrid)
+            (5, 15),   # San Isidro (Madrid patron saint)
            (5, 2),    # Madrid Community Day
        ]

|
||||
@@ -458,8 +637,8 @@ class BakeryDataProcessor:
|
||||
if month == 1 and date.day <= 10:
|
||||
return True
|
||||
|
||||
# Easter holidays (approximate - first two weeks of April)
|
||||
if month == 4 and date.day <= 14:
|
||||
# Easter holidays (approximate - early April)
|
||||
if month == 4 and date.day <= 15:
|
||||
return True
|
||||
|
||||
return False
|
||||
@@ -468,26 +647,89 @@ class BakeryDataProcessor:
|
||||
model_data: pd.DataFrame,
|
||||
target_column: str = 'y') -> Dict[str, float]:
|
||||
"""
|
||||
Calculate feature importance for the model.
|
||||
Calculate feature importance for the model using correlation analysis.
|
||||
"""
|
||||
try:
|
||||
# Simple correlation-based importance
|
||||
# Get numeric features
|
||||
numeric_features = model_data.select_dtypes(include=[np.number]).columns
|
||||
numeric_features = [col for col in numeric_features if col != target_column]
|
||||
|
||||
importance_scores = {}
|
||||
|
||||
if target_column not in model_data.columns:
|
||||
logger.warning(f"Target column '{target_column}' not found")
|
||||
return {}
|
||||
|
||||
for feature in numeric_features:
|
||||
if feature in model_data.columns:
|
||||
correlation = model_data[feature].corr(model_data[target_column])
|
||||
importance_scores[feature] = abs(correlation) if not pd.isna(correlation) else 0.0
|
||||
if not pd.isna(correlation) and not np.isinf(correlation):
|
||||
importance_scores[feature] = abs(correlation)
|
||||
|
||||
# Sort by importance
|
||||
importance_scores = dict(sorted(importance_scores.items(),
|
||||
key=lambda x: x[1], reverse=True))
|
||||
|
||||
logger.info(f"Calculated feature importance for {len(importance_scores)} features")
|
||||
return importance_scores
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating feature importance: {e}")
|
||||
return {}
|
||||
return {}
|
||||
|
||||
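# --- Illustrative sketch (not part of this commit) ---
# The importance logic above is |Pearson correlation| with the target per
# numeric feature; the same computation in isolation on a synthetic frame:
import numpy as np
import pandas as pd

_rng = np.random.default_rng(0)
_demo = pd.DataFrame({"temperature": _rng.normal(15, 5, 200), "noise": _rng.normal(0, 1, 200)})
_demo["y"] = 2.0 * _demo["temperature"] + _rng.normal(0, 1, 200)

_scores = {
    col: abs(_demo[col].corr(_demo["y"]))
    for col in _demo.select_dtypes(include=[np.number]).columns
    if col != "y"
}
print(dict(sorted(_scores.items(), key=lambda kv: kv[1], reverse=True)))
# temperature should dominate (close to 1.0); noise should be near 0.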
def get_data_quality_report(self, df: pd.DataFrame) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate a comprehensive data quality report.
|
||||
"""
|
||||
try:
|
||||
report = {
|
||||
"total_records": len(df),
|
||||
"date_range": {
|
||||
"start": df['ds'].min().isoformat() if 'ds' in df.columns else None,
|
||||
"end": df['ds'].max().isoformat() if 'ds' in df.columns else None,
|
||||
"duration_days": (df['ds'].max() - df['ds'].min()).days if 'ds' in df.columns else 0
|
||||
},
|
||||
"missing_values": {},
|
||||
"data_completeness": 0.0,
|
||||
"target_statistics": {},
|
||||
"feature_count": 0
|
||||
}
|
||||
|
||||
# Calculate missing values
|
||||
missing_counts = df.isnull().sum()
|
||||
total_cells = len(df)
|
||||
|
||||
for col in df.columns:
|
||||
missing_count = missing_counts[col]
|
||||
report["missing_values"][col] = {
|
||||
"count": int(missing_count),
|
||||
"percentage": round((missing_count / total_cells) * 100, 2)
|
||||
}
|
||||
|
||||
# Overall completeness
|
||||
total_missing = missing_counts.sum()
|
||||
total_possible = len(df) * len(df.columns)
|
||||
report["data_completeness"] = round(((total_possible - total_missing) / total_possible) * 100, 2)
|
||||
|
||||
# Target variable statistics
|
||||
if 'y' in df.columns:
|
||||
y_col = df['y']
|
||||
report["target_statistics"] = {
|
||||
"mean": round(y_col.mean(), 2),
|
||||
"median": round(y_col.median(), 2),
|
||||
"std": round(y_col.std(), 2),
|
||||
"min": round(y_col.min(), 2),
|
||||
"max": round(y_col.max(), 2),
|
||||
"zero_count": int((y_col == 0).sum()),
|
||||
"zero_percentage": round(((y_col == 0).sum() / len(y_col)) * 100, 2)
|
||||
}
|
||||
|
||||
# Feature count
|
||||
numeric_features = df.select_dtypes(include=[np.number]).columns
|
||||
report["feature_count"] = len([col for col in numeric_features if col not in ['y', 'ds']])
|
||||
|
||||
return report
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating data quality report: {e}")
|
||||
return {"error": str(e)}
|
||||
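# --- Illustrative sketch (not part of this commit) ---
# The quality report above expects a Prophet-formatted frame (ds/y); a quick
# smoke test with a small synthetic series:
import pandas as pd

_df = pd.DataFrame({
    "ds": pd.date_range("2024-01-01", periods=30, freq="D"),
    "y": [0, 0, 5, 8, 6] * 6,
    "temperature": [15.0] * 30,
})
report = BakeryDataProcessor().get_data_quality_report(_df)
print(report["total_records"],
      report["data_completeness"],
      report["target_statistics"]["zero_percentage"])
# -> 30 100.0 40.0   (12 of the 30 days are zero-sales in this toy series)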
@@ -1,24 +1,33 @@
# services/training/app/ml/prophet_manager.py
"""
-Enhanced Prophet Manager for Training Service
-Migrated from the monolithic backend to microservices architecture
+Simplified Prophet Manager with Built-in Hyperparameter Optimization
+Direct replacement for existing BakeryProphetManager - optimization always enabled.
"""

from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import numpy as np
from prophet import Prophet
import pickle
import logging
from datetime import datetime, timedelta
import uuid
import asyncio
import os
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import json
from pathlib import Path
import math
import warnings
warnings.filterwarnings('ignore')

from sqlalchemy.ext.asyncio import AsyncSession
from app.models.training import TrainedModel
from app.core.database import get_db_session

# Simple optimization import
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from app.core.config import settings

@@ -26,15 +35,15 @@ logger = logging.getLogger(__name__)

class BakeryProphetManager:
    """
-    Enhanced Prophet model manager for the training service.
-    Handles training, validation, and model persistence for bakery forecasting.
+    Simplified Prophet Manager with built-in hyperparameter optimization.
+    Drop-in replacement for the existing manager - optimization runs automatically.
    """

-    def __init__(self):
+    def __init__(self, db_session: AsyncSession = None):
        self.models = {}           # In-memory model storage
        self.model_metadata = {}   # Store model metadata
        self.feature_scalers = {}  # Store feature scalers per model

        self.db_session = db_session  # Add database session

        # Ensure model storage directory exists
        os.makedirs(settings.MODEL_STORAGE_PATH, exist_ok=True)

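# --- Illustrative usage sketch (not part of this commit) ---
# How the manager might be driven end to end; the tenant/job ids, the synthetic
# frame, and the session handling are made up for the example.
import pandas as pd

async def _demo_training(session=None):
    manager = BakeryProphetManager(db_session=session)  # None -> file storage only
    df = pd.DataFrame({
        "ds": pd.date_range("2023-01-01", periods=365, freq="D"),
        "y": [20 + (i % 7) * 3 for i in range(365)],
        "temperature": [15.0] * 365,   # picked up as a regressor
    })
    info = await manager.train_bakery_model(
        tenant_id="tenant-001",
        product_name="Croissant",
        df=df,
        job_id="job-123",
    )
    return info["model_id"], info["training_metrics"]["mape"]
# Run with: asyncio.run(_demo_training())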
@@ -44,19 +53,11 @@ class BakeryProphetManager:
                                  df: pd.DataFrame,
                                  job_id: str) -> Dict[str, Any]:
        """
-        Train a Prophet model for bakery forecasting with enhanced features.
-
-        Args:
-            tenant_id: Tenant identifier
-            product_name: Product name
-            df: Training data with 'ds' and 'y' columns plus regressors
-            job_id: Training job identifier
-
-        Returns:
-            Dictionary with model information and metrics
+        Train a Prophet model with automatic hyperparameter optimization.
+        Same interface as before - optimization happens automatically.
        """
        try:
-            logger.info(f"Training bakery model for tenant {tenant_id}, product {product_name}")
+            logger.info(f"Training optimized bakery model for {product_name}")

            # Validate input data
            await self._validate_training_data(df, product_name)
@@ -67,8 +68,12 @@ class BakeryProphetManager:
|
||||
# Get regressor columns
|
||||
regressor_columns = self._extract_regressor_columns(prophet_data)
|
||||
|
||||
# Initialize Prophet model with bakery-specific settings
|
||||
model = self._create_prophet_model(regressor_columns)
|
||||
# Automatically optimize hyperparameters (this is the new part)
|
||||
logger.info(f"Optimizing hyperparameters for {product_name}...")
|
||||
best_params = await self._optimize_hyperparameters(prophet_data, product_name, regressor_columns)
|
||||
|
||||
# Create optimized Prophet model
|
||||
model = self._create_optimized_prophet_model(best_params, regressor_columns)
|
||||
|
||||
# Add regressors to model
|
||||
for regressor in regressor_columns:
|
||||
@@ -78,28 +83,23 @@ class BakeryProphetManager:
|
||||
# Fit the model
|
||||
model.fit(prophet_data)
|
||||
|
||||
# Generate model ID and store model
|
||||
# Store model and calculate metrics (same as before)
|
||||
model_id = f"{job_id}_{product_name}_{uuid.uuid4().hex[:8]}"
|
||||
model_path = await self._store_model(
|
||||
tenant_id, product_name, model, model_id, prophet_data, regressor_columns
|
||||
tenant_id, product_name, model, model_id, prophet_data, regressor_columns, best_params
|
||||
)
|
||||
|
||||
# Calculate training metrics
|
||||
training_metrics = await self._calculate_training_metrics(model, prophet_data)
|
||||
# Calculate enhanced training metrics
|
||||
training_metrics = await self._calculate_training_metrics(model, prophet_data, best_params)
|
||||
|
||||
# Prepare model information
|
||||
# Return same format as before, but with optimization info
|
||||
model_info = {
|
||||
"model_id": model_id,
|
||||
"model_path": model_path,
|
||||
"type": "prophet",
|
||||
"type": "prophet_optimized", # Changed from "prophet"
|
||||
"training_samples": len(prophet_data),
|
||||
"features": regressor_columns,
|
||||
"hyperparameters": {
|
||||
"seasonality_mode": settings.PROPHET_SEASONALITY_MODE,
|
||||
"daily_seasonality": settings.PROPHET_DAILY_SEASONALITY,
|
||||
"weekly_seasonality": settings.PROPHET_WEEKLY_SEASONALITY,
|
||||
"yearly_seasonality": settings.PROPHET_YEARLY_SEASONALITY
|
||||
},
|
||||
"hyperparameters": best_params, # Now contains optimized params
|
||||
"training_metrics": training_metrics,
|
||||
"trained_at": datetime.now().isoformat(),
|
||||
"data_period": {
|
||||
@@ -109,41 +109,491 @@ class BakeryProphetManager:
|
||||
}
|
||||
}
|
||||
|
||||
logger.info(f"Model trained successfully for {product_name}")
|
||||
logger.info(f"Optimized model trained successfully for {product_name}. "
|
||||
f"MAPE: {training_metrics.get('optimized_mape', 'N/A')}%")
|
||||
return model_info
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to train bakery model for {product_name}: {str(e)}")
|
||||
logger.error(f"Failed to train optimized bakery model for {product_name}: {str(e)}")
|
||||
raise
|
||||
|
||||
async def _optimize_hyperparameters(self,
|
||||
df: pd.DataFrame,
|
||||
product_name: str,
|
||||
regressor_columns: List[str]) -> Dict[str, Any]:
|
||||
"""
|
||||
Automatically optimize Prophet hyperparameters using Bayesian optimization.
|
||||
Simplified - no configuration needed.
|
||||
"""
|
||||
|
||||
# Determine product category automatically
|
||||
product_category = self._classify_product(product_name, df)
|
||||
|
||||
# Set optimization parameters based on category
|
||||
n_trials = {
|
||||
'high_volume': 30, # Reduced from 75 for speed
|
||||
'medium_volume': 25, # Reduced from 50
|
||||
'low_volume': 20, # Reduced from 30
|
||||
'intermittent': 15 # Reduced from 25
|
||||
}.get(product_category, 25)
|
||||
|
||||
logger.info(f"Product {product_name} classified as {product_category}, using {n_trials} trials")
|
||||
|
||||
# Check data quality and adjust strategy
|
||||
total_sales = df['y'].sum()
|
||||
zero_ratio = (df['y'] == 0).sum() / len(df)
|
||||
mean_sales = df['y'].mean()
|
||||
non_zero_days = len(df[df['y'] > 0])
|
||||
|
||||
logger.info(f"Data analysis for {product_name}: total_sales={total_sales:.1f}, "
|
||||
f"zero_ratio={zero_ratio:.2f}, mean_sales={mean_sales:.2f}, non_zero_days={non_zero_days}")
|
||||
|
||||
# Adjust strategy based on data characteristics
|
||||
if zero_ratio > 0.8 or non_zero_days < 30:
|
||||
logger.warning(f"Very sparse data for {product_name}, using minimal optimization")
|
||||
return {
|
||||
'changepoint_prior_scale': 0.001,
|
||||
'seasonality_prior_scale': 0.01,
|
||||
'holidays_prior_scale': 0.01,
|
||||
'changepoint_range': 0.8,
|
||||
'seasonality_mode': 'additive',
|
||||
'daily_seasonality': False,
|
||||
'weekly_seasonality': True,
|
||||
'yearly_seasonality': False
|
||||
}
|
||||
elif zero_ratio > 0.6:
|
||||
logger.info(f"Moderate sparsity for {product_name}, using conservative optimization")
|
||||
return {
|
||||
'changepoint_prior_scale': 0.01,
|
||||
'seasonality_prior_scale': 0.1,
|
||||
'holidays_prior_scale': 0.1,
|
||||
'changepoint_range': 0.8,
|
||||
'seasonality_mode': 'additive',
|
||||
'daily_seasonality': False,
|
||||
'weekly_seasonality': True,
|
||||
'yearly_seasonality': len(df) > 365 # Only if we have enough data
|
||||
}
|
||||
|
||||
# Use unique seed for each product to avoid identical results
|
||||
product_seed = hash(product_name) % 10000
|
||||
|
||||
def objective(trial):
|
||||
try:
|
||||
# Sample hyperparameters with product-specific ranges
|
||||
if product_category == 'high_volume':
|
||||
# More conservative for high volume (less overfitting)
|
||||
changepoint_scale_range = (0.001, 0.1)
|
||||
seasonality_scale_range = (1.0, 10.0)
|
||||
elif product_category == 'intermittent':
|
||||
# Very conservative for intermittent
|
||||
changepoint_scale_range = (0.001, 0.05)
|
||||
seasonality_scale_range = (0.01, 1.0)
|
||||
else:
|
||||
# Default ranges
|
||||
changepoint_scale_range = (0.001, 0.5)
|
||||
seasonality_scale_range = (0.01, 10.0)
|
||||
|
||||
params = {
|
||||
'changepoint_prior_scale': trial.suggest_float(
|
||||
'changepoint_prior_scale',
|
||||
changepoint_scale_range[0],
|
||||
changepoint_scale_range[1],
|
||||
log=True
|
||||
),
|
||||
'seasonality_prior_scale': trial.suggest_float(
|
||||
'seasonality_prior_scale',
|
||||
seasonality_scale_range[0],
|
||||
seasonality_scale_range[1],
|
||||
log=True
|
||||
),
|
||||
'holidays_prior_scale': trial.suggest_float('holidays_prior_scale', 0.01, 10.0, log=True),
|
||||
'changepoint_range': trial.suggest_float('changepoint_range', 0.8, 0.95),
|
||||
'seasonality_mode': 'additive' if product_category == 'high_volume' else trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative']),
|
||||
'daily_seasonality': trial.suggest_categorical('daily_seasonality', [True, False]),
|
||||
'weekly_seasonality': True, # Always keep weekly
|
||||
'yearly_seasonality': trial.suggest_categorical('yearly_seasonality', [True, False])
|
||||
}
|
||||
|
||||
# Simple 2-fold cross-validation for speed
|
||||
tscv = TimeSeriesSplit(n_splits=2)
|
||||
cv_scores = []
|
||||
|
||||
for train_idx, val_idx in tscv.split(df):
|
||||
train_data = df.iloc[train_idx].copy()
|
||||
val_data = df.iloc[val_idx].copy()
|
||||
|
||||
if len(val_data) < 7: # Need at least a week
|
||||
continue
|
||||
|
||||
try:
|
||||
# Create and train model
|
||||
model = Prophet(**params, interval_width=0.8, uncertainty_samples=100)
|
||||
|
||||
for regressor in regressor_columns:
|
||||
if regressor in train_data.columns:
|
||||
model.add_regressor(regressor)
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
model.fit(train_data)
|
||||
|
||||
# Predict on validation set
|
||||
future_df = model.make_future_dataframe(periods=0)
|
||||
for regressor in regressor_columns:
|
||||
if regressor in df.columns:
|
||||
future_df[regressor] = df[regressor].values[:len(future_df)]
|
||||
|
||||
forecast = model.predict(future_df)
|
||||
val_predictions = forecast['yhat'].iloc[train_idx[-1]+1:train_idx[-1]+1+len(val_data)]
|
||||
val_actual = val_data['y'].values
|
||||
|
||||
# Calculate MAPE with improved handling for low values
|
||||
if len(val_predictions) > 0 and len(val_actual) > 0:
|
||||
# Use MAE for very low sales values to avoid MAPE issues
|
||||
if val_actual.mean() < 1:
|
||||
mae = np.mean(np.abs(val_actual - val_predictions.values))
|
||||
# Convert MAE to percentage-like metric
|
||||
mape_like = (mae / max(val_actual.mean(), 0.1)) * 100
|
||||
else:
|
||||
non_zero_mask = val_actual > 0.1 # Use threshold instead of zero
|
||||
if np.sum(non_zero_mask) > 0:
|
||||
mape = np.mean(np.abs((val_actual[non_zero_mask] - val_predictions.values[non_zero_mask]) / val_actual[non_zero_mask])) * 100
|
||||
mape_like = min(mape, 200) # Cap at 200%
|
||||
else:
|
||||
mape_like = 100
|
||||
|
||||
if not np.isnan(mape_like) and not np.isinf(mape_like):
|
||||
cv_scores.append(mape_like)
|
||||
|
||||
except Exception as fold_error:
|
||||
logger.debug(f"Fold failed for {product_name} trial {trial.number}: {str(fold_error)}")
|
||||
continue
|
||||
|
||||
return np.mean(cv_scores) if len(cv_scores) > 0 else 100.0
|
||||
|
||||
except Exception as trial_error:
|
||||
logger.debug(f"Trial {trial.number} failed for {product_name}: {str(trial_error)}")
|
||||
return 100.0
|
||||
|
||||
# Run optimization with product-specific seed
|
||||
study = optuna.create_study(
|
||||
direction='minimize',
|
||||
sampler=optuna.samplers.TPESampler(seed=product_seed) # Unique seed per product
|
||||
)
|
||||
study.optimize(objective, n_trials=n_trials, timeout=600, show_progress_bar=False)
|
||||
|
||||
# Return best parameters
|
||||
best_params = study.best_params
|
||||
best_score = study.best_value
|
||||
|
||||
logger.info(f"Optimization completed for {product_name}. Best score: {best_score:.2f}%. "
|
||||
f"Parameters: {best_params}")
|
||||
return best_params
|
||||
|
||||
def _classify_product(self, product_name: str, sales_data: pd.DataFrame) -> str:
|
||||
"""Automatically classify product for optimization strategy - improved for bakery data"""
|
||||
product_lower = product_name.lower()
|
||||
|
||||
# Calculate sales statistics
|
||||
total_sales = sales_data['y'].sum()
|
||||
mean_sales = sales_data['y'].mean()
|
||||
zero_ratio = (sales_data['y'] == 0).sum() / len(sales_data)
|
||||
non_zero_days = len(sales_data[sales_data['y'] > 0])
|
||||
|
||||
logger.info(f"Product classification for {product_name}: total_sales={total_sales:.1f}, "
|
||||
f"mean_sales={mean_sales:.2f}, zero_ratio={zero_ratio:.2f}, non_zero_days={non_zero_days}")
|
||||
|
||||
# Improved classification logic for bakery products
|
||||
# Consider both volume and consistency
|
||||
|
||||
# Check for truly intermittent demand (high zero ratio)
|
||||
if zero_ratio > 0.8 or non_zero_days < 30:
|
||||
return 'intermittent'
|
||||
|
||||
# High volume products (consistent daily sales)
|
||||
if any(pattern in product_lower for pattern in ['cafe', 'pan', 'bread', 'coffee']):
|
||||
# Even if absolute volume is low, these are core products
|
||||
return 'high_volume' if zero_ratio < 0.3 else 'medium_volume'
|
||||
|
||||
# Volume-based classification for other products
|
||||
if mean_sales >= 10 and zero_ratio < 0.4:
|
||||
return 'high_volume'
|
||||
elif mean_sales >= 5 and zero_ratio < 0.6:
|
||||
return 'medium_volume'
|
||||
elif mean_sales >= 2 and zero_ratio < 0.7:
|
||||
return 'low_volume'
|
||||
else:
|
||||
return 'intermittent'
|
||||
|
||||
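# --- Worked example (not part of this commit) ---
# How the thresholds above play out for a made-up product sold inside the
# service (settings must be importable for the constructor to run):
import pandas as pd

_sales = pd.DataFrame({"y": ([7] * 8 + [0] * 2) * 10})  # mean 5.6, 20% zero days
category = BakeryProphetManager()._classify_product("magdalena", _sales)
# mean_sales = 5.6 (>= 5) and zero_ratio = 0.2 (< 0.6)  ->  'medium_volume';
# with mean >= 10 it would be 'high_volume', with > 80% zero days 'intermittent'.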
def _create_optimized_prophet_model(self, optimized_params: Dict[str, Any], regressor_columns: List[str]) -> Prophet:
|
||||
"""Create Prophet model with optimized parameters"""
|
||||
holidays = self._get_spanish_holidays()
|
||||
|
||||
model = Prophet(
|
||||
holidays=holidays if not holidays.empty else None,
|
||||
daily_seasonality=optimized_params.get('daily_seasonality', True),
|
||||
weekly_seasonality=optimized_params.get('weekly_seasonality', True),
|
||||
yearly_seasonality=optimized_params.get('yearly_seasonality', True),
|
||||
seasonality_mode=optimized_params.get('seasonality_mode', 'additive'),
|
||||
changepoint_prior_scale=optimized_params.get('changepoint_prior_scale', 0.05),
|
||||
seasonality_prior_scale=optimized_params.get('seasonality_prior_scale', 10.0),
|
||||
holidays_prior_scale=optimized_params.get('holidays_prior_scale', 10.0),
|
||||
changepoint_range=optimized_params.get('changepoint_range', 0.8),
|
||||
interval_width=0.8,
|
||||
mcmc_samples=0,
|
||||
uncertainty_samples=1000
|
||||
)
|
||||
|
||||
return model
|
||||
|
||||
# All the existing methods remain the same, just with enhanced metrics
|
||||
|
||||
async def _calculate_training_metrics(self,
|
||||
model: Prophet,
|
||||
training_data: pd.DataFrame,
|
||||
optimized_params: Dict[str, Any] = None) -> Dict[str, float]:
|
||||
"""Calculate training metrics with optimization info and improved MAPE handling"""
|
||||
try:
|
||||
# Generate in-sample predictions
|
||||
forecast = model.predict(training_data[['ds'] + [col for col in training_data.columns if col not in ['ds', 'y']]])
|
||||
|
||||
# Calculate metrics
|
||||
y_true = training_data['y'].values
|
||||
y_pred = forecast['yhat'].values
|
||||
|
||||
# Basic metrics
|
||||
mae = mean_absolute_error(y_true, y_pred)
|
||||
mse = mean_squared_error(y_true, y_pred)
|
||||
rmse = np.sqrt(mse)
|
||||
|
||||
# Improved MAPE calculation for bakery data
|
||||
mean_actual = y_true.mean()
|
||||
median_actual = np.median(y_true[y_true > 0]) if np.any(y_true > 0) else 1.0
|
||||
|
||||
# Use different strategies based on sales volume
|
||||
if mean_actual < 2.0:
|
||||
# For very low volume products, use normalized MAE
|
||||
normalized_mae = mae / max(median_actual, 1.0)
|
||||
mape = min(normalized_mae * 100, 200) # Cap at 200%
|
||||
logger.info(f"Using normalized MAE for low-volume product (mean={mean_actual:.2f})")
|
||||
elif mean_actual < 5.0:
|
||||
# For low-medium volume, use modified MAPE with higher threshold
|
||||
threshold = 1.0
|
||||
valid_mask = y_true >= threshold
|
||||
|
||||
if np.sum(valid_mask) == 0:
|
||||
mape = 150.0 # High but not extreme
|
||||
else:
|
||||
mape_values = np.abs((y_true[valid_mask] - y_pred[valid_mask]) / y_true[valid_mask])
|
||||
mape = np.median(mape_values) * 100 # Use median instead of mean to reduce outlier impact
|
||||
mape = min(mape, 150) # Cap at reasonable level
|
||||
else:
|
||||
# Standard MAPE for higher volume products
|
||||
threshold = 0.5
|
||||
valid_mask = y_true > threshold
|
||||
|
||||
if np.sum(valid_mask) == 0:
|
||||
mape = 100.0
|
||||
else:
|
||||
mape_values = np.abs((y_true[valid_mask] - y_pred[valid_mask]) / y_true[valid_mask])
|
||||
mape = np.mean(mape_values) * 100
|
||||
|
||||
# Cap MAPE at reasonable maximum
|
||||
if math.isinf(mape) or math.isnan(mape) or mape > 200:
|
||||
mape = min(200.0, (mae / max(mean_actual, 1.0)) * 100)
|
||||
|
||||
# R-squared
|
||||
ss_res = np.sum((y_true - y_pred) ** 2)
|
||||
ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
|
||||
r2 = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0.0
|
||||
|
||||
# Calculate realistic improvement estimate based on actual product performance
|
||||
# Use more granular categories and realistic baselines
|
||||
total_sales = training_data['y'].sum()
|
||||
zero_ratio = (training_data['y'] == 0).sum() / len(training_data)
|
||||
mean_sales = training_data['y'].mean()
|
||||
non_zero_days = len(training_data[training_data['y'] > 0])
|
||||
|
||||
# More nuanced categorization
|
||||
if zero_ratio > 0.8 or non_zero_days < 30:
|
||||
category = 'very_sparse'
|
||||
baseline_mape = 80.0
|
||||
elif zero_ratio > 0.6:
|
||||
category = 'sparse'
|
||||
baseline_mape = 60.0
|
||||
elif mean_sales >= 10 and zero_ratio < 0.3:
|
||||
category = 'high_volume'
|
||||
baseline_mape = 25.0
|
||||
elif mean_sales >= 5 and zero_ratio < 0.5:
|
||||
category = 'medium_volume'
|
||||
baseline_mape = 35.0
|
||||
else:
|
||||
category = 'low_volume'
|
||||
baseline_mape = 45.0
|
||||
|
||||
# Calculate improvement - be more conservative
|
||||
if mape < baseline_mape * 0.8: # Only claim improvement if significant
|
||||
improvement_pct = (baseline_mape - mape) / baseline_mape * 100
|
||||
else:
|
||||
improvement_pct = 0 # No meaningful improvement
|
||||
|
||||
# Quality score based on data characteristics
|
||||
quality_score = max(0.1, min(1.0, (1 - zero_ratio) * (non_zero_days / len(training_data))))
|
||||
|
||||
# Enhanced metrics with optimization info
|
||||
metrics = {
|
||||
"mae": round(mae, 2),
|
||||
"mse": round(mse, 2),
|
||||
"rmse": round(rmse, 2),
|
||||
"mape": round(mape, 2),
|
||||
"r2": round(r2, 3),
|
||||
"optimized": True,
|
||||
"optimized_mape": round(mape, 2),
|
||||
"baseline_mape_estimate": round(baseline_mape, 2),
|
||||
"improvement_estimated": round(improvement_pct, 1),
|
||||
"product_category": category,
|
||||
"data_quality_score": round(quality_score, 2),
|
||||
"mean_sales_volume": round(mean_sales, 2),
|
||||
"sales_consistency": round(non_zero_days / len(training_data), 2),
|
||||
"total_demand": round(total_sales, 1)
|
||||
}
|
||||
|
||||
logger.info(f"Training metrics calculated: MAPE={mape:.1f}%, "
|
||||
f"Category={category}, Improvement={improvement_pct:.1f}%")
|
||||
|
||||
return metrics
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error calculating training metrics: {str(e)}")
|
||||
return {
|
||||
"mae": 0.0, "mse": 0.0, "rmse": 0.0, "mape": 100.0, "r2": 0.0,
|
||||
"optimized": False, "improvement_estimated": 0.0
|
||||
}
|
||||
|
||||
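# --- Worked example (not part of this commit) ---
# For a low-volume product the metrics block above swaps plain MAPE for a
# normalized MAE, e.g. with mean(y_true) = 1.2, MAE = 0.6 and a non-zero median
# of 2.0:
#   normalized_mae = 0.6 / max(2.0, 1.0) = 0.3  ->  reported "mape" = 30.0 (capped at 200)
# whereas a plain MAPE over days with y = 0 would be undefined or wildly inflated.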
async def _store_model(self,
|
||||
tenant_id: str,
|
||||
product_name: str,
|
||||
model: Prophet,
|
||||
model_id: str,
|
||||
training_data: pd.DataFrame,
|
||||
regressor_columns: List[str],
|
||||
optimized_params: Dict[str, Any] = None,
|
||||
training_metrics: Dict[str, Any] = None) -> str:
|
||||
"""Store model with database integration"""
|
||||
|
||||
# Create model directory
|
||||
model_dir = Path(settings.MODEL_STORAGE_PATH) / tenant_id
|
||||
model_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Store model file
|
||||
model_path = model_dir / f"{model_id}.pkl"
|
||||
joblib.dump(model, model_path)
|
||||
|
||||
# Enhanced metadata
|
||||
metadata = {
|
||||
"model_id": model_id,
|
||||
"tenant_id": tenant_id,
|
||||
"product_name": product_name,
|
||||
"regressor_columns": regressor_columns,
|
||||
"training_samples": len(training_data),
|
||||
"data_period": {
|
||||
"start_date": training_data['ds'].min().isoformat(),
|
||||
"end_date": training_data['ds'].max().isoformat()
|
||||
},
|
||||
"optimized": True,
|
||||
"optimized_parameters": optimized_params or {},
|
||||
"created_at": datetime.now().isoformat(),
|
||||
"model_type": "prophet_optimized",
|
||||
"file_path": str(model_path)
|
||||
}
|
||||
|
||||
metadata_path = model_path.with_suffix('.json')
|
||||
with open(metadata_path, 'w') as f:
|
||||
json.dump(metadata, f, indent=2, default=str)
|
||||
|
||||
# Store in memory
|
||||
model_key = f"{tenant_id}:{product_name}"
|
||||
self.models[model_key] = model
|
||||
self.model_metadata[model_key] = metadata
|
||||
|
||||
# 🆕 NEW: Store in database
|
||||
if self.db_session:
|
||||
try:
|
||||
# Deactivate previous models for this product
|
||||
await self._deactivate_previous_models(tenant_id, product_name)
|
||||
|
||||
# Create new database record
|
||||
db_model = TrainedModel(
|
||||
id=model_id,
|
||||
tenant_id=tenant_id,
|
||||
product_name=product_name,
|
||||
model_type="prophet_optimized",
|
||||
job_id=model_id.split('_')[0], # Extract job_id from model_id
|
||||
model_path=str(model_path),
|
||||
metadata_path=str(metadata_path),
|
||||
hyperparameters=optimized_params or {},
|
||||
features_used=regressor_columns,
|
||||
is_active=True,
|
||||
is_production=True, # New models are production-ready
|
||||
training_start_date=training_data['ds'].min(),
|
||||
training_end_date=training_data['ds'].max(),
|
||||
training_samples=len(training_data)
|
||||
)
|
||||
|
||||
# Add training metrics if available
|
||||
if training_metrics:
|
||||
db_model.mape = training_metrics.get('mape')
|
||||
db_model.mae = training_metrics.get('mae')
|
||||
db_model.rmse = training_metrics.get('rmse')
|
||||
db_model.r2_score = training_metrics.get('r2')
|
||||
db_model.data_quality_score = training_metrics.get('data_quality_score')
|
||||
|
||||
self.db_session.add(db_model)
|
||||
await self.db_session.commit()
|
||||
|
||||
logger.info(f"Model {model_id} stored in database successfully")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store model in database: {str(e)}")
|
||||
await self.db_session.rollback()
|
||||
# Continue execution - file storage succeeded
|
||||
|
||||
logger.info(f"Optimized model stored at: {model_path}")
|
||||
return str(model_path)
|
||||
|
||||
    async def _deactivate_previous_models(self, tenant_id: str, product_name: str):
        """Deactivate previous models for the same product"""
        if self.db_session:
            try:
                # Update previous models to inactive.
                # Raw SQL strings must be wrapped in text() before being passed
                # to AsyncSession.execute().
                from sqlalchemy import text

                query = text("""
                    UPDATE trained_models
                    SET is_active = false, is_production = false
                    WHERE tenant_id = :tenant_id AND product_name = :product_name
                """)
                await self.db_session.execute(query, {
                    "tenant_id": tenant_id,
                    "product_name": product_name
                })

            except Exception as e:
                logger.error(f"Failed to deactivate previous models: {str(e)}")

# Keep all existing methods unchanged
|
||||
    async def generate_forecast(self,
                                model_path: str,
                                future_dates: pd.DataFrame,
                                regressor_columns: List[str]) -> pd.DataFrame:
-        """
-        Generate forecast using a stored Prophet model.
-
-        Args:
-            model_path: Path to the stored model
-            future_dates: DataFrame with future dates and regressors
-            regressor_columns: List of regressor column names
-
-        Returns:
-            DataFrame with forecast results
-        """
+        """Generate forecast using stored model (unchanged)"""
        try:
            # Load the model
            model = joblib.load(model_path)

            # Validate future data has required regressors
            for regressor in regressor_columns:
                if regressor not in future_dates.columns:
                    logger.warning(f"Missing regressor {regressor}, filling with default value 0")
-                    future_dates[regressor] = 0  # Default value
+                    future_dates[regressor] = 0

            # Generate forecast
            forecast = model.predict(future_dates)

            return forecast

        except Exception as e:
@@ -151,7 +601,7 @@ class BakeryProphetManager:
|
||||
raise

    async def _validate_training_data(self, df: pd.DataFrame, product_name: str):
        """Validate training data quality"""
        """Validate training data quality (unchanged)"""
        if df.empty:
            raise ValueError(f"No training data available for {product_name}")

@@ -166,65 +616,47 @@ class BakeryProphetManager:
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Check for valid date range
        if df['ds'].isna().any():
            raise ValueError("Invalid dates found in training data")

        # Check for valid target values
        if df['y'].isna().all():
            raise ValueError("No valid target values found")

    async def _prepare_prophet_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare data for Prophet training"""
        """Prepare data for Prophet training with timezone handling"""
        prophet_data = df.copy()

        # Prophet column mapping
        if 'date' in prophet_data.columns:
            prophet_data['ds'] = prophet_data['date']
        if 'quantity' in prophet_data.columns:
            prophet_data['y'] = prophet_data['quantity']

        # ✅ CRITICAL FIX: Remove timezone from ds column
        if 'ds' in prophet_data.columns:
            prophet_data['ds'] = pd.to_datetime(prophet_data['ds']).dt.tz_localize(None)
            logger.info("Removed timezone from ds column")
        if 'ds' not in prophet_data.columns:
            raise ValueError("Missing 'ds' column in training data")
        if 'y' not in prophet_data.columns:
            raise ValueError("Missing 'y' column in training data")

        # Handle missing values in target
        if prophet_data['y'].isna().any():
            logger.warning("Filling missing target values with interpolation")
            prophet_data['y'] = prophet_data['y'].interpolate(method='linear')
        # Convert to datetime and remove timezone information
        prophet_data['ds'] = pd.to_datetime(prophet_data['ds'])

        # Remove extreme outliers (values > 3 standard deviations)
        mean_val = prophet_data['y'].mean()
        std_val = prophet_data['y'].std()
        # Remove timezone if present (Prophet doesn't support timezones)
        if prophet_data['ds'].dt.tz is not None:
            logger.info("Removing timezone information from 'ds' column for Prophet compatibility")
            prophet_data['ds'] = prophet_data['ds'].dt.tz_localize(None)

        if std_val > 0:  # Avoid division by zero
            lower_bound = mean_val - 3 * std_val
            upper_bound = mean_val + 3 * std_val

            before_count = len(prophet_data)
            prophet_data = prophet_data[
                (prophet_data['y'] >= lower_bound) &
                (prophet_data['y'] <= upper_bound)
            ]
            after_count = len(prophet_data)

            if before_count != after_count:
                logger.info(f"Removed {before_count - after_count} outliers")

        # Ensure chronological order
        # Sort by date and clean data
        prophet_data = prophet_data.sort_values('ds').reset_index(drop=True)
        prophet_data['y'] = pd.to_numeric(prophet_data['y'], errors='coerce')
        prophet_data = prophet_data.dropna(subset=['y'])

        # Fill missing values in regressors
        numeric_columns = prophet_data.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            if col != 'y' and prophet_data[col].isna().any():
                prophet_data[col] = prophet_data[col].fillna(prophet_data[col].median())
        # Additional data cleaning for Prophet
        # Remove any duplicate dates (keep last occurrence)
        prophet_data = prophet_data.drop_duplicates(subset=['ds'], keep='last')

        # Ensure y values are non-negative (Prophet works better with non-negative values)
        prophet_data['y'] = prophet_data['y'].clip(lower=0)

        logger.info(f"Prepared Prophet data: {len(prophet_data)} rows, date range: {prophet_data['ds'].min()} to {prophet_data['ds'].max()}")

        return prophet_data
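
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): the minimal frame shape that
# _prepare_prophet_data aims to produce - a timezone-naive 'ds' datetime column
# and a clean, non-negative numeric 'y' column. Values are made up for illustration.
import pandas as pd

example = pd.DataFrame({
    "ds": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03"]),
    "y": [120.0, 95.0, 143.0],
})
assert example["ds"].dt.tz is None  # Prophet requires timezone-naive timestamps
assert (example["y"] >= 0).all()    # non-negative targets, as enforced above
# ---------------------------------------------------------------------------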

    def _extract_regressor_columns(self, df: pd.DataFrame) -> List[str]:
        """Extract regressor columns from the dataframe"""
        """Extract regressor columns (unchanged)"""
        excluded_columns = ['ds', 'y']
        regressor_columns = []

@@ -235,190 +667,32 @@ class BakeryProphetManager:
        logger.info(f"Identified regressor columns: {regressor_columns}")
        return regressor_columns

    def _create_prophet_model(self, regressor_columns: List[str]) -> Prophet:
        """Create Prophet model with bakery-specific settings"""

        # Get Spanish holidays
        holidays = self._get_spanish_holidays()

        # Bakery-specific Prophet configuration
        model = Prophet(
            holidays=holidays if not holidays.empty else None,
            daily_seasonality=settings.PROPHET_DAILY_SEASONALITY,
            weekly_seasonality=settings.PROPHET_WEEKLY_SEASONALITY,
            yearly_seasonality=settings.PROPHET_YEARLY_SEASONALITY,
            seasonality_mode=settings.PROPHET_SEASONALITY_MODE,
            changepoint_prior_scale=0.05,  # Conservative changepoint detection
            seasonality_prior_scale=10,  # Strong seasonality for bakeries
            holidays_prior_scale=10,  # Strong holiday effects
            interval_width=0.8,  # 80% confidence intervals
            mcmc_samples=0,  # Use MAP estimation (faster)
            uncertainty_samples=1000  # For uncertainty estimation
        )

        return model
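
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): how the regressor columns passed
# to _create_prophet_model would typically be registered with Prophet before
# fitting. The helper name and column handling are assumptions for illustration.
from prophet import Prophet
import pandas as pd


def fit_with_regressors(model: Prophet, training_data: pd.DataFrame, regressor_columns: list) -> Prophet:
    # Every extra feature column must be declared before fit() is called
    for column in regressor_columns:
        model.add_regressor(column)
    # training_data must contain 'ds', 'y' and all declared regressor columns
    model.fit(training_data)
    return model
# ---------------------------------------------------------------------------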

    def _get_spanish_holidays(self) -> pd.DataFrame:
        """Get Spanish holidays for Prophet model"""
        """Get Spanish holidays (unchanged)"""
        try:
            # Define major Spanish holidays that affect bakery sales
            holidays_list = []

            years = range(2020, 2030)  # Cover training and prediction period
            years = range(2020, 2030)

            for year in years:
                holidays_list.extend([
                    {'holiday': 'new_year', 'ds': f'{year}-01-01'},
                    {'holiday': 'epiphany', 'ds': f'{year}-01-06'},
                    {'holiday': 'may_day', 'ds': f'{year}-05-01'},
                    {'holiday': 'labor_day', 'ds': f'{year}-05-01'},
                    {'holiday': 'assumption', 'ds': f'{year}-08-15'},
                    {'holiday': 'national_day', 'ds': f'{year}-10-12'},
                    {'holiday': 'all_saints', 'ds': f'{year}-11-01'},
                    {'holiday': 'constitution', 'ds': f'{year}-12-06'},
                    {'holiday': 'immaculate', 'ds': f'{year}-12-08'},
                    {'holiday': 'christmas', 'ds': f'{year}-12-25'},

                    # Madrid specific holidays
                    {'holiday': 'madrid_patron', 'ds': f'{year}-05-15'},  # San Isidro
                    {'holiday': 'madrid_community', 'ds': f'{year}-05-02'},
                    {'holiday': 'constitution_day', 'ds': f'{year}-12-06'},
                    {'holiday': 'immaculate_conception', 'ds': f'{year}-12-08'},
                    {'holiday': 'christmas', 'ds': f'{year}-12-25'}
                ])

            holidays_df = pd.DataFrame(holidays_list)
            holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
            return holidays_df
            if holidays_list:
                holidays_df = pd.DataFrame(holidays_list)
                holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])
                return holidays_df
            return pd.DataFrame()

        except Exception as e:
            logger.warning(f"Error creating holidays dataframe: {e}")
            logger.warning(f"Could not load Spanish holidays: {str(e)}")
            return pd.DataFrame()

    async def _store_model(self,
                           tenant_id: str,
                           product_name: str,
                           model: Prophet,
                           model_id: str,
                           training_data: pd.DataFrame,
                           regressor_columns: List[str]) -> str:
        """Store model and metadata to filesystem"""

        # Create model filename
        model_filename = f"{model_id}_prophet_model.pkl"
        model_path = os.path.join(settings.MODEL_STORAGE_PATH, model_filename)

        # Store the model
        joblib.dump(model, model_path)

        # Store metadata
        metadata = {
            "tenant_id": tenant_id,
            "product_name": product_name,
            "model_id": model_id,
            "regressor_columns": regressor_columns,
            "training_samples": len(training_data),
            "training_period": {
                "start": training_data['ds'].min().isoformat(),
                "end": training_data['ds'].max().isoformat()
            },
            "created_at": datetime.now().isoformat(),
            "model_type": "prophet",
            "file_path": model_path
        }

        metadata_path = model_path.replace('.pkl', '_metadata.json')
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        # Store in memory for quick access
        model_key = f"{tenant_id}:{product_name}"
        self.models[model_key] = model
        self.model_metadata[model_key] = metadata

        logger.info(f"Model stored at: {model_path}")
        return model_path
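
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): loading a stored model and its
# metadata back from disk, mirroring the naming convention used in _store_model.
import json
import joblib


def load_stored_model(model_path: str):
    # Reload the pickled Prophet model and the JSON metadata written next to it
    model = joblib.load(model_path)
    metadata_path = model_path.replace('.pkl', '_metadata.json')
    with open(metadata_path) as f:
        metadata = json.load(f)
    return model, metadata
# ---------------------------------------------------------------------------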

    async def _calculate_training_metrics(self,
                                          model: Prophet,
                                          training_data: pd.DataFrame) -> Dict[str, float]:
        """Calculate training metrics for the model"""
        try:
            # Generate in-sample predictions
            forecast = model.predict(training_data[['ds'] + [col for col in training_data.columns if col not in ['ds', 'y']]])

            # Calculate metrics
            y_true = training_data['y'].values
            y_pred = forecast['yhat'].values

            # Basic metrics
            mae = mean_absolute_error(y_true, y_pred)
            mse = mean_squared_error(y_true, y_pred)
            rmse = np.sqrt(mse)

            # MAPE (Mean Absolute Percentage Error)
            non_zero_mask = y_true != 0
            if np.sum(non_zero_mask) == 0:
                mape = 0.0  # Return 0 instead of Infinity
            else:
                mape_values = np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])
                mape = np.mean(mape_values) * 100
                if math.isinf(mape) or math.isnan(mape):
                    mape = 0.0

            # R-squared
            r2 = r2_score(y_true, y_pred)

            return {
                "mae": round(mae, 2),
                "mse": round(mse, 2),
                "rmse": round(rmse, 2),
                "mape": round(mape, 2),
                "r2_score": round(r2, 4),
                "mean_actual": round(np.mean(y_true), 2),
                "mean_predicted": round(np.mean(y_pred), 2)
            }

        except Exception as e:
            logger.error(f"Error calculating training metrics: {e}")
            return {
                "mae": 0.0,
                "mse": 0.0,
                "rmse": 0.0,
                "mape": 0.0,
                "r2_score": 0.0,
                "mean_actual": 0.0,
                "mean_predicted": 0.0
            }

    def get_model_info(self, tenant_id: str, product_name: str) -> Optional[Dict[str, Any]]:
        """Get model information for a specific tenant and product"""
        model_key = f"{tenant_id}:{product_name}"
        return self.model_metadata.get(model_key)

    def list_models(self, tenant_id: str) -> List[Dict[str, Any]]:
        """List all models for a tenant"""
        tenant_models = []

        for model_key, metadata in self.model_metadata.items():
            if metadata['tenant_id'] == tenant_id:
                tenant_models.append(metadata)

        return tenant_models

    async def cleanup_old_models(self, days_old: int = 30):
        """Clean up old model files"""
        try:
            cutoff_date = datetime.now() - timedelta(days=days_old)

            for model_path in Path(settings.MODEL_STORAGE_PATH).glob("*.pkl"):
                # Check file modification time
                if model_path.stat().st_mtime < cutoff_date.timestamp():
                    # Remove model and metadata files
                    model_path.unlink()

                    metadata_path = model_path.with_suffix('.json')
                    if metadata_path.exists():
                        metadata_path.unlink()

                    logger.info(f"Cleaned up old model: {model_path}")

        except Exception as e:
            logger.error(f"Error during model cleanup: {e}")

@@ -1,77 +1,76 @@
# services/training/app/ml/trainer.py
"""
ML Trainer for Training Service
Orchestrates the complete training process
ML Trainer - Main ML pipeline coordinator
Receives prepared data and orchestrates the complete ML training process
"""

from typing import Dict, List, Any, Optional, Tuple
from typing import Dict, List, Any, Optional
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from datetime import datetime
import logging
import asyncio
import uuid
from pathlib import Path

from app.ml.prophet_manager import BakeryProphetManager
from app.ml.data_processor import BakeryDataProcessor
from app.ml.prophet_manager import BakeryProphetManager
from app.services.training_orchestrator import TrainingDataSet
from app.core.config import settings

from sqlalchemy.ext.asyncio import AsyncSession

logger = logging.getLogger(__name__)

class BakeryMLTrainer:
    """
    Main ML trainer that orchestrates the complete training process.
    Replaces the old Celery-based training system with clean async implementation.
    Main ML trainer that orchestrates the complete ML training pipeline.
    Receives prepared TrainingDataSet and coordinates data processing and model training.
    """

    def __init__(self):
        self.prophet_manager = BakeryProphetManager()
    def __init__(self, db_session: AsyncSession = None):
        self.data_processor = BakeryDataProcessor()
        self.prophet_manager = BakeryProphetManager(db_session=db_session)

    async def train_tenant_models(self,
                                  tenant_id: str,
                                  sales_data: List[Dict],
                                  weather_data: List[Dict] = None,
                                  traffic_data: List[Dict] = None,
                                  job_id: str = None) -> Dict[str, Any]:
                                  training_dataset: TrainingDataSet,
                                  job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Train models for all products of a tenant.
        Train models for all products using prepared training dataset.

        Args:
            tenant_id: Tenant identifier
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            training_dataset: Prepared training dataset with aligned dates
            job_id: Training job identifier

        Returns:
            Dictionary with training results for each product
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{uuid.uuid4().hex[:8]}"
            job_id = f"ml_training_{tenant_id}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting training job {job_id} for tenant {tenant_id}")
        logger.info(f"Starting ML training pipeline {job_id} for tenant {tenant_id}")

        try:
            # Convert input data to DataFrames
            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()
            # Convert sales data to DataFrame
            sales_df = pd.DataFrame(training_dataset.sales_data)
            weather_df = pd.DataFrame(training_dataset.weather_data)
            traffic_df = pd.DataFrame(training_dataset.traffic_data)

            # Validate input data
            await self._validate_input_data(sales_df, tenant_id)

            # Get unique products
            # Get unique products from the sales data
            products = sales_df['product_name'].unique().tolist()
            logger.info(f"Training models for {len(products)} products: {products}")

            # Process data for each product
            logger.info("Processing data for all products...")
            processed_data = await self._process_all_products(
                sales_df, weather_df, traffic_df, products
            )

            # Train models for each product
            # Train models for each processed product
            logger.info("Training models for all products...")
            training_results = await self._train_all_models(
                tenant_id, processed_data, job_id
            )
@@ -85,50 +84,56 @@ class BakeryMLTrainer:
                "status": "completed",
                "products_trained": len([r for r in training_results.values() if r.get('status') == 'success']),
                "products_failed": len([r for r in training_results.values() if r.get('status') == 'error']),
                "products_skipped": len([r for r in training_results.values() if r.get('status') == 'skipped']),
                "total_products": len(products),
                "training_results": training_results,
                "summary": summary,
                "data_info": {
                    "date_range": {
                        "start": training_dataset.date_range.start.isoformat(),
                        "end": training_dataset.date_range.end.isoformat(),
                        "duration_days": (training_dataset.date_range.end - training_dataset.date_range.start).days
                    },
                    "data_sources": [source.value for source in training_dataset.date_range.available_sources],
                    "constraints_applied": training_dataset.date_range.constraints
                },
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Training job {job_id} completed successfully")
            logger.info(f"ML training pipeline {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Training job {job_id} failed: {str(e)}")
            logger.error(f"ML training pipeline {job_id} failed: {str(e)}")
            raise

    async def train_single_product(self,
                                   tenant_id: str,
                                   product_name: str,
                                   sales_data: List[Dict],
                                   weather_data: List[Dict] = None,
                                   traffic_data: List[Dict] = None,
                                   job_id: str = None) -> Dict[str, Any]:
    async def train_single_product_model(self,
                                         tenant_id: str,
                                         product_name: str,
                                         training_dataset: TrainingDataSet,
                                         job_id: Optional[str] = None) -> Dict[str, Any]:
        """
        Train model for a single product.
        Train model for a single product using prepared training dataset.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            sales_data: Historical sales data
            weather_data: Weather data (optional)
            traffic_data: Traffic data (optional)
            training_dataset: Prepared training dataset
            job_id: Training job identifier

        Returns:
            Training result for the product
        """
        if not job_id:
            job_id = f"training_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"
            job_id = f"single_ml_{tenant_id}_{product_name}_{uuid.uuid4().hex[:8]}"

        logger.info(f"Starting single product training {job_id} for {product_name}")
        logger.info(f"Starting single product ML training {job_id} for {product_name}")

        try:
            # Convert input data to DataFrames
            sales_df = pd.DataFrame(sales_data) if sales_data else pd.DataFrame()
            weather_df = pd.DataFrame(weather_data) if weather_data else pd.DataFrame()
            traffic_df = pd.DataFrame(traffic_data) if traffic_data else pd.DataFrame()
            # Convert training data to DataFrames
            sales_df = pd.DataFrame(training_dataset.sales_data)
            weather_df = pd.DataFrame(training_dataset.weather_data)
            traffic_df = pd.DataFrame(training_dataset.traffic_data)

            # Filter sales data for the specific product
            product_sales = sales_df[sales_df['product_name'] == product_name].copy()
@@ -137,7 +142,7 @@ class BakeryMLTrainer:
            if product_sales.empty:
                raise ValueError(f"No sales data found for product: {product_name}")

            # Prepare training data
            # Process data for this specific product
            processed_data = await self.data_processor.prepare_training_data(
                sales_data=product_sales,
                weather_data=weather_df,
@@ -160,29 +165,38 @@ class BakeryMLTrainer:
                "status": "success",
                "model_info": model_info,
                "data_points": len(processed_data),
                "data_info": {
                    "date_range": {
                        "start": training_dataset.date_range.start.isoformat(),
                        "end": training_dataset.date_range.end.isoformat(),
                        "duration_days": (training_dataset.date_range.end - training_dataset.date_range.start).days
                    },
                    "data_sources": [source.value for source in training_dataset.date_range.available_sources],
                    "constraints_applied": training_dataset.date_range.constraints
                },
                "completed_at": datetime.now().isoformat()
            }

            logger.info(f"Single product training {job_id} completed successfully")
            logger.info(f"Single product ML training {job_id} completed successfully")
            return result

        except Exception as e:
            logger.error(f"Single product training {job_id} failed: {str(e)}")
            logger.error(f"Single product ML training {job_id} failed: {str(e)}")
            raise

    async def evaluate_model_performance(self,
                                         tenant_id: str,
                                         product_name: str,
                                         model_path: str,
                                         test_data: List[Dict]) -> Dict[str, Any]:
                                         test_dataset: TrainingDataSet) -> Dict[str, Any]:
        """
        Evaluate model performance on test data.
        Evaluate model performance using test dataset.

        Args:
            tenant_id: Tenant identifier
            product_name: Product name
            model_path: Path to the trained model
            test_data: Test data for evaluation
            test_dataset: Test dataset for evaluation

        Returns:
            Performance metrics
@@ -190,46 +204,75 @@ class BakeryMLTrainer:
        try:
            logger.info(f"Evaluating model performance for {product_name}")

            # Convert test data to DataFrame
            test_df = pd.DataFrame(test_data)
            # Convert test data to DataFrames
            test_sales_df = pd.DataFrame(test_dataset.sales_data)
            test_weather_df = pd.DataFrame(test_dataset.weather_data)
            test_traffic_df = pd.DataFrame(test_dataset.traffic_data)

            # Prepare test data
            test_prepared = await self.data_processor.prepare_prediction_features(
                future_dates=test_df['ds'],
                weather_forecast=test_df if 'temperature' in test_df.columns else pd.DataFrame(),
                traffic_forecast=test_df if 'traffic_volume' in test_df.columns else pd.DataFrame()
            # Filter for specific product
            product_test_sales = test_sales_df[test_sales_df['product_name'] == product_name].copy()

            if product_test_sales.empty:
                raise ValueError(f"No test data found for product: {product_name}")

            # Process test data
            processed_test_data = await self.data_processor.prepare_training_data(
                sales_data=product_test_sales,
                weather_data=test_weather_df,
                traffic_data=test_traffic_df,
                product_name=product_name
            )

            # Get regressor columns
            regressor_columns = [col for col in test_prepared.columns if col not in ['ds', 'y']]
            # Create future dataframe for prediction
            future_dates = processed_test_data[['ds']].copy()

            # Add regressor columns
            regressor_columns = [col for col in processed_test_data.columns if col not in ['ds', 'y']]
            for col in regressor_columns:
                future_dates[col] = processed_test_data[col]

            # Generate predictions
            forecast = await self.prophet_manager.generate_forecast(
                model_path=model_path,
                future_dates=test_prepared,
                future_dates=future_dates,
                regressor_columns=regressor_columns
            )

            # Calculate performance metrics if we have actual values
            metrics = {}
            if 'y' in test_df.columns:
                from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

                y_true = test_df['y'].values
                y_pred = forecast['yhat'].values

                metrics = {
                    "mae": float(mean_absolute_error(y_true, y_pred)),
                    "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
                    "mape": float(np.mean(np.abs((y_true - y_pred) / y_true)) * 100),
                    "r2_score": float(r2_score(y_true, y_pred))
                }
            # Calculate performance metrics
            from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

            y_true = processed_test_data['y'].values
            y_pred = forecast['yhat'].values

            # Ensure arrays are the same length
            min_len = min(len(y_true), len(y_pred))
            y_true = y_true[:min_len]
            y_pred = y_pred[:min_len]

            metrics = {
                "mae": float(mean_absolute_error(y_true, y_pred)),
                "rmse": float(np.sqrt(mean_squared_error(y_true, y_pred))),
                "r2_score": float(r2_score(y_true, y_pred))
            }

            # Calculate MAPE safely
            non_zero_mask = y_true > 0.1
            if np.sum(non_zero_mask) > 0:
                mape = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
                metrics["mape"] = float(min(mape, 200))  # Cap at 200%
            else:
                metrics["mape"] = 100.0

            result = {
                "tenant_id": tenant_id,
                "product_name": product_name,
                "evaluation_metrics": metrics,
                "forecast_samples": len(forecast),
                "test_samples": len(processed_test_data),
                "prediction_samples": len(forecast),
                "test_period": {
                    "start": test_dataset.date_range.start.isoformat(),
                    "end": test_dataset.date_range.end.isoformat()
                },
                "evaluated_at": datetime.now().isoformat()
            }

@@ -244,6 +287,7 @@ class BakeryMLTrainer:
        if sales_df.empty:
            raise ValueError(f"No sales data provided for tenant {tenant_id}")

        # Handle quantity column mapping
        if 'quantity_sold' in sales_df.columns and 'quantity' not in sales_df.columns:
            sales_df['quantity'] = sales_df['quantity_sold']
            logger.info("Mapped 'quantity_sold' to 'quantity' column")
@@ -261,14 +305,17 @@ class BakeryMLTrainer:

        # Check for valid quantities
        if not sales_df['quantity'].dtype in ['int64', 'float64']:
            raise ValueError("Quantity column must be numeric")
        try:
            sales_df['quantity'] = pd.to_numeric(sales_df['quantity'], errors='coerce')
        except Exception:
            raise ValueError("Quantity column must be numeric")

    async def _process_all_products(self,
                                    sales_df: pd.DataFrame,
                                    weather_df: pd.DataFrame,
                                    traffic_df: pd.DataFrame,
                                    products: List[str]) -> Dict[str, pd.DataFrame]:
        """Process data for all products"""
        """Process data for all products using the data processor"""
        processed_data = {}

        for product_name in products:
@@ -278,7 +325,11 @@ class BakeryMLTrainer:
            # Filter sales data for this product
            product_sales = sales_df[sales_df['product_name'] == product_name].copy()

            # Process the product data
            if product_sales.empty:
                logger.warning(f"No sales data found for product: {product_name}")
                continue

            # Use data processor to prepare training data
            processed_product_data = await self.data_processor.prepare_training_data(
                sales_data=product_sales,
                weather_data=weather_df,
@@ -300,7 +351,7 @@ class BakeryMLTrainer:
                                tenant_id: str,
                                processed_data: Dict[str, pd.DataFrame],
                                job_id: str) -> Dict[str, Any]:
        """Train models for all processed products"""
        """Train models for all processed products using Prophet manager"""
        training_results = {}

        for product_name, product_data in processed_data.items():
@@ -313,11 +364,13 @@ class BakeryMLTrainer:
                    'status': 'skipped',
                    'reason': 'insufficient_data',
                    'data_points': len(product_data),
                    'min_required': settings.MIN_TRAINING_DATA_DAYS
                    'min_required': settings.MIN_TRAINING_DATA_DAYS,
                    'message': f'Need at least {settings.MIN_TRAINING_DATA_DAYS} data points, got {len(product_data)}'
                }
                logger.warning(f"Skipping {product_name}: insufficient data ({len(product_data)} < {settings.MIN_TRAINING_DATA_DAYS})")
                continue

            # Train the model
            # Train the model using Prophet manager
            model_info = await self.prophet_manager.train_bakery_model(
                tenant_id=tenant_id,
                product_name=product_name,
@@ -339,7 +392,8 @@ class BakeryMLTrainer:
                training_results[product_name] = {
                    'status': 'error',
                    'error_message': str(e),
                    'data_points': len(product_data) if product_data is not None else 0
                    'data_points': len(product_data) if product_data is not None else 0,
                    'failed_at': datetime.now().isoformat()
                }

        return training_results
@@ -360,17 +414,27 @@ class BakeryMLTrainer:

        if metrics_list and all(metrics_list):
            avg_metrics = {
                'avg_mae': np.mean([m.get('mae', 0) for m in metrics_list]),
                'avg_rmse': np.mean([m.get('rmse', 0) for m in metrics_list]),
                'avg_mape': np.mean([m.get('mape', 0) for m in metrics_list]),
                'avg_r2': np.mean([m.get('r2_score', 0) for m in metrics_list])
                'avg_mae': round(np.mean([m.get('mae', 0) for m in metrics_list]), 2),
                'avg_rmse': round(np.mean([m.get('rmse', 0) for m in metrics_list]), 2),
                'avg_mape': round(np.mean([m.get('mape', 0) for m in metrics_list]), 2),
                'avg_r2': round(np.mean([m.get('r2', 0) for m in metrics_list]), 3),
                'avg_improvement': round(np.mean([m.get('improvement_estimated', 0) for m in metrics_list]), 1)
            }

        # Calculate data quality insights
        data_points_list = [r.get('data_points', 0) for r in training_results.values()]

        return {
            'total_products': total_products,
            'successful_products': successful_products,
            'failed_products': failed_products,
            'skipped_products': skipped_products,
            'success_rate': round(successful_products / total_products * 100, 2) if total_products > 0 else 0,
            'average_metrics': avg_metrics
            'average_metrics': avg_metrics,
            'data_summary': {
                'total_data_points': sum(data_points_list),
                'avg_data_points_per_product': round(np.mean(data_points_list), 1) if data_points_list else 0,
                'min_data_points': min(data_points_list) if data_points_list else 0,
                'max_data_points': max(data_points_list) if data_points_list else 0
            }
        }
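
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this commit): driving the refactored trainer
# end to end. The async session factory and the prepared TrainingDataSet are
# assumptions here; BakeryMLTrainer and train_tenant_models come from this file.
import asyncio


async def run_training(tenant_id, training_dataset, session_factory):
    async with session_factory() as db_session:  # hypothetical async session factory
        trainer = BakeryMLTrainer(db_session=db_session)
        return await trainer.train_tenant_models(
            tenant_id=tenant_id,
            training_dataset=training_dataset,  # prepared by the training orchestrator
        )

# Example (assumed objects):
# result = asyncio.run(run_training("tenant-123", dataset, async_session_factory))
# ---------------------------------------------------------------------------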