# Source: bakery-ia/services/training/app/ml/product_categorizer.py
# Repository listing: 362 lines, 14 KiB, Python (timestamp shown: 2025-11-05 13:34:56 +01:00)
"""
Product Categorization System
Classifies bakery products into categories for category-specific forecasting
"""
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import structlog
logger = structlog.get_logger()
class ProductCategory(str, Enum):
    """Product categories for bakery items.

    The ``str`` mix-in makes every member compare equal to its lowercase
    string value (e.g. ``ProductCategory.BREAD == "bread"``).
    """

    BREAD = "bread"
    PASTRIES = "pastries"
    CAKES = "cakes"
    DRINKS = "drinks"
    SEASONAL = "seasonal"
    SAVORY = "savory"
    UNKNOWN = "unknown"


class ProductCategorizer:
    """
    Automatic product categorization based on product name and sales patterns.

    Categories have different characteristics:
    - BREAD: Daily staple, high volume, consistent demand, short shelf life (1 day)
    - PASTRIES: Morning peak, weekend boost, medium shelf life (2-3 days)
    - CAKES: Event-driven, weekends, advance orders, longer shelf life (3-5 days)
    - DRINKS: Weather-dependent, hot/cold seasonal patterns
    - SEASONAL: Holiday-specific (roscón, panettone, etc.)
    - SAVORY: Lunch peak, weekday focus
    """

    def __init__(self) -> None:
        """Build the keyword table used for name-based classification."""
        # Keywords for automatic classification. Dict insertion order is the
        # priority order: when several categories match a name, the one
        # declared first wins (see _categorize_by_keywords for the one
        # exception concerning keywords nested inside longer keywords).
        self.category_keywords = {
            ProductCategory.BREAD: [
                'pan', 'baguette', 'hogaza', 'chapata', 'integral', 'centeno',
                'bread', 'loaf', 'barra', 'molde', 'candeal'
            ],
            ProductCategory.PASTRIES: [
                'croissant', 'napolitana', 'palmera', 'ensaimada', 'magdalena',
                'bollo', 'brioche', 'suizo', 'caracola', 'donut', 'berlina'
            ],
            ProductCategory.CAKES: [
                'tarta', 'pastel', 'bizcocho', 'cake', 'torta', 'milhojas',
                'saint honoré', 'selva negra', 'tres leches'
            ],
            ProductCategory.DRINKS: [
                # NOTE(review): this entry used to be an empty string, which
                # is a substring of every name and therefore classified
                # everything as DRINKS. 'té' is presumed to be the intended
                # Spanish counterpart of 'tea' -- confirm against upstream.
                'café', 'coffee', 'té', 'tea', 'zumo', 'juice', 'batido',
                'smoothie', 'refresco', 'agua', 'water'
            ],
            ProductCategory.SEASONAL: [
                'roscón', 'panettone', 'turrón', 'polvorón', 'mona de pascua',
                'huevo de pascua', 'buñuelo', 'torrija'
            ],
            ProductCategory.SAVORY: [
                'empanada', 'quiche', 'pizza', 'focaccia', 'salado', 'bocadillo',
                'sandwich', 'croqueta', 'hojaldre salado'
            ]
        }

    def categorize_product(
        self,
        product_name: str,
        product_id: Optional[str] = None,
        sales_data: Optional[pd.DataFrame] = None
    ) -> ProductCategory:
        """
        Categorize a product based on name and optional sales patterns.

        Keyword matching on the name is tried first. Only when it yields
        UNKNOWN and more than 30 rows of sales history are supplied is the
        sales-pattern analysis used.

        Args:
            product_name: Product name
            product_id: Optional product ID (accepted for API symmetry; not
                used by the classification itself)
            sales_data: Optional historical sales data for pattern analysis;
                needs 'date' and 'quantity' columns to be useful

        Returns:
            ProductCategory enum (UNKNOWN when nothing matched)
        """
        # First try keyword matching
        category = self._categorize_by_keywords(product_name)
        if category != ProductCategory.UNKNOWN:
            logger.info("Product categorized by keywords",
                        product=product_name,
                        category=category.value)
            return category

        # If no keyword match and we have sales data, analyze patterns
        if sales_data is not None and len(sales_data) > 30:
            category = self._categorize_by_sales_pattern(product_name, sales_data)
            logger.info("Product categorized by sales pattern",
                        product=product_name,
                        category=category.value)
            return category

        logger.warning("Could not categorize product, using UNKNOWN",
                       product=product_name)
        return ProductCategory.UNKNOWN

    def _categorize_by_keywords(self, product_name: str) -> ProductCategory:
        """
        Categorize by matching keywords in the (lower-cased) product name.

        Matching is substring based. Categories are tried in the declaration
        order of ``self.category_keywords``; the first category owning a
        usable hit wins. A hit is *not* usable when it lies strictly inside a
        longer keyword hit, so 'pan' does not claim "empanada" or
        "panettone" for BREAD while still claiming "pan de agua".

        Args:
            product_name: Product name; ``None`` or empty yields UNKNOWN

        Returns:
            The matched category, or ProductCategory.UNKNOWN
        """
        if not product_name:
            return ProductCategory.UNKNOWN
        name = product_name.lower()

        # Collect every keyword occurrence as (start, end, category). The list
        # is naturally ordered by category priority, then keyword order.
        hits: List[Tuple[int, int, ProductCategory]] = []
        for category, keywords in self.category_keywords.items():
            for keyword in keywords:
                if not keyword:
                    # A blank keyword would match every name; ignore it.
                    continue
                start = name.find(keyword)
                while start != -1:
                    hits.append((start, start + len(keyword), category))
                    start = name.find(keyword, start + 1)

        for start, end, category in hits:
            shadowed = any(
                other_start <= start
                and end <= other_end
                and (other_end - other_start) > (end - start)
                for other_start, other_end, _ in hits
            )
            if not shadowed:
                return category
        return ProductCategory.UNKNOWN

    def _categorize_by_sales_pattern(
        self,
        product_name: str,
        sales_data: pd.DataFrame
    ) -> ProductCategory:
        """
        Categorize by analyzing sales patterns.

        Decision rules, evaluated in this order:
        - SEASONAL: Gini concentration of monthly totals > 0.6
        - BREAD: coefficient of variation < 0.3 and weekend ratio < 1.2
        - CAKES: weekend ratio > 1.5
        - PASTRIES: weekend ratio > 1.2
        - SAVORY: weekend ratio < 0.9
        - otherwise UNKNOWN

        Args:
            product_name: Product name (used for error logging only)
            sales_data: Frame with a 'date' column parseable by
                ``pd.to_datetime`` and a numeric 'quantity' column. The
                caller's frame is not modified.

        Returns:
            The inferred category; UNKNOWN when required columns are missing,
            when there are no positive sales to analyze, or when the analysis
            raises.
        """
        try:
            # Ensure we have required columns
            if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
                return ProductCategory.UNKNOWN

            sales_data = sales_data.copy()
            sales_data['date'] = pd.to_datetime(sales_data['date'])
            sales_data['day_of_week'] = sales_data['date'].dt.dayofweek
            sales_data['month'] = sales_data['date'].dt.month
            # pandas dayofweek: Monday=0 ... Sunday=6
            sales_data['is_weekend'] = sales_data['day_of_week'].isin([5, 6])

            # Calculate pattern metrics
            weekend_avg = sales_data[sales_data['is_weekend']]['quantity'].mean()
            weekday_avg = sales_data[~sales_data['is_weekend']]['quantity'].mean()
            overall_avg = sales_data['quantity'].mean()

            # Without positive average sales there is no pattern to read.
            # Previously this case produced cv == 0 and a ratio of 1.0, which
            # mislabelled never-sold products as BREAD. `not x > 0` is also
            # true for NaN.
            if not overall_avg > 0:
                return ProductCategory.UNKNOWN

            cv = sales_data['quantity'].std() / overall_avg

            # Weekend ratio (neutral 1.0 when there is no positive weekday mean)
            weekend_ratio = weekend_avg / weekday_avg if weekday_avg > 0 else 1.0

            # Seasonal concentration (Gini coefficient for months)
            monthly_sales = sales_data.groupby('month')['quantity'].sum()
            seasonal_concentration = self._gini_coefficient(monthly_sales.values)

            # Decision rules based on patterns
            if seasonal_concentration > 0.6:
                # High concentration in specific months = seasonal
                return ProductCategory.SEASONAL
            elif cv < 0.3 and weekend_ratio < 1.2:
                # Low variance, consistent daily = bread
                return ProductCategory.BREAD
            elif weekend_ratio > 1.5:
                # Strong weekend boost = cakes
                return ProductCategory.CAKES
            elif weekend_ratio > 1.2:
                # Moderate weekend boost = pastries
                return ProductCategory.PASTRIES
            elif weekend_ratio < 0.9:
                # Weekday focus = savory
                return ProductCategory.SAVORY
            else:
                return ProductCategory.UNKNOWN
        except Exception as e:
            # Deliberate best-effort: a failed analysis degrades to UNKNOWN
            # instead of aborting the caller's categorization run.
            logger.error(f"Error analyzing sales pattern: {e}",
                         product=product_name)
            return ProductCategory.UNKNOWN

    def _gini_coefficient(self, values: np.ndarray) -> float:
        """
        Calculate Gini coefficient for concentration measurement.

        Args:
            values: 1-D array-like of non-negative amounts

        Returns:
            0.0 for perfectly even values, approaching 1.0 as the total
            concentrates in a single entry. Returns 0.0 for empty input and
            when the total is zero, negative or not finite (the coefficient
            is undefined there and the formula would divide by zero).
        """
        values = np.asarray(values, dtype=float)
        n = values.size
        if n == 0:
            return 0.0

        total = values.sum()
        if not np.isfinite(total) or total <= 0:
            return 0.0

        sorted_values = np.sort(values)
        ranks = np.arange(1, n + 1)
        # Gini coefficient formula
        return float((2 * np.sum(ranks * sorted_values)) / (n * total) - (n + 1) / n)

    def get_category_characteristics(self, category: ProductCategory) -> Dict[str, Any]:
        """
        Get forecasting characteristics for a category.

        Returns hyperparameters and settings specific to the category. The
        table is rebuilt on every call, so callers may mutate the returned
        dict freely. Anything not present in the table falls back to the
        UNKNOWN settings.

        Args:
            category: Category to look up

        Returns:
            Dict with shelf life, demand/seasonality descriptors, weekend and
            holiday factors, weather sensitivity and a 'prophet_params' dict
        """
        characteristics = {
            ProductCategory.BREAD: {
                "shelf_life_days": 1,
                "demand_stability": "high",
                "seasonality_strength": "low",
                "weekend_factor": 0.95,  # Slightly lower on weekends
                "holiday_factor": 0.7,  # Much lower on holidays
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.01,  # Very stable
                    "seasonality_prior_scale": 5.0
                }
            },
            ProductCategory.PASTRIES: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.3,  # Boost on weekends
                "holiday_factor": 1.1,  # Slight boost on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            },
            ProductCategory.CAKES: {
                "shelf_life_days": 4,
                "demand_stability": "low",
                "seasonality_strength": "high",
                "weekend_factor": 2.0,  # Large weekend boost
                "holiday_factor": 1.5,  # Holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.1,  # More flexible
                    "seasonality_prior_scale": 15.0
                }
            },
            ProductCategory.DRINKS: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "high",
                "weekend_factor": 1.1,
                "holiday_factor": 1.2,
                "weather_sensitivity": "very_high",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.08,
                    "seasonality_prior_scale": 12.0
                }
            },
            ProductCategory.SEASONAL: {
                "shelf_life_days": 7,
                "demand_stability": "very_low",
                "seasonality_strength": "very_high",
                "weekend_factor": 1.2,
                "holiday_factor": 3.0,  # Massive holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": False,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.2,  # Very flexible
                    "seasonality_prior_scale": 20.0
                }
            },
            ProductCategory.SAVORY: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "low",
                "weekend_factor": 0.8,  # Lower on weekends
                "holiday_factor": 0.6,  # Much lower on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.03,
                    "seasonality_prior_scale": 7.0
                }
            },
            ProductCategory.UNKNOWN: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.0,
                "holiday_factor": 1.0,
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            }
        }
        return characteristics.get(category, characteristics[ProductCategory.UNKNOWN])

    def batch_categorize(
        self,
        products: List[Dict[str, Any]],
        sales_data: Optional[pd.DataFrame] = None
    ) -> Dict[str, ProductCategory]:
        """
        Categorize multiple products at once.

        Args:
            products: List of dicts with 'id' and 'name' keys
            sales_data: Optional sales data with 'inventory_product_id' column;
                rows are matched to a product by equality with its 'id'

        Returns:
            Dict mapping product_id to category. Products sharing the same
            'id' (including a missing one, i.e. ``None``) overwrite each other.
        """
        has_product_sales = (
            sales_data is not None
            and 'inventory_product_id' in sales_data.columns
        )

        results: Dict[str, ProductCategory] = {}
        for product in products:
            product_id = product.get('id')
            # A present-but-None name is treated like a missing one.
            product_name = product.get('name') or ''

            # Filter sales data for this product if available. Boolean
            # indexing already yields a new frame and the pattern analysis
            # copies before mutating, so no extra copy is needed here.
            product_sales = None
            if has_product_sales:
                product_sales = sales_data[
                    sales_data['inventory_product_id'] == product_id
                ]

            results[product_id] = self.categorize_product(
                product_name=product_name,
                product_id=product_id,
                sales_data=product_sales
            )

        # Plain str -> int mapping keeps the log event serializable.
        category_counts: Dict[str, int] = {}
        for category in results.values():
            category_counts[category.value] = category_counts.get(category.value, 0) + 1

        logger.info("Batch categorization complete",
                    total_products=len(products),
                    categories=category_counts)
        return results