# Listing metadata: 362 lines, 14 KiB, Python
"""
Product Categorization System

Classifies bakery products into categories for category-specific forecasting.
"""
|
|
|
|
from collections import Counter
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import structlog
|
|
|
|
# Module-level structlog logger used by ProductCategorizer for its log events.
logger = structlog.get_logger()
|
|
|
|
|
|
class ProductCategory(str, Enum):
    """Closed set of categories a bakery product can be assigned to.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value (for example ``ProductCategory.BREAD == "bread"``).
    """

    # Concrete product families.
    BREAD = "bread"
    PASTRIES = "pastries"
    CAKES = "cakes"
    DRINKS = "drinks"
    SEASONAL = "seasonal"
    SAVORY = "savory"

    # Fallback used when a product cannot be assigned to any family above.
    UNKNOWN = "unknown"
|
|
|
|
|
|
class ProductCategorizer:
    """
    Automatic product categorization based on product name and sales patterns.

    Categories have different characteristics:
    - BREAD: Daily staple, high volume, consistent demand, short shelf life (1 day)
    - PASTRIES: Morning peak, weekend boost, medium shelf life (2-3 days)
    - CAKES: Event-driven, weekends, advance orders, longer shelf life (3-5 days)
    - DRINKS: Weather-dependent, hot/cold seasonal patterns
    - SEASONAL: Holiday-specific (roscón, panettone, etc.)
    - SAVORY: Lunch peak, weekday focus

    Keyword matching on the product name is tried first; the sales-pattern
    heuristics are only consulted when no keyword matches.
    """

    # Pattern analysis is attempted only when the sales history has MORE rows
    # than this.
    MIN_SALES_ROWS = 30

    # Decision thresholds used by ``_categorize_by_sales_pattern``.
    SEASONAL_GINI_THRESHOLD = 0.6   # monthly Gini above this => SEASONAL
    STABLE_CV_THRESHOLD = 0.3       # coefficient of variation below this => stable
    MODERATE_WEEKEND_RATIO = 1.2    # weekend/weekday ratio above this => PASTRIES
    STRONG_WEEKEND_RATIO = 1.5      # weekend/weekday ratio above this => CAKES
    WEEKDAY_FOCUS_RATIO = 0.9       # weekend/weekday ratio below this => SAVORY

    def __init__(self):
        """Build the per-category keyword table used for name matching."""
        # Keywords for automatic classification (lowercase; matched as
        # substrings of the lowercased product name).
        self.category_keywords = {
            ProductCategory.BREAD: [
                'pan', 'baguette', 'hogaza', 'chapata', 'integral', 'centeno',
                'bread', 'loaf', 'barra', 'molde', 'candeal'
            ],
            ProductCategory.PASTRIES: [
                'croissant', 'napolitana', 'palmera', 'ensaimada', 'magdalena',
                'bollo', 'brioche', 'suizo', 'caracola', 'donut', 'berlina'
            ],
            ProductCategory.CAKES: [
                'tarta', 'pastel', 'bizcocho', 'cake', 'torta', 'milhojas',
                'saint honoré', 'selva negra', 'tres leches'
            ],
            ProductCategory.DRINKS: [
                'café', 'coffee', 'té', 'tea', 'zumo', 'juice', 'batido',
                'smoothie', 'refresco', 'agua', 'water'
            ],
            ProductCategory.SEASONAL: [
                'roscón', 'panettone', 'turrón', 'polvorón', 'mona de pascua',
                'huevo de pascua', 'buñuelo', 'torrija'
            ],
            ProductCategory.SAVORY: [
                'empanada', 'quiche', 'pizza', 'focaccia', 'salado', 'bocadillo',
                'sandwich', 'croqueta', 'hojaldre salado'
            ]
        }

    def categorize_product(
        self,
        product_name: str,
        product_id: Optional[str] = None,
        sales_data: Optional[pd.DataFrame] = None
    ) -> "ProductCategory":
        """
        Categorize a product based on name and optional sales patterns.

        Args:
            product_name: Product name
            product_id: Optional product ID (accepted for API compatibility;
                not used by the classification itself)
            sales_data: Optional historical sales data for pattern analysis.
                Only used when no keyword matches and it has more than
                ``MIN_SALES_ROWS`` rows.

        Returns:
            ProductCategory enum (``UNKNOWN`` when neither strategy decides)
        """
        # First try keyword matching
        category = self._categorize_by_keywords(product_name)
        if category != ProductCategory.UNKNOWN:
            logger.info("Product categorized by keywords",
                        product=product_name,
                        category=category.value)
            return category

        # If no keyword match and we have enough sales data, analyze patterns
        if sales_data is not None and len(sales_data) > self.MIN_SALES_ROWS:
            category = self._categorize_by_sales_pattern(product_name, sales_data)
            # Only report success when the pattern analysis actually decided;
            # an UNKNOWN result falls through to the warning below.
            if category != ProductCategory.UNKNOWN:
                logger.info("Product categorized by sales pattern",
                            product=product_name,
                            category=category.value)
                return category

        logger.warning("Could not categorize product, using UNKNOWN",
                       product=product_name)
        return ProductCategory.UNKNOWN

    def _categorize_by_keywords(self, product_name: str) -> "ProductCategory":
        """
        Categorize by matching keywords in the product name.

        Every keyword of every category is tested as a substring of the
        lowercased name, and the LONGEST matching keyword decides. This keeps
        short keywords from shadowing longer ones that contain them (e.g.
        'pan' inside 'empanada' or 'panettone'). On equal length the category
        listed first in ``category_keywords`` wins.

        Returns ``ProductCategory.UNKNOWN`` for an empty/None name or when
        nothing matches.
        """
        if not product_name:
            return ProductCategory.UNKNOWN

        name_lower = product_name.lower()
        best_category = None
        best_length = 0

        for category, keywords in self.category_keywords.items():
            for keyword in keywords:
                # Strict '>' keeps the earliest category on ties.
                if len(keyword) > best_length and keyword in name_lower:
                    best_category = category
                    best_length = len(keyword)

        if best_category is None:
            return ProductCategory.UNKNOWN
        return best_category

    def _categorize_by_sales_pattern(
        self,
        product_name: str,
        sales_data: pd.DataFrame
    ) -> "ProductCategory":
        """
        Categorize by analyzing sales patterns.

        Requires 'date' and 'quantity' columns; returns UNKNOWN otherwise.
        The input frame is copied and never modified.

        Rules, evaluated in this order:
        - SEASONAL: monthly Gini concentration > SEASONAL_GINI_THRESHOLD
        - BREAD: coefficient of variation < STABLE_CV_THRESHOLD and
          weekend/weekday ratio < MODERATE_WEEKEND_RATIO
        - CAKES: weekend/weekday ratio > STRONG_WEEKEND_RATIO
        - PASTRIES: weekend/weekday ratio > MODERATE_WEEKEND_RATIO
        - SAVORY: weekend/weekday ratio < WEEKDAY_FOCUS_RATIO
        - otherwise UNKNOWN

        DRINKS is never produced here: no temperature data is available to
        this method.

        Any exception raised during the analysis is logged and converted to
        UNKNOWN (best-effort classification).
        """
        try:
            # Ensure we have required columns
            if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
                return ProductCategory.UNKNOWN

            sales = sales_data.copy()
            sales['date'] = pd.to_datetime(sales['date'])
            sales['day_of_week'] = sales['date'].dt.dayofweek
            sales['month'] = sales['date'].dt.month
            sales['is_weekend'] = sales['day_of_week'].isin([5, 6])

            # Calculate pattern metrics
            quantity = sales['quantity']
            weekend_avg = quantity[sales['is_weekend']].mean()
            weekday_avg = quantity[~sales['is_weekend']].mean()
            overall_avg = quantity.mean()
            cv = quantity.std() / overall_avg if overall_avg > 0 else 0

            # Weekend ratio. If there are no weekend rows weekend_avg is NaN,
            # the ratio is NaN, and every ratio comparison below is False.
            weekend_ratio = weekend_avg / weekday_avg if weekday_avg > 0 else 1.0

            # Seasonal concentration (Gini coefficient across months)
            seasonal_concentration = self._seasonal_concentration(sales)

            # Decision rules based on patterns
            if seasonal_concentration > self.SEASONAL_GINI_THRESHOLD:
                # High concentration in specific months = seasonal
                return ProductCategory.SEASONAL

            if cv < self.STABLE_CV_THRESHOLD and weekend_ratio < self.MODERATE_WEEKEND_RATIO:
                # Low variance, consistent daily = bread
                return ProductCategory.BREAD

            if weekend_ratio > self.STRONG_WEEKEND_RATIO:
                # Strong weekend boost = cakes
                return ProductCategory.CAKES

            if weekend_ratio > self.MODERATE_WEEKEND_RATIO:
                # Moderate weekend boost = pastries
                return ProductCategory.PASTRIES

            if weekend_ratio < self.WEEKDAY_FOCUS_RATIO:
                # Weekday focus = savory
                return ProductCategory.SAVORY

            return ProductCategory.UNKNOWN

        except Exception as e:
            # Deliberate best-effort: malformed sales data must not abort
            # categorization, so log and fall back to UNKNOWN.
            logger.error(f"Error analyzing sales pattern: {e}", product=product_name)
            return ProductCategory.UNKNOWN

    def _seasonal_concentration(self, sales: pd.DataFrame) -> float:
        """
        Return the Gini coefficient of total quantity per calendar month.

        Expects ``sales`` to carry a datetime 'date' column, an integer
        'month' column (1-12) and a numeric 'quantity' column.

        Calendar months that fall inside the observed date span but have no
        rows are counted as zero sales. Without this, a product sold only in
        one month would yield a single-element distribution (Gini 0) and
        could never be detected as seasonal.
        """
        monthly_sales = sales.groupby('month')['quantity'].sum()
        covered_months = self._months_in_span(sales['date'].min(), sales['date'].max())
        monthly_sales = monthly_sales.reindex(covered_months, fill_value=0)
        return self._gini_coefficient(monthly_sales.to_numpy())

    @staticmethod
    def _months_in_span(first, last) -> List[int]:
        """
        Return the sorted calendar months (1-12) touched by [first, last].

        ``first``/``last`` are date-like objects exposing ``year`` and
        ``month``. Returns an empty list if either bound is missing (NaT/None).
        Spans of twelve months or more cover every calendar month.
        """
        if pd.isna(first) or pd.isna(last):
            return []

        span = (last.year - first.year) * 12 + (last.month - first.month) + 1
        return sorted(
            {(first.month - 1 + offset) % 12 + 1 for offset in range(min(span, 12))}
        )

    def _gini_coefficient(self, values: np.ndarray) -> float:
        """
        Calculate the Gini coefficient for concentration measurement.

        Returns 0.0 for empty input and when the total is zero, negative or
        not finite (the coefficient is undefined there), instead of NaN.
        For non-negative input the result lies in [0, (n - 1) / n].
        """
        sorted_values = np.sort(np.asarray(values, dtype=float))
        n = sorted_values.size
        if n == 0:
            return 0.0

        total = sorted_values.sum()
        if not np.isfinite(total) or total <= 0:
            return 0.0

        # Gini coefficient formula on ascending-sorted values:
        # G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n
        ranks = np.arange(1, n + 1)
        return float(2.0 * np.sum(ranks * sorted_values) / (n * total) - (n + 1) / n)

    def get_category_characteristics(self, category: "ProductCategory") -> Dict[str, Any]:
        """
        Get forecasting characteristics for a category.

        Returns hyperparameters and settings specific to the category
        (shelf life, demand factors and a ``prophet_params`` dict). A category
        not present in the table gets the UNKNOWN defaults. A fresh dict is
        built on every call, so callers may mutate the result freely.
        """
        characteristics = {
            ProductCategory.BREAD: {
                "shelf_life_days": 1,
                "demand_stability": "high",
                "seasonality_strength": "low",
                "weekend_factor": 0.95,  # Slightly lower on weekends
                "holiday_factor": 0.7,  # Much lower on holidays
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.01,  # Very stable
                    "seasonality_prior_scale": 5.0
                }
            },
            ProductCategory.PASTRIES: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.3,  # Boost on weekends
                "holiday_factor": 1.1,  # Slight boost on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            },
            ProductCategory.CAKES: {
                "shelf_life_days": 4,
                "demand_stability": "low",
                "seasonality_strength": "high",
                "weekend_factor": 2.0,  # Large weekend boost
                "holiday_factor": 1.5,  # Holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.1,  # More flexible
                    "seasonality_prior_scale": 15.0
                }
            },
            ProductCategory.DRINKS: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "high",
                "weekend_factor": 1.1,
                "holiday_factor": 1.2,
                "weather_sensitivity": "very_high",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.08,
                    "seasonality_prior_scale": 12.0
                }
            },
            ProductCategory.SEASONAL: {
                "shelf_life_days": 7,
                "demand_stability": "very_low",
                "seasonality_strength": "very_high",
                "weekend_factor": 1.2,
                "holiday_factor": 3.0,  # Massive holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": False,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.2,  # Very flexible
                    "seasonality_prior_scale": 20.0
                }
            },
            ProductCategory.SAVORY: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "low",
                "weekend_factor": 0.8,  # Lower on weekends
                "holiday_factor": 0.6,  # Much lower on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.03,
                    "seasonality_prior_scale": 7.0
                }
            },
            ProductCategory.UNKNOWN: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.0,
                "holiday_factor": 1.0,
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            }
        }

        return characteristics.get(category, characteristics[ProductCategory.UNKNOWN])

    def batch_categorize(
        self,
        products: List[Dict[str, Any]],
        sales_data: Optional[pd.DataFrame] = None
    ) -> Dict[str, "ProductCategory"]:
        """
        Categorize multiple products at once.

        Args:
            products: List of dicts with 'id' and 'name' keys
            sales_data: Optional sales data with 'inventory_product_id' column.
                Ignored when that column is absent.

        Returns:
            Dict mapping product_id to category
        """
        # Split the sales history per product once, instead of rescanning the
        # whole frame for every product. sort=False avoids ordering the keys,
        # which is not needed for lookup.
        sales_by_product = {}
        if sales_data is not None and 'inventory_product_id' in sales_data.columns:
            sales_by_product = {
                key: frame
                for key, frame in sales_data.groupby('inventory_product_id', sort=False)
            }

        results = {}
        for product in products:
            product_id = product.get('id')
            product_name = product.get('name', '')

            results[product_id] = self.categorize_product(
                product_name=product_name,
                product_id=product_id,
                # None when this product has no sales rows
                sales_data=sales_by_product.get(product_id)
            )

        # Plain str -> int mapping (most frequent first) so the log event
        # serializes cleanly.
        category_counts = Counter(category.value for category in results.values())
        logger.info("Batch categorization complete",
                    total_products=len(products),
                    categories=dict(category_counts.most_common()))

        return results
|