"""
Product Categorization System

Classifies bakery products into categories for category-specific forecasting.
"""

from collections import Counter
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import structlog

logger = structlog.get_logger()


class ProductCategory(str, Enum):
    """Product categories for bakery items"""
    BREAD = "bread"
    PASTRIES = "pastries"
    CAKES = "cakes"
    DRINKS = "drinks"
    SEASONAL = "seasonal"
    SAVORY = "savory"
    UNKNOWN = "unknown"


class ProductCategorizer:
    """
    Automatic product categorization based on product name and sales patterns.

    Categories have different characteristics:
    - BREAD: Daily staple, high volume, consistent demand, short shelf life (1 day)
    - PASTRIES: Morning peak, weekend boost, medium shelf life (2-3 days)
    - CAKES: Event-driven, weekends, advance orders, longer shelf life (3-5 days)
    - DRINKS: Weather-dependent, hot/cold seasonal patterns
    - SEASONAL: Holiday-specific (roscón, panettone, etc.)
    - SAVORY: Lunch peak, weekday focus
    """

    # Sales-pattern analysis is only attempted when there are MORE rows than this.
    MIN_PATTERN_ROWS = 30

    def __init__(self):
        # Keywords for automatic classification (matched as lowercase substrings)
        self.category_keywords = {
            ProductCategory.BREAD: [
                'pan', 'baguette', 'hogaza', 'chapata', 'integral', 'centeno',
                'bread', 'loaf', 'barra', 'molde', 'candeal'
            ],
            ProductCategory.PASTRIES: [
                'croissant', 'napolitana', 'palmera', 'ensaimada', 'magdalena',
                'bollo', 'brioche', 'suizo', 'caracola', 'donut', 'berlina'
            ],
            ProductCategory.CAKES: [
                'tarta', 'pastel', 'bizcocho', 'cake', 'torta', 'milhojas',
                'saint honoré', 'selva negra', 'tres leches'
            ],
            ProductCategory.DRINKS: [
                'café', 'coffee', 'té', 'tea', 'zumo', 'juice', 'batido',
                'smoothie', 'refresco', 'agua', 'water'
            ],
            ProductCategory.SEASONAL: [
                'roscón', 'panettone', 'turrón', 'polvorón', 'mona de pascua',
                'huevo de pascua', 'buñuelo', 'torrija'
            ],
            ProductCategory.SAVORY: [
                'empanada', 'quiche', 'pizza', 'focaccia', 'salado', 'bocadillo',
                'sandwich', 'croqueta', 'hojaldre salado'
            ]
        }

    def categorize_product(
        self,
        product_name: str,
        product_id: Optional[str] = None,
        sales_data: Optional[pd.DataFrame] = None
    ) -> ProductCategory:
        """
        Categorize a product based on name and optional sales patterns.

        Keyword matching on the name is tried first. Sales-pattern analysis is
        only used when no keyword matches and ``sales_data`` has more than
        ``MIN_PATTERN_ROWS`` rows.

        Args:
            product_name: Product name
            product_id: Optional product ID (not used by the classification)
            sales_data: Optional historical sales data for pattern analysis

        Returns:
            ProductCategory enum (UNKNOWN when neither strategy decides)
        """
        # First try keyword matching
        category = self._categorize_by_keywords(product_name)
        if category != ProductCategory.UNKNOWN:
            logger.info("Product categorized by keywords",
                        product=product_name, category=category.value)
            return category

        # If no keyword match and we have enough sales data, analyze patterns
        if sales_data is not None and len(sales_data) > self.MIN_PATTERN_ROWS:
            category = self._categorize_by_sales_pattern(product_name, sales_data)
            if category != ProductCategory.UNKNOWN:
                logger.info("Product categorized by sales pattern",
                            product=product_name, category=category.value)
                return category
            # An inconclusive pattern analysis falls through to the warning
            # below instead of being reported as a successful categorization.

        logger.warning("Could not categorize product, using UNKNOWN",
                       product=product_name)
        return ProductCategory.UNKNOWN

    def _categorize_by_keywords(self, product_name: str) -> ProductCategory:
        """
        Categorize by matching keywords in the product name.

        Every keyword is tested as a case-insensitive substring and the
        LONGEST matching keyword decides the category; ties keep the category
        declared first in ``category_keywords``. Picking the longest match
        stops a short keyword such as 'pan' from shadowing more specific ones
        that contain it ('panettone', 'empanada').

        Args:
            product_name: Product name; ``None`` or empty yields UNKNOWN

        Returns:
            The matched ProductCategory, or UNKNOWN when nothing matches
        """
        if not product_name:
            return ProductCategory.UNKNOWN

        name_lower = product_name.lower()
        best_category = ProductCategory.UNKNOWN
        best_length = 0

        for category, keywords in self.category_keywords.items():
            for keyword in keywords:
                # Strict '>' keeps the earliest-declared category on ties.
                if len(keyword) > best_length and keyword in name_lower:
                    best_category = category
                    best_length = len(keyword)

        return best_category

    def _categorize_by_sales_pattern(
        self,
        product_name: str,
        sales_data: pd.DataFrame
    ) -> ProductCategory:
        """
        Categorize by analyzing sales patterns.

        Patterns:
        - BREAD: Consistent daily sales, low variance
        - PASTRIES: Weekend boost, morning peak
        - CAKES: Weekend spike, event correlation
        - DRINKS: Temperature correlation
        - SEASONAL: Concentrated in specific months
        - SAVORY: Weekday focus, lunch peak

        Only the weekend/weekday ratio, the coefficient of variation and the
        monthly concentration are actually computed here, so DRINKS is never
        returned by this method.

        Args:
            product_name: Product name (used for log context only)
            sales_data: DataFrame with 'date' and 'quantity' columns

        Returns:
            ProductCategory enum; UNKNOWN when the required columns are
            missing, no rule applies, or the analysis raises
        """
        try:
            # Ensure we have required columns
            if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
                return ProductCategory.UNKNOWN

            # Work on a copy so the caller's frame is not given extra columns
            sales_data = sales_data.copy()
            sales_data['date'] = pd.to_datetime(sales_data['date'])
            sales_data['day_of_week'] = sales_data['date'].dt.dayofweek
            sales_data['month'] = sales_data['date'].dt.month
            sales_data['is_weekend'] = sales_data['day_of_week'].isin([5, 6])

            # Calculate pattern metrics
            weekend_avg = sales_data[sales_data['is_weekend']]['quantity'].mean()
            weekday_avg = sales_data[~sales_data['is_weekend']]['quantity'].mean()
            overall_avg = sales_data['quantity'].mean()
            cv = sales_data['quantity'].std() / overall_avg if overall_avg > 0 else 0

            # Weekend ratio
            weekend_ratio = weekend_avg / weekday_avg if weekday_avg > 0 else 1.0

            # Seasonal concentration (Gini coefficient for months)
            # NOTE(review): only months present in the data are grouped, so a
            # product sold in a single month yields one value and a Gini of 0 —
            # confirm whether absent months should count as zero sales.
            monthly_sales = sales_data.groupby('month')['quantity'].sum()
            seasonal_concentration = self._gini_coefficient(monthly_sales.values)

            # Decision rules based on patterns (order matters)
            if seasonal_concentration > 0.6:
                # High concentration in specific months = seasonal
                return ProductCategory.SEASONAL
            elif cv < 0.3 and weekend_ratio < 1.2:
                # Low variance, consistent daily = bread
                return ProductCategory.BREAD
            elif weekend_ratio > 1.5:
                # Strong weekend boost = cakes
                return ProductCategory.CAKES
            elif weekend_ratio > 1.2:
                # Moderate weekend boost = pastries
                return ProductCategory.PASTRIES
            elif weekend_ratio < 0.9:
                # Weekday focus = savory
                return ProductCategory.SAVORY
            else:
                return ProductCategory.UNKNOWN

        except Exception as e:
            # Deliberate best-effort: a failed analysis degrades to UNKNOWN
            logger.error(f"Error analyzing sales pattern: {e}", product=product_name)
            return ProductCategory.UNKNOWN

    def _gini_coefficient(self, values: np.ndarray) -> float:
        """
        Calculate Gini coefficient for concentration measurement.

        Args:
            values: Numeric values (e.g. total quantity per month)

        Returns:
            The Gini coefficient, or 0.0 when ``values`` is empty or its sum
            is not positive (the formula would otherwise divide by zero)
        """
        if len(values) == 0:
            return 0.0

        sorted_values = np.sort(np.asarray(values, dtype=float))
        n = len(sorted_values)
        total = sorted_values.sum()
        if total <= 0:
            return 0.0

        # Gini coefficient formula
        ranks = np.arange(1, n + 1)
        return float(2 * np.sum(ranks * sorted_values) / (n * total) - (n + 1) / n)

    def get_category_characteristics(self, category: ProductCategory) -> Dict[str, Any]:
        """
        Get forecasting characteristics for a category.

        Returns hyperparameters and settings specific to the category. The
        table is rebuilt on every call, so the returned dict is not shared
        between callers. Categories missing from the table fall back to the
        UNKNOWN entry.
        """
        characteristics = {
            ProductCategory.BREAD: {
                "shelf_life_days": 1,
                "demand_stability": "high",
                "seasonality_strength": "low",
                "weekend_factor": 0.95,  # Slightly lower on weekends
                "holiday_factor": 0.7,  # Much lower on holidays
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.01,  # Very stable
                    "seasonality_prior_scale": 5.0
                }
            },
            ProductCategory.PASTRIES: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.3,  # Boost on weekends
                "holiday_factor": 1.1,  # Slight boost on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            },
            ProductCategory.CAKES: {
                "shelf_life_days": 4,
                "demand_stability": "low",
                "seasonality_strength": "high",
                "weekend_factor": 2.0,  # Large weekend boost
                "holiday_factor": 1.5,  # Holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.1,  # More flexible
                    "seasonality_prior_scale": 15.0
                }
            },
            ProductCategory.DRINKS: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "high",
                "weekend_factor": 1.1,
                "holiday_factor": 1.2,
                "weather_sensitivity": "very_high",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.08,
                    "seasonality_prior_scale": 12.0
                }
            },
            ProductCategory.SEASONAL: {
                "shelf_life_days": 7,
                "demand_stability": "very_low",
                "seasonality_strength": "very_high",
                "weekend_factor": 1.2,
                "holiday_factor": 3.0,  # Massive holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": False,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.2,  # Very flexible
                    "seasonality_prior_scale": 20.0
                }
            },
            ProductCategory.SAVORY: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "low",
                "weekend_factor": 0.8,  # Lower on weekends
                "holiday_factor": 0.6,  # Much lower on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.03,
                    "seasonality_prior_scale": 7.0
                }
            },
            ProductCategory.UNKNOWN: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.0,
                "holiday_factor": 1.0,
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            }
        }

        return characteristics.get(category, characteristics[ProductCategory.UNKNOWN])

    def batch_categorize(
        self,
        products: List[Dict[str, Any]],
        sales_data: Optional[pd.DataFrame] = None
    ) -> Dict[str, ProductCategory]:
        """
        Categorize multiple products at once.

        Args:
            products: List of dicts with 'id' and 'name' keys
            sales_data: Optional sales data with 'inventory_product_id' column;
                ignored when that column is absent

        Returns:
            Dict mapping product_id to category
        """
        results = {}
        has_product_sales = (
            sales_data is not None
            and 'inventory_product_id' in sales_data.columns
        )

        for product in products:
            product_id = product.get('id')
            product_name = product.get('name', '')

            # Filter sales data for this product if available
            product_sales = None
            if has_product_sales:
                product_sales = sales_data[
                    sales_data['inventory_product_id'] == product_id
                ]

            results[product_id] = self.categorize_product(
                product_name=product_name,
                product_id=product_id,
                sales_data=product_sales
            )

        # Plain str -> int summary keeps the log payload free of enum/numpy types
        category_counts = dict(Counter(category.value for category in results.values()))
        logger.info("Batch categorization complete",
                    total_products=len(products),
                    categories=category_counts)

        return results