# bakery-ia/services/training/app/ml/product_categorizer.py

"""
Product Categorization System
Classifies bakery products into categories for category-specific forecasting
"""

from collections import Counter
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import structlog

logger = structlog.get_logger()


class ProductCategory(str, Enum):
    """
    Product categories for bakery items.

    The ``str`` mixin makes every member a string instance that compares
    equal to its value (e.g. ``ProductCategory.BREAD == "bread"``).
    """
    BREAD = "bread"        # Daily staple with consistent demand
    PASTRIES = "pastries"  # Weekend-boosted items (croissant, ensaimada, ...)
    CAKES = "cakes"        # Event-driven items with a strong weekend spike
    DRINKS = "drinks"      # Weather-dependent beverages
    SEASONAL = "seasonal"  # Holiday-specific items (roscón, panettone, ...)
    SAVORY = "savory"      # Weekday-focused savory items
    UNKNOWN = "unknown"    # Fallback when neither keywords nor sales patterns decide


class ProductCategorizer:
    """
    Automatic product categorization based on product name and sales patterns.

    Categories have different characteristics:
    - BREAD: Daily staple, high volume, consistent demand, short shelf life (1 day)
    - PASTRIES: Morning peak, weekend boost, medium shelf life (2-3 days)
    - CAKES: Event-driven, weekends, advance orders, longer shelf life (3-5 days)
    - DRINKS: Weather-dependent, hot/cold seasonal patterns
    - SEASONAL: Holiday-specific (roscón, panettone, etc.)
    - SAVORY: Lunch peak, weekday focus
    """

    def __init__(self):
        """Build the keyword table used for name-based classification."""
        # Each list holds lowercase Spanish/English terms looked up as
        # substrings of the lowercased product name.
        bread_terms = [
            'pan', 'baguette', 'hogaza', 'chapata', 'integral', 'centeno',
            'bread', 'loaf', 'barra', 'molde', 'candeal',
        ]
        pastry_terms = [
            'croissant', 'napolitana', 'palmera', 'ensaimada', 'magdalena',
            'bollo', 'brioche', 'suizo', 'caracola', 'donut', 'berlina',
        ]
        cake_terms = [
            'tarta', 'pastel', 'bizcocho', 'cake', 'torta', 'milhojas',
            'saint honoré', 'selva negra', 'tres leches',
        ]
        drink_terms = [
            'café', 'coffee', 'té', 'tea', 'zumo', 'juice', 'batido',
            'smoothie', 'refresco', 'agua', 'water',
        ]
        seasonal_terms = [
            'roscón', 'panettone', 'turrón', 'polvorón', 'mona de pascua',
            'huevo de pascua', 'buñuelo', 'torrija',
        ]
        savory_terms = [
            'empanada', 'quiche', 'pizza', 'focaccia', 'salado', 'bocadillo',
            'sandwich', 'croqueta', 'hojaldre salado',
        ]

        # Insertion order is significant: it is the order in which
        # categories are scanned during keyword matching.
        keyword_table = {}
        keyword_table[ProductCategory.BREAD] = bread_terms
        keyword_table[ProductCategory.PASTRIES] = pastry_terms
        keyword_table[ProductCategory.CAKES] = cake_terms
        keyword_table[ProductCategory.DRINKS] = drink_terms
        keyword_table[ProductCategory.SEASONAL] = seasonal_terms
        keyword_table[ProductCategory.SAVORY] = savory_terms
        self.category_keywords = keyword_table

    def categorize_product(
        self,
        product_name: str,
        product_id: Optional[str] = None,
        sales_data: Optional[pd.DataFrame] = None
    ) -> ProductCategory:
        """
        Categorize a product based on name and optional sales patterns.

        Keyword matching on the name is tried first. Only when it yields no
        category, and more than 30 rows of sales history are supplied, is the
        sales-pattern analysis used.

        Args:
            product_name: Product name; None or blank skips keyword matching
            product_id: Optional product ID (accepted for API symmetry, not
                used in the decision)
            sales_data: Optional historical sales data for pattern analysis

        Returns:
            ProductCategory enum; UNKNOWN when nothing could be determined
        """
        # A missing or blank name cannot match any keyword, so go straight
        # to the sales-based fallback instead of failing on it.
        if product_name:
            category = self._categorize_by_keywords(product_name)
        else:
            category = ProductCategory.UNKNOWN

        if category != ProductCategory.UNKNOWN:
            logger.info("Product categorized by keywords",
                        product=product_name,
                        category=category.value)
            return category

        # If no keyword match and we have enough sales data, analyze patterns
        if sales_data is not None and len(sales_data) > 30:
            category = self._categorize_by_sales_pattern(product_name, sales_data)
            # Pattern analysis can itself be inconclusive; only report
            # success when it actually produced a category.
            if category != ProductCategory.UNKNOWN:
                logger.info("Product categorized by sales pattern",
                            product=product_name,
                            category=category.value)
                return category

        logger.warning("Could not categorize product, using UNKNOWN",
                       product=product_name)
        return ProductCategory.UNKNOWN

    def _categorize_by_keywords(self, product_name: str) -> ProductCategory:
        """Categorize by matching keywords in product name"""
        product_name_lower = product_name.lower()

        # Check each category's keywords
        for category, keywords in self.category_keywords.items():
            for keyword in keywords:
                if keyword in product_name_lower:
                    return category

        return ProductCategory.UNKNOWN

    def _categorize_by_sales_pattern(
        self,
        product_name: str,
        sales_data: pd.DataFrame
    ) -> ProductCategory:
        """
        Categorize by analyzing daily sales patterns.

        Requires 'date' and 'quantity' columns. Rules are evaluated in order:
        - SEASONAL: monthly totals highly concentrated (Gini > 0.6)
        - BREAD: low variability (CV < 0.3) and no weekend boost (ratio < 1.2)
        - CAKES: strong weekend boost (weekend/weekday ratio > 1.5)
        - PASTRIES: moderate weekend boost (ratio > 1.2)
        - SAVORY: weekday focus (ratio < 0.9)

        DRINKS is never returned here: no temperature data is available to
        this method. Intraday peaks are not analyzed either.

        Args:
            product_name: Product name (not used by the analysis)
            sales_data: Sales history; the caller's frame is not modified

        Returns:
            ProductCategory enum; UNKNOWN when columns are missing, no rule
            applies, or the analysis fails
        """
        try:
            # Ensure we have required columns
            if 'date' not in sales_data.columns or 'quantity' not in sales_data.columns:
                return ProductCategory.UNKNOWN

            # Work on a copy so the helper columns do not leak to the caller
            sales_data = sales_data.copy()
            sales_data['date'] = pd.to_datetime(sales_data['date'])
            sales_data['day_of_week'] = sales_data['date'].dt.dayofweek
            sales_data['month'] = sales_data['date'].dt.month
            sales_data['is_weekend'] = sales_data['day_of_week'].isin([5, 6])

            # Calculate pattern metrics
            weekend_avg = sales_data[sales_data['is_weekend']]['quantity'].mean()
            weekday_avg = sales_data[~sales_data['is_weekend']]['quantity'].mean()
            overall_avg = sales_data['quantity'].mean()
            cv = sales_data['quantity'].std() / overall_avg if overall_avg > 0 else 0

            # Weekend ratio
            weekend_ratio = weekend_avg / weekday_avg if weekday_avg > 0 else 1.0

            # Seasonal concentration (Gini coefficient for months)
            monthly_sales = sales_data.groupby('month')['quantity'].sum()

            # Months inside the observation window that have no rows at all
            # are months with zero sales. Without counting them, a product
            # sold only in one or two months of a year looks evenly spread.
            first_date = sales_data['date'].min()
            last_date = sales_data['date'].max()
            if pd.notna(first_date) and pd.notna(last_date):
                months_spanned = (
                    (last_date.year - first_date.year) * 12
                    + (last_date.month - first_date.month)
                    + 1
                )
                if months_spanned >= 12:
                    covered_months = list(range(1, 13))
                else:
                    # Window shorter than a year: only the calendar months it
                    # touches count, wrapping from December to January.
                    covered_months = [
                        (first_date.month - 1 + offset) % 12 + 1
                        for offset in range(months_spanned)
                    ]
                monthly_sales = monthly_sales.reindex(covered_months, fill_value=0)

            seasonal_concentration = self._gini_coefficient(monthly_sales.values)

            # Decision rules based on patterns
            if seasonal_concentration > 0.6:
                # High concentration in specific months = seasonal
                return ProductCategory.SEASONAL

            elif cv < 0.3 and weekend_ratio < 1.2:
                # Low variance, consistent daily = bread
                return ProductCategory.BREAD

            elif weekend_ratio > 1.5:
                # Strong weekend boost = cakes
                return ProductCategory.CAKES

            elif weekend_ratio > 1.2:
                # Moderate weekend boost = pastries
                return ProductCategory.PASTRIES

            elif weekend_ratio < 0.9:
                # Weekday focus = savory
                return ProductCategory.SAVORY

            else:
                return ProductCategory.UNKNOWN

        except Exception as e:
            # Best-effort analysis: any failure degrades to UNKNOWN
            logger.error(f"Error analyzing sales pattern: {e}")
            return ProductCategory.UNKNOWN

    def _gini_coefficient(self, values: np.ndarray) -> float:
        """Calculate Gini coefficient for concentration measurement"""
        if len(values) == 0:
            return 0.0

        sorted_values = np.sort(values)
        n = len(values)
        cumsum = np.cumsum(sorted_values)

        # Gini coefficient formula
        return (2 * np.sum((np.arange(1, n + 1) * sorted_values))) / (n * cumsum[-1]) - (n + 1) / n

    def get_category_characteristics(self, category: ProductCategory) -> Dict[str, Any]:
        """
        Get forecasting characteristics for a category.

        Returns hyperparameters and settings specific to the category.

        Args:
            category: Category to look up

        Returns:
            Dict with the keys 'shelf_life_days', 'demand_stability',
            'seasonality_strength', 'weekend_factor', 'holiday_factor',
            'weather_sensitivity' and 'prophet_params' (a nested dict of
            model settings). The table is rebuilt on every call, so the
            returned dict is not shared between callers. A category missing
            from the table gets the UNKNOWN profile.
        """
        characteristics = {
            ProductCategory.BREAD: {
                "shelf_life_days": 1,
                "demand_stability": "high",
                "seasonality_strength": "low",
                "weekend_factor": 0.95,  # Slightly lower on weekends
                "holiday_factor": 0.7,   # Much lower on holidays
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.01,  # Very stable
                    "seasonality_prior_scale": 5.0
                }
            },
            ProductCategory.PASTRIES: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.3,   # Boost on weekends
                "holiday_factor": 1.1,   # Slight boost on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            },
            ProductCategory.CAKES: {
                "shelf_life_days": 4,
                "demand_stability": "low",
                "seasonality_strength": "high",
                "weekend_factor": 2.0,   # Large weekend boost
                "holiday_factor": 1.5,   # Holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.1,  # More flexible
                    "seasonality_prior_scale": 15.0
                }
            },
            ProductCategory.DRINKS: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "high",
                "weekend_factor": 1.1,
                "holiday_factor": 1.2,
                "weather_sensitivity": "very_high",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.08,
                    "seasonality_prior_scale": 12.0
                }
            },
            ProductCategory.SEASONAL: {
                "shelf_life_days": 7,
                "demand_stability": "very_low",
                "seasonality_strength": "very_high",
                "weekend_factor": 1.2,
                "holiday_factor": 3.0,   # Massive holiday boost
                "weather_sensitivity": "low",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": False,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.2,  # Very flexible
                    "seasonality_prior_scale": 20.0
                }
            },
            ProductCategory.SAVORY: {
                "shelf_life_days": 1,
                "demand_stability": "medium",
                "seasonality_strength": "low",
                "weekend_factor": 0.8,   # Lower on weekends
                "holiday_factor": 0.6,   # Much lower on holidays
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "additive",
                    "yearly_seasonality": False,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.03,
                    "seasonality_prior_scale": 7.0
                }
            },
            # Neutral defaults, also used as the fallback profile below
            ProductCategory.UNKNOWN: {
                "shelf_life_days": 2,
                "demand_stability": "medium",
                "seasonality_strength": "medium",
                "weekend_factor": 1.0,
                "holiday_factor": 1.0,
                "weather_sensitivity": "medium",
                "prophet_params": {
                    "seasonality_mode": "multiplicative",
                    "yearly_seasonality": True,
                    "weekly_seasonality": True,
                    "daily_seasonality": False,
                    "changepoint_prior_scale": 0.05,
                    "seasonality_prior_scale": 10.0
                }
            }
        }

        return characteristics.get(category, characteristics[ProductCategory.UNKNOWN])

    def batch_categorize(
        self,
        products: List[Dict[str, Any]],
        sales_data: Optional[pd.DataFrame] = None
    ) -> Dict[str, ProductCategory]:
        """
        Categorize multiple products at once.

        Args:
            products: List of dicts with 'id' and 'name' keys
            sales_data: Optional sales data with 'inventory_product_id' column;
                without that column no per-product sales are passed on

        Returns:
            Dict mapping product_id to category. Products sharing the same
            'id' (including a missing one, i.e. None) overwrite each other.
        """
        # Loop-invariant: whether per-product sales can be looked up at all
        can_filter_sales = (
            sales_data is not None
            and 'inventory_product_id' in sales_data.columns
        )

        results = {}

        for product in products:
            product_id = product.get('id')
            # The 'name' key may be present but hold None; treat as blank
            product_name = product.get('name') or ''

            # Filter sales data for this product if available
            product_sales = None
            if can_filter_sales:
                product_sales = sales_data[
                    sales_data['inventory_product_id'] == product_id
                ].copy()

            results[product_id] = self.categorize_product(
                product_name=product_name,
                product_id=product_id,
                sales_data=product_sales
            )

        # Summarize with plain str keys and int counts so the structured
        # log entry holds only built-in types.
        category_counts = Counter(category.value for category in results.values())
        logger.info("Batch categorization complete",
                    total_products=len(products),
                    categories=dict(category_counts))

        return results