# bakery-ia/services/training/app/ml/model_selector.py

"""
Model Selection System
Determines whether to use Prophet-only or Hybrid Prophet+XGBoost models
"""

import pandas as pd
import numpy as np
from typing import Dict, Any, Optional
import structlog

# Module-level structlog logger used by ModelSelector for its selection
# diagnostics (structured key/value events).
logger = structlog.get_logger()


class ModelSelector:
    """
    Choose between a Prophet-only and a hybrid Prophet+XGBoost model.

    Two independent entry points are offered:

    - ``select_model_type`` scores the training data heuristically
      (size, variance, seasonality, sparsity, product category).
    - ``compare_models`` picks a model from already-computed MAPE metrics.

    All tunable numbers live in the class constants below so that a
    subclass can override them without touching the logic.
    """

    # Thresholds for model selection
    MIN_DATA_POINTS_HYBRID = 90  # Minimum data points for hybrid
    HIGH_VARIANCE_THRESHOLD = 0.5  # CV > 0.5 suggests complex patterns
    LOW_SEASONALITY_THRESHOLD = 0.3  # Weak seasonal patterns
    HYBRID_IMPROVEMENT_THRESHOLD = 0.05  # 5% MAPE improvement to justify hybrid

    # Data-size scoring: above LARGE favors hybrid, below SMALL favors Prophet
    LARGE_DATASET_THRESHOLD = 180
    SMALL_DATASET_THRESHOLD = 120

    # Share of exact-zero observations above which the hybrid gets a point
    HIGH_ZERO_RATIO_THRESHOLD = 0.3

    # Product categories that tilt the score one way or the other
    HYBRID_FRIENDLY_CATEGORIES = ('seasonal', 'cakes')
    PROPHET_FRIENDLY_CATEGORIES = ('bread', 'savory')

    # Seasonality estimation parameters
    MIN_POINTS_FOR_SEASONALITY = 14
    WEEKLY_LAG = 7
    MONTHLY_LAG = 30
    WEEKLY_WEIGHT = 0.7
    MONTHLY_WEIGHT = 0.3
    # Neutral value reported whenever seasonality cannot be measured
    DEFAULT_SEASONALITY_STRENGTH = 0.5

    # Minimum points before a linear trend is fitted
    MIN_POINTS_FOR_TREND = 30

    def __init__(self):
        """The selector is stateless; nothing to initialize."""
        pass

    def select_model_type(
        self,
        df: pd.DataFrame,
        product_category: str = "unknown",
        force_prophet: bool = False,
        force_hybrid: bool = False
    ) -> str:
        """
        Select best model type based on data characteristics.

        Forced selections are honored first (``force_prophet`` wins when
        both flags are set). Otherwise, data with fewer than
        ``MIN_DATA_POINTS_HYBRID`` rows always yields "prophet". For larger
        data, points are awarded to each candidate and the hybrid is chosen
        only when its score is strictly greater than Prophet's.

        Args:
            df: Training data with 'y' column
            product_category: Product category (bread, pastries, etc.)
            force_prophet: Force Prophet-only model
            force_hybrid: Force hybrid model

        Returns:
            "prophet" or "hybrid"
        """
        # Honor forced selections
        if force_prophet:
            logger.info("Prophet-only model forced by configuration")
            return "prophet"

        if force_hybrid:
            logger.info("Hybrid model forced by configuration")
            return "hybrid"

        n_points = len(df)

        # Check minimum data requirements
        if n_points < self.MIN_DATA_POINTS_HYBRID:
            logger.info(
                "Insufficient data for hybrid model, using Prophet",
                data_points=n_points,
                min_required=self.MIN_DATA_POINTS_HYBRID
            )
            return "prophet"

        # Calculate data characteristics
        characteristics = self._analyze_data_characteristics(df)
        cv = characteristics['coefficient_of_variation']
        seasonality = characteristics['seasonality_strength']

        # Decision logic
        score_hybrid = 0
        score_prophet = 0

        # Factor 1: Data complexity (variance)
        if cv > self.HIGH_VARIANCE_THRESHOLD:
            score_hybrid += 2
            logger.debug("High variance detected, favoring hybrid", cv=cv)
        else:
            score_prophet += 1

        # Factor 2: Seasonality strength
        if seasonality < self.LOW_SEASONALITY_THRESHOLD:
            score_hybrid += 2
            logger.debug("Weak seasonality detected, favoring hybrid", strength=seasonality)
        else:
            score_prophet += 1

        # Factor 3: Data size (more data = better for hybrid); the band
        # between the two thresholds is neutral
        if n_points > self.LARGE_DATASET_THRESHOLD:
            score_hybrid += 1
        elif n_points < self.SMALL_DATASET_THRESHOLD:
            score_prophet += 1

        # Factor 4: Product category considerations
        if product_category in self.HYBRID_FRIENDLY_CATEGORIES:
            # Event-driven products benefit from XGBoost pattern learning
            score_hybrid += 1
        elif product_category in self.PROPHET_FRIENDLY_CATEGORIES:
            # Stable products work well with Prophet
            score_prophet += 1

        # Factor 5: Zero ratio (sparse data)
        if characteristics['zero_ratio'] > self.HIGH_ZERO_RATIO_THRESHOLD:
            # High zero ratio suggests difficult forecasting, hybrid might help
            score_hybrid += 1

        # Make decision (ties go to the simpler Prophet model)
        selected_model = "hybrid" if score_hybrid > score_prophet else "prophet"

        logger.info(
            "Model selection complete",
            selected_model=selected_model,
            score_hybrid=score_hybrid,
            score_prophet=score_prophet,
            data_points=n_points,
            cv=cv,
            seasonality=seasonality,
            category=product_category
        )

        return selected_model

    def _analyze_data_characteristics(self, df: pd.DataFrame) -> Dict[str, float]:
        """
        Analyze time series characteristics.

        Args:
            df: DataFrame with 'y' column (sales data)

        Returns:
            Dictionary with keys 'coefficient_of_variation', 'zero_ratio',
            'seasonality_strength', 'trend_strength', 'mean' and 'std'.
            An empty 'y' column yields a neutral profile (all zeros, with
            the default seasonality strength) instead of NaN values.
        """
        y = df['y'].values
        n_points = len(y)

        if n_points == 0:
            # Nothing to measure: avoid 0/0 and NaN statistics downstream
            return {
                'coefficient_of_variation': 0.0,
                'zero_ratio': 0.0,
                'seasonality_strength': float(self.DEFAULT_SEASONALITY_STRENGTH),
                'trend_strength': 0.0,
                'mean': 0.0,
                'std': 0.0
            }

        mean_y = np.mean(y)
        std_y = np.std(y)

        # Coefficient of variation (defined only for a positive mean)
        cv = std_y / mean_y if mean_y > 0 else 0

        # Zero ratio
        zero_ratio = (y == 0).sum() / n_points

        seasonality_strength = self._estimate_seasonality_strength(y)

        # Trend strength: |r| of a straight-line fit against the row index
        if n_points >= self.MIN_POINTS_FOR_TREND:
            from scipy import stats
            x = np.arange(n_points)
            _, _, r_value, _, _ = stats.linregress(x, y)
            trend_strength = abs(r_value)
        else:
            trend_strength = 0

        return {
            'coefficient_of_variation': float(cv),
            'zero_ratio': float(zero_ratio),
            'seasonality_strength': float(seasonality_strength),
            'trend_strength': float(trend_strength),
            'mean': float(mean_y),
            'std': float(std_y)
        }

    def _estimate_seasonality_strength(self, y) -> float:
        """
        Estimate seasonality from autocorrelation at the weekly and monthly lags.

        The absolute autocorrelations are blended (weekly weighted more) and
        clipped to [0, 1]. ``DEFAULT_SEASONALITY_STRENGTH`` is returned when
        there are too few points, when the computation raises, or when an
        autocorrelation is undefined (NaN, e.g. for a constant series).

        Args:
            y: 1-D array of observations

        Returns:
            Seasonality strength in [0, 1]
        """
        default = float(self.DEFAULT_SEASONALITY_STRENGTH)
        n_points = len(y)

        if n_points < self.MIN_POINTS_FOR_SEASONALITY:
            return default

        try:
            series = pd.Series(y)
            weekly_autocorr = series.autocorr(lag=self.WEEKLY_LAG)
            # Monthly lag only contributes when the series is long enough
            monthly_autocorr = (
                series.autocorr(lag=self.MONTHLY_LAG)
                if n_points > self.MONTHLY_LAG else 0.0
            )

            # A NaN autocorrelation would otherwise slip through the
            # min/max clamp below and be reported as 1.0
            if not (np.isfinite(weekly_autocorr) and np.isfinite(monthly_autocorr)):
                return default

            combined = (
                abs(weekly_autocorr) * self.WEEKLY_WEIGHT
                + abs(monthly_autocorr) * self.MONTHLY_WEIGHT
            )
            # Ensure in valid range [0, 1]
            return float(max(0.0, min(1.0, combined)))
        except Exception as exc:
            # Best-effort metric: keep going with the neutral default, but
            # leave a trace instead of failing silently
            logger.warning(
                "Autocorrelation failed, using default seasonality strength",
                error=str(exc)
            )
            return default

    def compare_models(
        self,
        prophet_metrics: Dict[str, float],
        hybrid_metrics: Dict[str, float]
    ) -> str:
        """
        Compare Prophet and Hybrid model performance.

        The hybrid is chosen only when its relative MAPE improvement over
        Prophet reaches ``HYBRID_IMPROVEMENT_THRESHOLD``. A missing 'mape'
        key is treated as infinity. Non-finite metrics are handled
        explicitly: an unusable hybrid MAPE yields "prophet"; an unusable
        Prophet MAPE with a usable hybrid MAPE yields "hybrid".

        Args:
            prophet_metrics: Prophet model metrics (with 'mape' key)
            hybrid_metrics: Hybrid model metrics (with 'mape' key)

        Returns:
            "prophet" or "hybrid" based on better performance
        """
        prophet_mape = prophet_metrics.get('mape', float('inf'))
        hybrid_mape = hybrid_metrics.get('mape', float('inf'))

        # Without a usable hybrid metric there is nothing to justify the
        # extra complexity
        if not np.isfinite(hybrid_mape):
            logger.info(
                "Prophet model selected (hybrid MAPE unavailable)",
                prophet_mape=prophet_mape,
                hybrid_mape=hybrid_mape
            )
            return "prophet"

        # (inf - x) / inf is NaN, so decide this case directly
        if not np.isfinite(prophet_mape):
            logger.info(
                "Hybrid model selected (Prophet MAPE unavailable)",
                prophet_mape=prophet_mape,
                hybrid_mape=hybrid_mape
            )
            return "hybrid"

        # Calculate improvement
        if prophet_mape > 0:
            improvement = (prophet_mape - hybrid_mape) / prophet_mape
        else:
            improvement = 0

        # Hybrid must improve by at least threshold to justify complexity
        if improvement >= self.HYBRID_IMPROVEMENT_THRESHOLD:
            logger.info(
                "Hybrid model selected based on performance",
                prophet_mape=prophet_mape,
                hybrid_mape=hybrid_mape,
                improvement=f"{improvement*100:.1f}%"
            )
            return "hybrid"

        logger.info(
            "Prophet model selected (hybrid improvement insufficient)",
            prophet_mape=prophet_mape,
            hybrid_mape=hybrid_mape,
            improvement=f"{improvement*100:.1f}%"
        )
        return "prophet"


def should_use_hybrid_model(
    df: pd.DataFrame,
    product_category: str = "unknown",
    tenant_settings: Optional[Dict[str, Any]] = None
) -> bool:
    """
    Convenience function to determine if hybrid model should be used.

    Builds a ``ModelSelector``, forwards the tenant's force flags to
    ``select_model_type`` and reports whether the result is "hybrid".

    Args:
        df: Training data
        product_category: Product category
        tenant_settings: Optional tenant-specific settings. The keys
            'force_prophet_only' and 'force_hybrid' are read; both default
            to False when absent or when no settings are given.

    Returns:
        True if hybrid model should be used, False otherwise
    """
    # Treat "no settings" (None or empty) as an empty mapping so the
    # lookups below need no per-key guard
    settings = tenant_settings if tenant_settings else {}

    selected = ModelSelector().select_model_type(
        df=df,
        product_category=product_category,
        force_prophet=settings.get('force_prophet_only', False),
        force_hybrid=settings.get('force_hybrid', False)
    )

    return selected == "hybrid"