bakery-ia/services/forecasting/app/services/prediction_service.py

# services/forecasting/app/services/prediction_service.py - FIXED SEASON FEATURE
"""
Prediction service for loading models and generating predictions
FIXED: Added missing 'season' feature that matches training service exactly
"""

import structlog
from typing import Dict, List, Any, Optional
import asyncio
import pickle
import json
from datetime import datetime, date
import numpy as np
import pandas as pd
import httpx
from pathlib import Path
import os
import joblib

from app.core.config import settings
from shared.monitoring.metrics import MetricsCollector
from shared.database.base import create_database_manager

logger = structlog.get_logger()
metrics = MetricsCollector("forecasting-service")

class PredictionService:
    """
    Service for loading ML models and generating predictions with dependency injection
    Interfaces with trained Prophet models from the training service
    """

    def __init__(self, database_manager=None):
        """Set up the service with a database manager and an empty model cache.

        Args:
            database_manager: Optional ready-made manager. When it is omitted
                (or falsy) a manager is created from ``settings.DATABASE_URL``
                under the name "forecasting-service".
        """
        if not database_manager:
            database_manager = create_database_manager(settings.DATABASE_URL, "forecasting-service")
        self.database_manager = database_manager

        # model_id -> (loaded model, time it was cached)
        self.model_cache = {}
        # Lifetime of a cache entry in seconds (one hour)
        self.cache_ttl = 3600

    async def validate_prediction_request(self, request: Dict[str, Any]) -> Dict[str, Any]:
        """Check that a prediction request carries every mandatory field.

        Only the presence of the keys is verified, not their values.

        Args:
            request: Incoming request payload.

        Returns:
            A dict with ``is_valid`` / ``validation_passed`` flags and an
            ``errors`` list. A successful result also lists the received keys
            under ``validated_fields``. Unexpected errors are logged and
            reported as a failed validation instead of being raised.
        """
        mandatory = ("inventory_product_id", "model_id", "features")

        try:
            absent = [name for name in mandatory if name not in request]

            if not absent:
                return {
                    "is_valid": True,
                    "errors": [],
                    "validation_passed": True,
                    "validated_fields": list(request.keys())
                }

            return {
                "is_valid": False,
                "errors": [f"Missing required fields: {absent}"],
                "validation_passed": False
            }

        except Exception as exc:
            logger.error("Validation error", error=str(exc))
            return {
                "is_valid": False,
                "errors": [str(exc)],
                "validation_passed": False
            }

    async def predict(self, model_id: str, model_path: str, features: Dict[str, Any],
                     confidence_level: float = 0.8) -> Dict[str, float]:
        """Generate a single prediction using a trained model.

        Args:
            model_id: Identifier of the model; also used as the cache key.
            model_path: Location of the serialized model file.
            features: Raw feature values; must contain ``'date'``.
            confidence_level: Echoed back in the result. It does not change
                the interval computed by the model.

        Returns:
            A dict with ``prediction``, ``lower_bound`` and ``upper_bound``
            (each clamped to be non-negative), ``confidence_interval`` (the
            width between the model's raw, unclamped bounds) and
            ``confidence_level``.

        Raises:
            ValueError: If the model cannot be loaded.
            Exception: Any error raised while preparing features or predicting
                is logged, counted in ``prediction_errors_total`` and re-raised.
        """

        start_time = datetime.now()

        try:
            logger.info("Generating prediction",
                       model_id=model_id,
                       features_count=len(features))

            # Load model
            model = await self._load_model(model_id, model_path)

            # _load_model signals failure with None; test for exactly that
            # instead of relying on the truthiness of the loaded object.
            if model is None:
                raise ValueError(f"Model {model_id} not found or failed to load")

            # Prepare features for Prophet model
            prophet_df = self._prepare_prophet_features(features)

            # Generate prediction
            forecast = model.predict(prophet_df)

            # Extract prediction values (the frame has a single row)
            prediction_value = float(forecast['yhat'].iloc[0])
            lower_bound = float(forecast['yhat_lower'].iloc[0])
            upper_bound = float(forecast['yhat_upper'].iloc[0])

            # Interval width, taken from the raw (unclamped) bounds
            confidence_interval = upper_bound - lower_bound

            result = {
                "prediction": max(0, prediction_value),  # Ensure non-negative
                "lower_bound": max(0, lower_bound),
                "upper_bound": max(0, upper_bound),
                "confidence_interval": confidence_interval,
                "confidence_level": confidence_level
            }

            processing_time = (datetime.now() - start_time).total_seconds()

            # Record metrics; a metrics failure must never fail the prediction
            try:
                # Register metrics if not already registered
                if "prediction_processing_time" not in metrics._histograms:
                    metrics.register_histogram(
                        "prediction_processing_time",
                        "Time taken to process predictions",
                        labels=['service', 'model_type']
                    )

                if "predictions_served_total" not in metrics._counters:
                    try:
                        metrics.register_counter(
                            "predictions_served_total",
                            "Total number of predictions served",
                            labels=['service', 'status']
                        )
                    except Exception as reg_error:
                        # Metric might already exist in global registry
                        logger.debug("Counter already exists in registry", error=str(reg_error))

                # Now record the metrics
                metrics.observe_histogram(
                    "prediction_processing_time",
                    processing_time,
                    labels={'service': 'forecasting-service', 'model_type': 'prophet'}
                )
                metrics.increment_counter(
                    "predictions_served_total",
                    labels={'service': 'forecasting-service', 'status': 'success'}
                )
            except Exception as metrics_error:
                # Log metrics error but don't fail the prediction
                logger.warning("Failed to record metrics", error=str(metrics_error))

            logger.info("Prediction generated successfully",
                       model_id=model_id,
                       prediction=result["prediction"],
                       processing_time=processing_time)

            return result

        except Exception as e:
            logger.error("Error generating prediction",
                        error=str(e),
                        model_id=model_id)
            try:
                if "prediction_errors_total" not in metrics._counters:
                    metrics.register_counter(
                        "prediction_errors_total",
                        "Total number of prediction errors",
                        labels=['service', 'error_type']
                    )
                metrics.increment_counter(
                    "prediction_errors_total",
                    labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
                )
            except Exception as metrics_error:
                # Best effort only: leave a trace, but never mask the
                # original prediction error that is re-raised below.
                logger.debug("Failed to record prediction error metric",
                             error=str(metrics_error))
            raise

    async def predict_with_weather_forecast(
        self,
        model_id: str,
        model_path: str,
        features: Dict[str, Any],
        tenant_id: str,
        days: int = 7,
        confidence_level: float = 0.8
    ) -> List[Dict[str, float]]:
        """
        Generate predictions enriched with real weather forecast data

        This method:
        1. Loads the trained ML model
        2. Fetches the weather forecast through the data client
        3. Builds one feature set per day: the start date advanced by the day
           offset, with the forecast weather layered over the base features
        4. Predicts each day and applies the weather business rules to the
           point prediction (the lower/upper bounds are returned unadjusted)

        Args:
            model_id: ID of the trained model
            model_path: Path to model file
            features: Base features for prediction; ``features['date']`` is
                the first forecast day. ``latitude`` / ``longitude`` default
                to 40.4168 / -3.7038 when absent.
            tenant_id: Tenant ID for weather forecast
            days: Number of consecutive days to forecast
            confidence_level: Echoed back in every result entry

        Returns:
            One dict per day with ``date``, ``prediction``, ``lower_bound``,
            ``upper_bound``, ``confidence_level`` and a ``weather`` summary.
            The first entry's ``date`` is ``features['date']`` unchanged;
            later entries carry the advanced date in a matching representation.

        Raises:
            ValueError: If the model cannot be loaded.
            Exception: Any other failure is logged and re-raised.
        """
        from app.services.data_client import data_client

        start_time = datetime.now()

        try:
            logger.info("Generating weather-aware predictions",
                       model_id=model_id,
                       days=days)

            # Step 1: Load ML model (None signals a load failure)
            model = await self._load_model(model_id, model_path)
            if model is None:
                raise ValueError(f"Model {model_id} not found")

            # Step 2: Fetch real weather forecast
            latitude = features.get('latitude', 40.4168)
            longitude = features.get('longitude', -3.7038)

            weather_forecast = await data_client.fetch_weather_forecast(
                tenant_id=tenant_id,
                days=days,
                latitude=latitude,
                longitude=longitude
            )
            # A missing forecast is handled like an empty one: every day then
            # falls back to the base features / defaults below.
            weather_forecast = weather_forecast or []

            logger.info(f"Fetched weather forecast for {len(weather_forecast)} days",
                       tenant_id=tenant_id)

            # Step 3: Generate predictions for each day with weather data
            predictions = []

            for day_offset in range(days):
                # Get weather for this specific day
                raw_weather = weather_forecast[day_offset] if day_offset < len(weather_forecast) else None
                # Drop null readings so the fallbacks below (and the adjustment
                # rules) never receive None where a number is expected.
                day_weather = {
                    key: value
                    for key, value in (raw_weather or {}).items()
                    if value is not None
                }

                # Enrich features with actual weather forecast
                enriched_features = features.copy()
                # Each iteration must target its own calendar day; reusing the
                # start date would produce `days` predictions for one date.
                enriched_features['date'] = self._date_for_offset(features['date'], day_offset)
                enriched_features.update({
                    'temperature': day_weather.get('temperature', features.get('temperature', 20.0)),
                    'precipitation': day_weather.get('precipitation', features.get('precipitation', 0.0)),
                    'humidity': day_weather.get('humidity', features.get('humidity', 60.0)),
                    'wind_speed': day_weather.get('wind_speed', features.get('wind_speed', 10.0)),
                    'pressure': day_weather.get('pressure', features.get('pressure', 1013.0)),
                    'weather_description': day_weather.get('description', 'Clear')
                })

                # Prepare Prophet dataframe with weather features
                prophet_df = self._prepare_prophet_features(enriched_features)

                # Generate prediction for this day
                forecast = model.predict(prophet_df)

                prediction_value = float(forecast['yhat'].iloc[0])
                lower_bound = float(forecast['yhat_lower'].iloc[0])
                upper_bound = float(forecast['yhat_upper'].iloc[0])

                # Apply weather-based adjustments (business rules)
                adjusted_prediction = self._apply_weather_adjustments(
                    prediction_value,
                    day_weather,
                    features.get('product_category', 'general')
                )

                predictions.append({
                    "date": enriched_features['date'],
                    "prediction": max(0, adjusted_prediction),
                    "lower_bound": max(0, lower_bound),
                    "upper_bound": max(0, upper_bound),
                    "confidence_level": confidence_level,
                    "weather": {
                        "temperature": enriched_features['temperature'],
                        "precipitation": enriched_features['precipitation'],
                        "description": enriched_features['weather_description']
                    }
                })

            processing_time = (datetime.now() - start_time).total_seconds()

            logger.info("Weather-aware predictions generated",
                       model_id=model_id,
                       days=len(predictions),
                       processing_time=processing_time)

            return predictions

        except Exception as e:
            logger.error("Error generating weather-aware predictions",
                        error=str(e),
                        model_id=model_id)
            raise

    @staticmethod
    def _date_for_offset(base_date: Any, day_offset: int) -> Any:
        """Return ``base_date`` moved forward by ``day_offset`` days.

        Offset 0 returns the input untouched. Otherwise the result mirrors the
        input's representation: a string stays a string (plain ``YYYY-MM-DD``
        when the input has no time of day), a ``date`` stays a ``date`` and
        anything else becomes a pandas ``Timestamp``.
        """
        if day_offset == 0:
            return base_date

        shifted = pd.to_datetime(base_date) + pd.Timedelta(days=day_offset)

        if isinstance(base_date, str):
            if shifted == shifted.normalize():
                return shifted.date().isoformat()
            return shifted.isoformat()

        if isinstance(base_date, date) and not isinstance(base_date, datetime):
            return shifted.date()

        return shifted

    def _apply_weather_adjustments(
        self,
        base_prediction: float,
        weather: Dict[str, Any],
        product_category: str
    ) -> float:
        """
        Apply business rules based on weather conditions

        Scales the model's prediction by category-specific factors derived
        from the forecast temperature and precipitation. The factors
        compound: a category can receive both a temperature factor and the
        heavy-rain factor.

        Args:
            base_prediction: Prediction produced by the model.
            weather: Forecast for the day. ``temperature`` and
                ``precipitation`` are read; a missing or null reading falls
                back to 20.0 and 0.0 respectively. ``None`` is treated as an
                empty forecast.
            product_category: Category name; only 'ice_cream', 'bread',
                'coffee' and 'pastry' have rules, anything else is returned
                unchanged.

        Returns:
            The adjusted prediction.
        """
        weather = weather or {}

        # dict.get(key, default) only covers a *missing* key; a forecast entry
        # with an explicit null would otherwise break the comparisons below.
        temp = weather.get('temperature')
        if temp is None:
            temp = 20.0
        precip = weather.get('precipitation')
        if precip is None:
            precip = 0.0

        adjusted = base_prediction

        # Temperature-based adjustments
        if product_category == 'ice_cream':
            if temp > 30:
                adjusted *= 1.4  # +40% for very hot days
            elif temp > 25:
                adjusted *= 1.2  # +20% for hot days
            elif temp < 15:
                adjusted *= 0.7  # -30% for cold days

        elif product_category == 'bread':
            if temp > 30:
                adjusted *= 0.9  # -10% for very hot days
            elif temp < 10:
                adjusted *= 1.1  # +10% for cold days

        elif product_category == 'coffee':
            if temp < 15:
                adjusted *= 1.2  # +20% for cold days
            elif precip > 5:
                adjusted *= 1.15  # +15% for rainy days

        # Precipitation-based adjustments (applied on top of the above)
        if precip > 10:  # Heavy rain
            if product_category in ['pastry', 'coffee']:
                adjusted *= 1.2  # People stay indoors, buy comfort food

        return adjusted

    async def _load_model(self, model_id: str, model_path: str):
        """Return the model for ``model_id``, from the cache or from disk.

        The model file is validated first, so a cached model is not served
        once its file has disappeared or fails validation. A cached entry is
        reused while it is younger than ``self.cache_ttl`` seconds; otherwise
        the file is loaded again and the cache is refreshed.

        Args:
            model_id: Cache key for the model.
            model_path: Location of the serialized model file.

        Returns:
            The loaded model, or ``None`` when validation or loading fails
            (failures are logged, not raised).
        """

        # Enhanced model file validation
        if not await self._validate_model_file(model_path):
            logger.error(f"Model file not valid: {model_path}")
            return None

        # Check cache first
        cached_entry = self.model_cache.get(model_id)
        if cached_entry is not None:
            cached_model, cached_time = cached_entry
            # total_seconds() measures the full age. timedelta.seconds holds
            # only the sub-day remainder, so it wraps every 24 hours and made
            # day-old entries look fresh.
            if (datetime.now() - cached_time).total_seconds() < self.cache_ttl:
                return cached_model
            # Expired: drop it so a failed reload cannot leave it lingering
            del self.model_cache[model_id]

        try:
            if os.path.exists(model_path):
                # Try multiple loading methods for compatibility
                model = await self._load_model_safely(model_path)

                if model is None:
                    logger.error(f"Failed to load model from: {model_path}")
                    return None

                # Cache the model together with its load time
                self.model_cache[model_id] = (model, datetime.now())
                logger.info(f"Model loaded successfully: {model_path}")
                return model
            else:
                logger.error(f"Model file not found: {model_path}")
                return None

        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return None

    async def _load_model_safely(self, model_path: str):
        """Deserialize a model file, trying several loaders in turn.

        The loaders are attempted in this order: joblib, the standard
        ``pickle`` module, then pandas' ``read_pickle``. A failure in one
        loader is logged as a warning and the next one is tried.

        NOTE: each of these loaders unpickles the file, and unpickling can
        execute arbitrary code - only trusted model files should get here.

        Args:
            model_path: Location of the serialized model file.

        Returns:
            The first successfully loaded object, or ``None`` when every
            loader fails.
        """

        def _via_joblib():
            return joblib.load(model_path)

        def _via_pickle():
            with open(model_path, 'rb') as handle:
                return pickle.load(handle)

        def _via_pandas():
            import pandas as pd
            return pd.read_pickle(model_path)

        # (label used in log messages, loader), in order of preference
        loaders = (
            ("joblib", _via_joblib),
            ("pickle", _via_pickle),
            ("pandas", _via_pandas),
        )

        for label, load in loaders:
            try:
                logger.debug(f"Attempting to load model with {label}: {model_path}")
                loaded = load()
                logger.info(f"Model loaded successfully with {label}")
                return loaded
            except Exception as err:
                logger.warning(f"{label.capitalize()} loading failed: {err}")

        logger.error(f"All loading methods failed for: {model_path}")
        return None

    async def _validate_model_file(self, model_path: str) -> bool:
        """Run cheap sanity checks on a model file before it is loaded.

        The file must exist, be at least 1024 bytes long and have a readable
        header. An unrecognized header is only logged; it does not fail the
        validation.

        Args:
            model_path: Location of the serialized model file.

        Returns:
            ``True`` when the file may be handed to the loaders, ``False``
            otherwise. Errors are logged, never raised.
        """
        try:
            if not os.path.exists(model_path):
                logger.error(f"Model file not found: {model_path}")
                return False

            # Anything under 1 KB is rejected as too small
            size_in_bytes = os.path.getsize(model_path)
            if size_in_bytes < 1024:
                logger.warning(f"Model file too small ({size_in_bytes} bytes): {model_path}")
                return False

            try:
                with open(model_path, 'rb') as handle:
                    magic = handle.read(16)

                # Prefixes regarded as familiar. The \x80\x0N entries are the
                # pickle PROTO opcode followed by the protocol number; the
                # others were labelled as joblib variants by the original
                # author (not verified here).
                known_prefixes = (
                    b']\x93PICKLE',
                    b'\x80\x03',
                    b'\x80\x04',
                    b'\x80\x05',
                    b'}\x94',
                    b'}\x93',
                )

                if not magic.startswith(known_prefixes):
                    # Informational only: some valid files carry other headers,
                    # so the loading attempt is still allowed.
                    logger.warning(f"Unrecognized file header: {magic[:8]} for {model_path}")
                    logger.info("Proceeding with loading attempt despite unrecognized header")

                return True

            except Exception as header_error:
                logger.error(f"Error reading model file header: {header_error}")
                return False

        except Exception as validation_error:
            logger.error(f"Model validation error: {validation_error}")
            return False

    def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
        """Build the single-row DataFrame that is passed to the model's ``predict``.

        The row holds the parsed date in ``ds`` followed by traffic, weather,
        calendar, derived and interaction columns. Per the original author's
        notes, the column set is meant to mirror the training service's
        feature engineering, so columns should not be added, removed or
        redefined here without changing training as well.

        Args:
            features: Raw feature values. ``'date'`` is required. Missing
                optional keys fall back to the defaults written in the code
                below (e.g. traffic_volume 100.0, temperature 15.0,
                humidity 60.0, pressure 1013.0, is_holiday False).

        Returns:
            A one-row DataFrame: the directly assigned base columns first,
            then the derived and interaction columns appended in one concat.

        Raises:
            Exception: Any failure (e.g. a missing ``'date'`` key or a value
                that cannot be converted to float) is logged and re-raised.
        """

        try:
            # Create base DataFrame with required 'ds' column
            df = pd.DataFrame({
                'ds': [pd.to_datetime(features['date'])]
            })

            # Core traffic features, with defaults for absent keys
            df['traffic_volume'] = float(features.get('traffic_volume', 100.0))
            df['pedestrian_count'] = float(features.get('pedestrian_count', 50.0))
            df['congestion_level'] = float(features.get('congestion_level', 1.0))
            df['average_speed'] = float(features.get('average_speed', 30.0))

            # Weather features, with defaults for absent keys
            df['temperature'] = float(features.get('temperature', 15.0))
            df['precipitation'] = float(features.get('precipitation', 0.0))
            df['humidity'] = float(features.get('humidity', 60.0))
            df['wind_speed'] = float(features.get('wind_speed', 5.0))
            df['pressure'] = float(features.get('pressure', 1013.0))
            df['temp_category'] = self._get_temp_category(df['temperature'].iloc[0])

            # Extract date information for temporal features
            forecast_date = pd.to_datetime(features['date'])
            day_of_week = forecast_date.weekday()  # 0=Monday, 6=Sunday

            # Calendar position features
            df['day_of_week'] = int(day_of_week)
            df['day_of_month'] = int(forecast_date.day)
            df['month'] = int(forecast_date.month)
            df['quarter'] = int(forecast_date.quarter)
            df['week_of_year'] = int(forecast_date.isocalendar().week)

            # Season code from _get_season: 1=winter, 2=spring, 3=summer, 4=autumn
            df['season'] = self._get_season(forecast_date.month)

            # Day-of-week indicator columns
            df['is_weekend'] = int(day_of_week >= 5)
            df['is_monday'] = int(day_of_week == 0)
            df['is_tuesday'] = int(day_of_week == 1)
            df['is_wednesday'] = int(day_of_week == 2)
            df['is_thursday'] = int(day_of_week == 3)
            df['is_friday'] = int(day_of_week == 4)
            df['is_saturday'] = int(day_of_week == 5)
            df['is_sunday'] = int(day_of_week == 6)
            df['is_working_day'] = int(day_of_week < 5)  # Working days (Mon-Fri)

            # Season indicator columns, decoded from the season code above
            df['is_spring'] = int(df['season'].iloc[0] == 2)
            df['is_summer'] = int(df['season'].iloc[0] == 3)
            df['is_autumn'] = int(df['season'].iloc[0] == 4)
            df['is_winter'] = int(df['season'].iloc[0] == 1)

            # The remaining columns are collected in dictionaries and appended
            # with a single concat instead of one column assignment each.

            # Extract values once to avoid repeated iloc calls
            temperature = df['temperature'].iloc[0]
            humidity = df['humidity'].iloc[0]
            pressure = df['pressure'].iloc[0]
            wind_speed = df['wind_speed'].iloc[0]
            precipitation = df['precipitation'].iloc[0]
            traffic = df['traffic_volume'].iloc[0]
            pedestrians = df['pedestrian_count'].iloc[0]
            avg_speed = df['average_speed'].iloc[0]
            congestion = df['congestion_level'].iloc[0]
            season = df['season'].iloc[0]
            is_weekend = df['is_weekend'].iloc[0]

            # Build all new features as a dictionary
            new_features = {
                # Holiday features (taken from the request, default False)
                'is_holiday': int(features.get('is_holiday', False)),
                'is_school_holiday': int(features.get('is_school_holiday', False)),

                # Month indicator features
                'is_january': int(forecast_date.month == 1),
                'is_february': int(forecast_date.month == 2),
                'is_march': int(forecast_date.month == 3),
                'is_april': int(forecast_date.month == 4),
                'is_may': int(forecast_date.month == 5),
                'is_june': int(forecast_date.month == 6),
                'is_july': int(forecast_date.month == 7),
                'is_august': int(forecast_date.month == 8),
                'is_september': int(forecast_date.month == 9),
                'is_october': int(forecast_date.month == 10),
                'is_november': int(forecast_date.month == 11),
                'is_december': int(forecast_date.month == 12),

                # Special day features (fixed day-of-month windows)
                'is_month_start': int(forecast_date.day <= 3),
                'is_month_end': int(forecast_date.day >= 28),
                'is_payday_period': int((forecast_date.day <= 5) or (forecast_date.day >= 25)),

                # Weather-based derived features
                'temp_squared': temperature ** 2,
                'is_cold_day': int(temperature < 10),
                'is_hot_day': int(temperature > 25),
                'is_pleasant_day': int(10 <= temperature <= 25),

                # Humidity features
                'humidity_squared': humidity ** 2,
                'is_high_humidity': int(humidity > 70),
                'is_low_humidity': int(humidity < 40),

                # Pressure features
                'pressure_squared': pressure ** 2,
                'is_high_pressure': int(pressure > 1020),
                'is_low_pressure': int(pressure < 1000),

                # Wind features
                'wind_squared': wind_speed ** 2,
                'is_windy': int(wind_speed > 15),
                'is_calm': int(wind_speed < 5),

                # Precipitation features
                'precip_squared': precipitation ** 2,
                'precip_log': float(np.log1p(precipitation)),
                'is_rainy_day': int(precipitation > 0.1),
                'is_very_rainy_day': int(precipitation > 5.0),
                'is_heavy_rain': int(precipitation > 10),
                'rain_intensity': self._get_rain_intensity(precipitation),

                # Traffic-based features (both flags are 0 for non-positive traffic)
                'high_traffic': int(traffic > 150) if traffic > 0 else 0,
                'low_traffic': int(traffic < 50) if traffic > 0 else 0,
                # Deliberately constant. Original author's rationale: training
                # normalizes a constant 100.0 traffic series, whose std is 0,
                # and emits 0.0 in that case.
                # NOTE(review): confirm against the training service.
                'traffic_normalized': 0.0,
                'traffic_squared': traffic ** 2,
                'traffic_log': float(np.log1p(traffic)),

                # Pedestrian features (normalized around 50 with a scale of 25)
                'high_pedestrian_count': int(pedestrians > 100),
                'low_pedestrian_count': int(pedestrians < 25),
                'pedestrian_normalized': float((pedestrians - 50) / 25),
                'pedestrian_squared': pedestrians ** 2,
                'pedestrian_log': float(np.log1p(pedestrians)),

                # Speed features (normalized around 30 with a scale of 10)
                'high_speed': int(avg_speed > 40),
                'low_speed': int(avg_speed < 20),
                'speed_normalized': float((avg_speed - 30) / 10),
                'speed_squared': avg_speed ** 2,
                'speed_log': float(np.log1p(avg_speed)),

                # Congestion features
                'high_congestion': int(congestion > 3),
                'low_congestion': int(congestion < 2),
                'congestion_squared': congestion ** 2,

                # Day features (Fri-Sun peak days; fixed month sets)
                'is_peak_bakery_day': int(day_of_week in [4, 5, 6]),
                'is_high_demand_month': int(forecast_date.month in [6, 7, 8, 12]),
                'is_warm_season': int(forecast_date.month in [4, 5, 6, 7, 8, 9])
            }

            # Indicator values reused by the interaction features below
            is_holiday = new_features['is_holiday']
            is_pleasant = new_features['is_pleasant_day']
            is_rainy = new_features['is_rainy_day']

            interaction_features = {
                # Weekend interactions
                'weekend_temp_interaction': is_weekend * temperature,
                'weekend_pleasant_weather': is_weekend * is_pleasant,
                'weekend_traffic_interaction': is_weekend * traffic,

                # Holiday interactions
                'holiday_temp_interaction': is_holiday * temperature,
                'holiday_traffic_interaction': is_holiday * traffic,

                # Season interactions (season is the 1-4 code, not an indicator)
                'season_temp_interaction': season * temperature,
                'season_traffic_interaction': season * traffic,

                # Rain-traffic interactions
                'rain_traffic_interaction': is_rainy * traffic,
                'rain_speed_interaction': is_rainy * avg_speed,

                # Day-weather interactions
                'day_temp_interaction': day_of_week * temperature,
                'month_temp_interaction': forecast_date.month * temperature,

                # Traffic-speed interactions
                'traffic_speed_interaction': traffic * avg_speed,
                'pedestrian_speed_interaction': pedestrians * avg_speed,

                # Congestion interactions
                'congestion_temp_interaction': congestion * temperature,
                'congestion_weekend_interaction': congestion * is_weekend
            }

            # Combine all features
            all_new_features = {**new_features, **interaction_features}

            # Append every derived column in one concat (both frames have a
            # single row with the default index, so they align)
            new_feature_df = pd.DataFrame([all_new_features])
            df = pd.concat([df, new_feature_df], axis=1)

            logger.debug("Complete Prophet features prepared",
                        feature_count=len(df.columns),
                        date=features['date'],
                        season=df['season'].iloc[0],
                        traffic_volume=df['traffic_volume'].iloc[0],
                        average_speed=df['average_speed'].iloc[0],
                        pedestrian_count=df['pedestrian_count'].iloc[0])

            return df

        except Exception as e:
            # Log for diagnosis, then let the caller handle the failure
            logger.error("Error preparing Prophet features", error=str(e))
            raise

    def _get_season(self, month: int) -> int:
        """Map a calendar month to a season code - MATCH TRAINING.

        Codes: 1 = winter (Dec-Feb), 2 = spring (Mar-May), 3 = summer
        (Jun-Aug), 4 = autumn (any other value, i.e. Sep-Nov).
        """
        season_months = (
            (1, (12, 1, 2)),
            (2, (3, 4, 5)),
            (3, (6, 7, 8)),
        )
        for code, months in season_months:
            if month in months:
                return code
        return 4

    def _is_school_holiday(self, date: datetime) -> bool:
        """Tell whether a date falls in a school holiday window - MATCH TRAINING.

        The windows are rough approximations of Spanish school holidays:
        all of July and August, 20 December to 10 January, and 1-15 April
        (standing in for Easter).
        """
        month = date.month

        return (
            month in (7, 8)                          # summer break
            or (month == 12 and date.day >= 20)      # Christmas break
            or (month == 1 and date.day <= 10)       # Christmas break, continued
            or (month == 4 and date.day <= 15)       # Easter (approximate)
        )

    def _get_temp_category(self, temperature: float) -> int:
        """Bucket a temperature into a category code - MATCH TRAINING.

        Codes: 0 = very cold (up to 5), 1 = cold (up to 15), 2 = mild
        (up to 25), 3 = hot (anything else). Upper limits are inclusive.
        """
        upper_limits = (5, 15, 25)
        for category, limit in enumerate(upper_limits):
            if temperature <= limit:
                return category
        return 3

    def _get_rain_intensity(self, precipitation: float) -> int:
        """Bucket a precipitation amount into an intensity code - MATCH TRAINING.

        Codes: 0 = no rain (zero or less), 1 = light (up to 2), 2 = moderate
        (up to 10), 3 = heavy (anything else). Upper limits are inclusive.
        """
        upper_limits = (0, 2, 10)
        for intensity, limit in enumerate(upper_limits):
            if precipitation <= limit:
                return intensity
        return 3