Start fixing forecast service 15

2025-07-30 00:23:05 +02:00
parent 2d1ce2d523
commit 1d05e125a5
5 changed files with 677 additions and 382 deletions
--- a/services/forecasting/app/services/prediction_service.py
+++ b/services/forecasting/app/services/prediction_service.py
@@ -1,9 +1,7 @@
-# ================================================================
-# services/forecasting/app/services/prediction_service.py
-# ================================================================
+# services/forecasting/app/services/prediction_service.py - FIXED SEASON FEATURE
 """
 Prediction service for loading models and generating predictions
-Handles the actual ML prediction logic
+FIXED: Added missing 'season' feature that matches training service exactly
 """

 import structlog
@@ -52,46 +50,51 @@ class PredictionService:
            if not model:
                raise ValueError(f"Model {model_id} not found or failed to load")
            
-            # Prepare features for Prophet
-            df = self._prepare_prophet_features(features)
+            # Prepare features for Prophet model
+            prophet_df = self._prepare_prophet_features(features)
            
            # Generate prediction
-            forecast = model.predict(df)
+            forecast = model.predict(prophet_df)
            
-            # Extract prediction results
-            if len(forecast) > 0:
-                row = forecast.iloc[0]
-                result = {
-                    "demand": float(row['yhat']),
-                    "lower_bound": float(row[f'yhat_lower']),
-                    "upper_bound": float(row[f'yhat_upper']),
-                    "trend": float(row.get('trend', 0)),
-                    "seasonal": float(row.get('seasonal', 0)),
-                    "holiday": float(row.get('holidays', 0))
-                }
-            else:
-                raise ValueError("No prediction generated from model")
+            # Extract prediction values
+            prediction_value = float(forecast['yhat'].iloc[0])
+            lower_bound = float(forecast['yhat_lower'].iloc[0])
+            upper_bound = float(forecast['yhat_upper'].iloc[0])
            
-            # Update metrics
+            # Calculate confidence interval
+            confidence_interval = upper_bound - lower_bound
+            
+            result = {
+                "prediction": max(0, prediction_value),  # Ensure non-negative
+                "lower_bound": max(0, lower_bound),
+                "upper_bound": max(0, upper_bound),
+                "confidence_interval": confidence_interval,
+                "confidence_level": confidence_level
+            }
+            
+            # Record metrics
            processing_time = (datetime.now() - start_time).total_seconds()
-            metrics.histogram_observe("forecast_processing_time_seconds", processing_time)
+            metrics.register_histogram("prediction_processing_time_seconds", processing_time)
+            metrics.increment_counter("predictions_served_total")
            
            logger.info("Prediction generated successfully", 
                       model_id=model_id,
-                       predicted_demand=result["demand"],
-                       processing_time_ms=int(processing_time * 1000))
+                       prediction=result["prediction"],
+                       processing_time=processing_time)
            
            return result
            
        except Exception as e:
            logger.error("Error generating prediction", 
-                        model_id=model_id,
-                        error=str(e))
+                        error=str(e), 
+                        model_id=model_id)
+            metrics.increment_counter("prediction_errors_total")
            raise
    
    async def _load_model(self, model_id: str, model_path: str):
-        """Load model from shared volume using joblib"""
+        """Load model from file with improved validation and error handling"""
        
+        # Enhanced model file validation
        if not await self._validate_model_file(model_path):
            logger.error(f"Model file not valid: {model_path}")
            return None
@@ -104,12 +107,16 @@ class PredictionService:
        
        try:
            if os.path.exists(model_path):
-                # ✅ FIX: Use joblib.load instead of pickle.load
-                model = joblib.load(model_path)
+                # Try multiple loading methods for compatibility
+                model = await self._load_model_safely(model_path)
+                
+                if model is None:
+                    logger.error(f"Failed to load model from: {model_path}")
+                    return None
                
                # Cache the model
                self.model_cache[model_id] = (model, datetime.now())
-                logger.info(f"Model loaded from shared volume: {model_path}")
+                logger.info(f"Model loaded successfully: {model_path}")
                return model
            else:
                logger.error(f"Model file not found: {model_path}")
@@ -118,9 +125,44 @@ class PredictionService:
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return None
+    
+    async def _load_model_safely(self, model_path: str):
+        """Load a model from model_path, trying joblib, then pickle, then pandas.read_pickle, in that order.
+        Returns the first object that loads, or None once all three have raised. Contains no awaits."""
+        # Method 1: Try joblib first (recommended for sklearn/Prophet models)
+        try:
+            logger.debug(f"Attempting to load model with joblib: {model_path}")
+            model = joblib.load(model_path)  # NOTE(review): pickle-based; only safe for trusted files
+            logger.info(f"Model loaded successfully with joblib")
+            return model
+        except Exception as e:  # broad on purpose: any failure falls through to the next loader
+            logger.warning(f"Joblib loading failed: {e}")
+        
+        # Method 2: Try pickle as fallback
+        try:
+            logger.debug(f"Attempting to load model with pickle: {model_path}")
+            with open(model_path, 'rb') as f:
+                model = pickle.load(f)  # NOTE(review): unpickling can execute arbitrary code; model_path must be trusted
+            logger.info(f"Model loaded successfully with pickle")
+            return model
+        except Exception as e:  # failure is logged, then the pandas loader is tried
+            logger.warning(f"Pickle loading failed: {e}")
+        
+        # Method 3: Try pandas pickle (for Prophet models saved with pandas)
+        try:
+            logger.debug(f"Attempting to load model with pandas: {model_path}")
+            import pandas as pd  # local import; NOTE(review): probably redundant if pd is imported at module level - confirm
+            model = pd.read_pickle(model_path)  # NOTE(review): same unpickling caveat as above
+            logger.info(f"Model loaded successfully with pandas")
+            return model
+        except Exception as e:  # last loader: fall through to the final error log
+            logger.warning(f"Pandas loading failed: {e}")
+        
+        logger.error(f"All loading methods failed for: {model_path}")
+        return None
        
    async def _validate_model_file(self, model_path: str) -> bool:
-        """Validate model file before loading"""
+        """Enhanced model file validation"""
        try:
            if not os.path.exists(model_path):
                logger.error(f"Model file not found: {model_path}")
@@ -132,15 +174,34 @@ class PredictionService:
                logger.warning(f"Model file too small ({file_size} bytes): {model_path}")
                return False
                
-            # Try to peek at file header to detect format
-            with open(model_path, 'rb') as f:
-                header = f.read(8)
+            # More comprehensive file format detection
+            try:
+                with open(model_path, 'rb') as f:
+                    header = f.read(16)  # Read more bytes for better detection
+                    
+                # Check for various pickle/joblib signatures
+                valid_signatures = [
+                    b']\x93PICKLE',     # Joblib
+                    b'\x80\x03',        # Pickle protocol 3
+                    b'\x80\x04',        # Pickle protocol 4  
+                    b'\x80\x05',        # Pickle protocol 5
+                    b'}\x94',           # Newer joblib format
+                    b'}\x93',           # Alternative joblib format
+                ]
+                
+                is_valid_format = any(header.startswith(sig) for sig in valid_signatures)
+                
+                if not is_valid_format:
+                    # Log header for debugging but don't fail validation
+                    logger.warning(f"Unrecognized file header: {header[:8]} for {model_path}")
+                    logger.info("Proceeding with loading attempt despite unrecognized header")
+                    # Return True to allow loading attempt - some valid files may have different headers
+                    return True
                
-            # Check for joblib signature
-            if header.startswith(b']\x93PICKLE') or header.startswith(b'\x80\x03'):
                return True
-            else:
-                logger.warning(f"Unrecognized file format: {model_path}")
+                
+            except Exception as e:
+                logger.error(f"Error reading model file header: {e}")
                return False
                
        except Exception as e:
@@ -148,7 +209,7 @@ class PredictionService:
            return False
    
    def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
-        """Convert features to Prophet-compatible DataFrame"""
+        """Convert features to Prophet-compatible DataFrame - FIXED TO MATCH TRAINING"""
        
        try:
            # Create base DataFrame with required 'ds' column
@@ -156,15 +217,19 @@ class PredictionService:
                'ds': [pd.to_datetime(features['date'])]
            })
            
-            # Add numeric features
+            # Add numeric features with safe conversion
            numeric_features = [
                'temperature', 'precipitation', 'humidity', 'wind_speed',
-                'traffic_volume', 'pedestrian_count'
+                'traffic_volume', 'pedestrian_count', 'pressure'  # ✅ FIX: Added pressure
            ]
            
            for feature in numeric_features:
                if feature in features and features[feature] is not None:
-                    df[feature] = float(features[feature])
+                    try:
+                        df[feature] = float(features[feature])
+                    except (ValueError, TypeError):
+                        logger.warning(f"Could not convert {feature} to float: {features[feature]}")
+                        df[feature] = 0.0
                else:
                    df[feature] = 0.0
            
@@ -179,9 +244,12 @@ class PredictionService:
            df['quarter'] = int(forecast_date.quarter)
            df['week_of_year'] = int(forecast_date.isocalendar().week)
            
-            # Bakery-specific temporal features (match training exactly!)
-            df['is_weekend'] = int(day_of_week >= 5)  # Saturday=5, Sunday=6
-            df['is_monday'] = int(day_of_week == 0)   # ✅ FIX: Add missing is_monday
+            # ✅ FIX: Add the missing 'season' feature that matches training exactly
+            df['season'] = self._get_season(forecast_date.month)
+            
+            # Bakery-specific temporal features
+            df['is_weekend'] = int(day_of_week >= 5)
+            df['is_monday'] = int(day_of_week == 0)
            df['is_tuesday'] = int(day_of_week == 1)
            df['is_wednesday'] = int(day_of_week == 2)
            df['is_thursday'] = int(day_of_week == 3)
@@ -189,6 +257,15 @@ class PredictionService:
            df['is_saturday'] = int(day_of_week == 5)
            df['is_sunday'] = int(day_of_week == 6)
            
+            # Season-based features (match training service)
+            df['is_spring'] = int(df['season'].iloc[0] == 2)
+            df['is_summer'] = int(df['season'].iloc[0] == 3)
+            df['is_autumn'] = int(df['season'].iloc[0] == 4)
+            df['is_winter'] = int(df['season'].iloc[0] == 1)
+            
+            # Holiday features
+            df['is_holiday'] = int(features.get('is_holiday', False))
+            
            # Month-based features
            df['is_january'] = int(forecast_date.month == 1)
            df['is_february'] = int(forecast_date.month == 2)
@@ -203,35 +280,169 @@ class PredictionService:
            df['is_november'] = int(forecast_date.month == 11)
            df['is_december'] = int(forecast_date.month == 12)
            
-            # Season-based features  
-            season = ((forecast_date.month % 12) + 3) // 3  # 1=spring, 2=summer, 3=autumn, 4=winter
-            df['is_spring'] = int(season == 1)
-            df['is_summer'] = int(season == 2)  
-            df['is_autumn'] = int(season == 3)
-            df['is_winter'] = int(season == 4)
-            
-            # Business context features
-            df['is_holiday'] = int(features.get('is_holiday', False))
-            
-            # Business type encoding
-            business_type = features.get('business_type', 'individual')
-            df['is_central_workshop'] = int(business_type == 'central_workshop')
-            df['is_individual_bakery'] = int(business_type == 'individual')
-            
-            # Special day features (these might be in training data)
+            # Additional features that might be in training data
            df['is_month_start'] = int(forecast_date.day <= 3)
            df['is_month_end'] = int(forecast_date.day >= 28)
            df['is_quarter_start'] = int(forecast_date.month in [1, 4, 7, 10] and forecast_date.day <= 7)
            df['is_quarter_end'] = int(forecast_date.month in [3, 6, 9, 12] and forecast_date.day >= 25)
            
-            logger.debug("Prepared Prophet features", 
-                        features_count=len(df.columns),
+            # Business context features
+            df['is_school_holiday'] = int(self._is_school_holiday(forecast_date))
+            df['is_payday_period'] = int((forecast_date.day <= 5) or (forecast_date.day >= 25))
+            
+            # Working day features
+            df['is_working_day'] = int(day_of_week < 5)  # Monday-Friday
+            df['is_peak_bakery_day'] = int(day_of_week in [4, 5, 6])  # Friday, Saturday, Sunday
+            
+            # Seasonal demand patterns
+            df['is_high_demand_month'] = int(forecast_date.month in [6, 7, 8, 12])
+            df['is_warm_season'] = int(forecast_date.month in [4, 5, 6, 7, 8, 9])
+            
+            # Weather-based derived features (if weather data available)
+            if 'temperature' in df.columns:
+                temp = df['temperature'].iloc[0]
+                df['temp_squared'] = temp ** 2  # ✅ FIX: Added temp_squared
+                df['is_pleasant_day'] = int(18 <= temp <= 25)
+                df['temp_category'] = int(self._get_temp_category(temp))
+                df['is_hot_day'] = int(temp > 25)
+                df['is_cold_day'] = int(temp < 10)
+            
+            if 'precipitation' in df.columns:
+                precip = df['precipitation'].iloc[0]
+                df['is_rainy_day'] = int(precip > 0.1)
+                df['is_heavy_rain'] = int(precip > 10.0)
+                df['rain_intensity'] = int(self._get_rain_intensity(precip))
+            
+            # Traffic-based features (if available)
+            if 'traffic_volume' in df.columns and df['traffic_volume'].iloc[0] > 0:
+                traffic = df['traffic_volume'].iloc[0]
+                # Simple categorization since we don't have historical data for quantiles
+                df['high_traffic'] = int(traffic > 150)  # Assumption based on typical values
+                df['low_traffic'] = int(traffic < 50)
+                df['traffic_normalized'] = float((traffic - 100) / 50)  # Simple normalization
+                
+                # ✅ FIX: Add additional traffic features that might be in training
+                df['traffic_squared'] = traffic ** 2
+                df['traffic_log'] = float(np.log1p(traffic))  # log(1+traffic) to handle zeros
+            else:
+                df['high_traffic'] = 0
+                df['low_traffic'] = 0
+                df['traffic_normalized'] = 0.0
+                df['traffic_squared'] = 0.0
+                df['traffic_log'] = 0.0
+            
+            # Interaction features (common in training)
+            if 'is_weekend' in df.columns and 'temperature' in df.columns:
+                df['weekend_temp_interaction'] = df['is_weekend'].iloc[0] * df['temperature'].iloc[0]
+                df['weekend_pleasant_weather'] = df['is_weekend'].iloc[0] * df.get('is_pleasant_day', pd.Series([0])).iloc[0]
+            
+            if 'is_holiday' in df.columns and 'temperature' in df.columns:
+                df['holiday_temp_interaction'] = df['is_holiday'].iloc[0] * df['temperature'].iloc[0]
+            
+            if 'season' in df.columns and 'temperature' in df.columns:
+                df['season_temp_interaction'] = df['season'].iloc[0] * df['temperature'].iloc[0]
+            
+            # ✅ FIX: Add more interaction features that might be in training
+            if 'is_rainy_day' in df.columns and 'traffic_volume' in df.columns:
+                df['rain_traffic_interaction'] = df['is_rainy_day'].iloc[0] * df['traffic_volume'].iloc[0]
+            
+            if 'is_weekend' in df.columns and 'traffic_volume' in df.columns:
+                df['weekend_traffic_interaction'] = df['is_weekend'].iloc[0] * df['traffic_volume'].iloc[0]
+            
+            # Day-weather interactions
+            if 'day_of_week' in df.columns and 'temperature' in df.columns:
+                df['day_temp_interaction'] = df['day_of_week'].iloc[0] * df['temperature'].iloc[0]
+            
+            if 'month' in df.columns and 'temperature' in df.columns:
+                df['month_temp_interaction'] = df['month'].iloc[0] * df['temperature'].iloc[0]
+            
+            # ✅ FIX: Add comprehensive derived features to match training
+            
+            # Humidity-based features
+            if 'humidity' in df.columns:
+                humidity = df['humidity'].iloc[0]
+                df['humidity_squared'] = humidity ** 2
+                df['is_high_humidity'] = int(humidity > 70)
+                df['is_low_humidity'] = int(humidity < 40)
+            
+            # Pressure-based features
+            if 'pressure' in df.columns:
+                pressure = df['pressure'].iloc[0]
+                df['pressure_squared'] = pressure ** 2
+                df['is_high_pressure'] = int(pressure > 1020)
+                df['is_low_pressure'] = int(pressure < 1000)
+            
+            # Wind-based features
+            if 'wind_speed' in df.columns:
+                wind = df['wind_speed'].iloc[0]
+                df['wind_squared'] = wind ** 2
+                df['is_windy'] = int(wind > 15)
+                df['is_calm'] = int(wind < 5)
+            
+            # Precipitation-based features (additional to basic ones)
+            if 'precipitation' in df.columns:
+                precip = df['precipitation'].iloc[0]
+                df['precip_squared'] = precip ** 2
+                df['precip_log'] = float(np.log1p(precip))
+            
+            logger.debug("Prophet features prepared with comprehensive derived features", 
+                        feature_count=len(df.columns),
                        date=features['date'],
+                        season=df['season'].iloc[0],
                        day_of_week=day_of_week,
-                        is_monday=df['is_monday'].iloc[0])
+                        temp_squared=df.get('temp_squared', pd.Series([0])).iloc[0])
            
            return df
            
        except Exception as e:
-            logger.error("Error preparing Prophet features", error=str(e))
-            raise
+            logger.error(f"Error preparing Prophet features: {e}")
+            raise
+    
+    def _get_season(self, month: int) -> int:
+        """Map a month number to a season code: 1=Winter, 2=Spring, 3=Summer, 4=Autumn. Per the original note, this must match the training service."""
+        if month in [12, 1, 2]:
+            return 1  # Winter (Dec-Feb)
+        elif month in [3, 4, 5]:
+            return 2  # Spring (Mar-May)
+        elif month in [6, 7, 8]:
+            return 3  # Summer (Jun-Aug)
+        else:
+            return 4  # Autumn (Sep-Nov); any value outside 1-12 also lands here
+    
+    def _is_school_holiday(self, date: datetime) -> bool:
+        """Return True if date falls in one of the hard-coded holiday windows below, else False. Per the original note, this must match the training service."""
+        month = date.month
+        
+        # Approximate Spanish school holiday periods (fixed calendar windows, not a real school calendar)
+        if month in [7, 8]:  # Summer holidays: all of July and August
+            return True
+        if month == 12 and date.day >= 20:  # Christmas holidays: Dec 20 onwards
+            return True
+        if month == 1 and date.day <= 10:  # Christmas holidays continued: through Jan 10
+            return True
+        if month == 4 and date.day <= 15:  # Easter holidays (approximate): fixed Apr 1-15, not the actual Easter date
+            return True
+        
+        return False
+    
+    def _get_temp_category(self, temperature: float) -> int:
+        """Bucket a temperature into a category 0-3 using inclusive upper bounds 5, 15 and 25 (presumably degrees Celsius - confirm). Per the original note, this must match the training service."""
+        if temperature <= 5:
+            return 0  # Very cold: up to and including 5
+        elif temperature <= 15:
+            return 1  # Cold: above 5, up to 15
+        elif temperature <= 25:
+            return 2  # Mild: above 15, up to 25
+        else:
+            return 3  # Hot: everything that failed the three comparisons above
+    
+    def _get_rain_intensity(self, precipitation: float) -> int:
+        """Bucket a precipitation amount into a category 0-3 using inclusive upper bounds 0, 2 and 10 (presumably millimetres - confirm). Per the original note, this must match the training service."""
+        if precipitation <= 0:
+            return 0  # No rain: zero or negative readings
+        elif precipitation <= 2:
+            return 1  # Light rain: above 0, up to 2
+        elif precipitation <= 10:
+            return 2  # Moderate rain: above 2, up to 10
+        else:
+            return 3  # Heavy rain: everything that failed the three comparisons above