improve features

This commit is contained in:
Urtzi Alfaro
2025-11-14 07:23:56 +01:00
parent 9bc048d360
commit a8d8828935
32 changed files with 5436 additions and 271 deletions

View File

@@ -56,21 +56,17 @@ class BakeryForecaster:
from app.services.poi_feature_service import POIFeatureService
self.poi_feature_service = POIFeatureService()
# Initialize enhanced data processor from shared module
if use_enhanced_features:
# Import enhanced data processor from training service
import sys
import os
# Add training service to path
training_path = os.path.join(os.path.dirname(__file__), '../../../training')
if training_path not in sys.path:
sys.path.insert(0, training_path)
try:
from app.ml.data_processor import EnhancedBakeryDataProcessor
self.data_processor = EnhancedBakeryDataProcessor(database_manager)
logger.info("Enhanced features enabled for forecasting")
from shared.ml.data_processor import EnhancedBakeryDataProcessor
self.data_processor = EnhancedBakeryDataProcessor(region='MD')
logger.info("Enhanced features enabled using shared data processor")
except ImportError as e:
logger.warning(f"Could not import EnhancedBakeryDataProcessor: {e}, falling back to basic features")
logger.warning(
f"Could not import EnhancedBakeryDataProcessor from shared module: {e}. "
"Falling back to basic features."
)
self.use_enhanced_features = False
self.data_processor = None
else:
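Side note on the import change above: the per-call sys.path manipulation was dropped in favor of a top-level shared package. A minimal sketch of one way to make that package importable, assuming shared/ sits at the repository root (the relative path is an assumption, not part of this commit):

# Hypothetical: make the top-level `shared` package importable once, at module load.
import os
import sys

REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from shared.ml.data_processor import EnhancedBakeryDataProcessor  # noqa: E402

processor = EnhancedBakeryDataProcessor(region='MD')  # constructor call as used above

In practice, installing shared as an editable package (pip install -e) avoids the path manipulation entirely.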

View File

@@ -1056,13 +1056,13 @@ class EnhancedForecastingService:
- External service is unavailable
"""
try:
# Get tenant's calendar ID
calendar_id = await self.data_client.get_tenant_calendar(tenant_id)
# Get tenant's calendar information
calendar_info = await self.data_client.fetch_tenant_calendar(tenant_id)
if calendar_id:
if calendar_info:
# Check school holiday via external service
is_school_holiday = await self.data_client.check_school_holiday(
calendar_id=calendar_id,
calendar_id=calendar_info["calendar_id"],
check_date=date_obj.isoformat(),
tenant_id=tenant_id
)
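Note on the new return shape: fetch_tenant_calendar now returns a mapping rather than a bare ID, so indexing calendar_info["calendar_id"] can raise KeyError if the key is absent. A defensive variant of the same call site, inside the same async method (the .get fallback is an assumption, not part of this commit):

calendar_info = await self.data_client.fetch_tenant_calendar(tenant_id)
calendar_id = (calendar_info or {}).get("calendar_id")
if calendar_id:
    is_school_holiday = await self.data_client.check_school_holiday(
        calendar_id=calendar_id,
        check_date=date_obj.isoformat(),
        tenant_id=tenant_id,
    )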

View File

@@ -206,13 +206,39 @@ class PredictionService:
# Calculate confidence interval
confidence_interval = upper_bound - lower_bound
# Adjust confidence based on data freshness if historical features were calculated
adjusted_confidence_level = confidence_level
data_availability_score = features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# A score below 0.5 indicates the most recent data is more than 90 days old; scale confidence down
adjusted_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=adjusted_confidence_level,
data_availability_score=data_availability_score,
original_interval=confidence_interval,
adjusted_interval=adjusted_upper_bound - adjusted_lower_bound)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
confidence_interval = upper_bound - lower_bound
result = {
"prediction": max(0, prediction_value), # Ensure non-negative
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_interval": confidence_interval,
"confidence_level": confidence_level
"confidence_level": adjusted_confidence_level,
"data_freshness_score": data_availability_score # Include data freshness in result
}
# Record metrics
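Worked example of the freshness adjustment above (numbers are illustrative): with confidence_level = 0.95 and data_availability_score = 0.4, the adjusted confidence is max(0.6, 0.95 × 0.4) = 0.6 and the interval widens by a factor of 1.0 + 0.5 × (1.0 − 0.4) = 1.3, i.e. 30% wider. A standalone sketch of the same logic:

def adjust_for_freshness(prediction, lower, upper, confidence_level, score):
    # Mirrors the in-service adjustment above; standalone for illustration only.
    if score >= 0.5:
        return lower, upper, confidence_level
    adjusted_confidence = max(0.6, confidence_level * score)
    factor = 1.0 + 0.5 * (1.0 - score)  # up to 50% wider interval
    new_lower = max(0.0, prediction - (prediction - lower) * factor)
    new_upper = prediction + (upper - prediction) * factor
    return new_lower, new_upper, adjusted_confidence

print(adjust_for_freshness(100.0, 80.0, 120.0, 0.95, 0.4))  # (74.0, 126.0, 0.6)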
@@ -222,35 +248,45 @@ class PredictionService:
# Register metrics if not already registered
if "prediction_processing_time" not in metrics._histograms:
metrics.register_histogram(
"prediction_processing_time",
"Time taken to process predictions",
"prediction_processing_time",
"Time taken to process predictions",
labels=['service', 'model_type']
)
if "predictions_served_total" not in metrics._counters:
try:
metrics.register_counter(
"predictions_served_total",
"Total number of predictions served",
"predictions_served_total",
"Total number of predictions served",
labels=['service', 'status']
)
except Exception as reg_error:
# Metric might already exist in global registry
logger.debug("Counter already exists in registry", error=str(reg_error))
# Now record the metrics
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
# Now record the metrics: try with the expected labels, fall back if needed
try:
metrics.observe_histogram(
"prediction_processing_time",
processing_time,
labels={'service': 'forecasting-service', 'model_type': 'prophet'}
)
metrics.increment_counter(
"predictions_served_total",
labels={'service': 'forecasting-service', 'status': 'success'}
)
except Exception as label_error:
# If specific labels fail, try without labels to avoid breaking predictions
logger.warning("Failed to record metrics with labels, trying without", error=str(label_error))
try:
metrics.observe_histogram("prediction_processing_time", processing_time)
metrics.increment_counter("predictions_served_total")
except Exception as no_label_error:
logger.warning("Failed to record metrics even without labels", error=str(no_label_error))
except Exception as metrics_error:
# Log metrics error but don't fail the prediction
logger.warning("Failed to record metrics", error=str(metrics_error))
logger.warning("Failed to register or record metrics", error=str(metrics_error))
logger.info("Prediction generated successfully",
model_id=model_id,
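The register-then-record-with-fallback pattern above repeats for each metric; it could be factored into a helper. A sketch using the same metrics methods seen in this service (the helper itself is hypothetical):

def safe_increment(metrics, name, description, label_names=None, label_values=None):
    # Hypothetical helper: register on first use, record with labels,
    # degrade to unlabeled recording, and never fail the request path.
    try:
        if name not in metrics._counters:
            metrics.register_counter(name, description, labels=label_names or [])
        try:
            metrics.increment_counter(name, labels=label_values or {})
        except Exception:
            metrics.increment_counter(name)
    except Exception as exc:
        logger.warning("Metric recording skipped", metric=name, error=str(exc))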
@@ -260,22 +296,32 @@ class PredictionService:
return result
except Exception as e:
logger.error("Error generating prediction",
error=str(e),
logger.error("Error generating prediction",
error=str(e),
model_id=model_id)
# Record error metrics with robust error handling
try:
if "prediction_errors_total" not in metrics._counters:
metrics.register_counter(
"prediction_errors_total",
"Total number of prediction errors",
"prediction_errors_total",
"Total number of prediction errors",
labels=['service', 'error_type']
)
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception:
pass # Don't fail on metrics errors
# Try with labels first, then without if that fails
try:
metrics.increment_counter(
"prediction_errors_total",
labels={'service': 'forecasting-service', 'error_type': 'prediction_failed'}
)
except Exception as label_error:
logger.debug("Failed to record error metrics with labels", error=str(label_error))
try:
metrics.increment_counter("prediction_errors_total")
except Exception as no_label_error:
logger.warning("Failed to record error metrics even without labels", error=str(no_label_error))
except Exception as registration_error:
logger.warning("Failed to register error metrics", error=str(registration_error))
raise
async def predict_with_weather_forecast(
@@ -353,6 +399,33 @@ class PredictionService:
'weather_description': day_weather.get('description', 'Clear')
})
# CRITICAL FIX: Fetch historical sales data and calculate historical features
# This populates lag, rolling, and trend features for better predictions
# Using 90 days for better trend analysis and more robust rolling statistics
if 'tenant_id' in enriched_features and 'inventory_product_id' in enriched_features and 'date' in enriched_features:
try:
forecast_date = pd.to_datetime(enriched_features['date'])
historical_sales = await self._fetch_historical_sales(
tenant_id=enriched_features['tenant_id'],
inventory_product_id=enriched_features['inventory_product_id'],
forecast_date=forecast_date,
days_back=90 # Changed from 30 to 90 for better historical context
)
# Calculate historical features and merge into features dict
historical_features = self._calculate_historical_features(
historical_sales, forecast_date
)
enriched_features.update(historical_features)
logger.info("Historical features enriched",
lag_1_day=historical_features.get('lag_1_day'),
rolling_mean_7d=historical_features.get('rolling_mean_7d'))
except Exception as e:
logger.warning("Failed to enrich with historical features, using defaults",
error=str(e))
# Features dict will use defaults (0.0) from _prepare_prophet_features
# Prepare Prophet dataframe with weather features
prophet_df = self._prepare_prophet_features(enriched_features)
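For reference, the lag/rolling features being merged here can be computed directly from a daily-indexed pandas Series. A minimal, self-contained sketch (not the shared calculator the service actually uses):

import pandas as pd

def basic_lag_features(sales: pd.Series) -> dict:
    # Illustration only; HistoricalFeatureCalculator is the source of truth.
    if sales.empty:
        return {"lag_1_day": 0.0, "lag_7_day": 0.0, "rolling_mean_7d": 0.0}
    lag_1 = float(sales.iloc[-1])
    lag_7 = float(sales.iloc[-7]) if len(sales) >= 7 else lag_1
    return {
        "lag_1_day": lag_1,
        "lag_7_day": lag_7,
        "rolling_mean_7d": float(sales.iloc[-7:].mean()),
    }

idx = pd.date_range("2025-10-01", periods=10, freq="D")
print(basic_lag_features(pd.Series(range(10), index=idx, dtype=float)))
# {'lag_1_day': 9.0, 'lag_7_day': 3.0, 'rolling_mean_7d': 6.0}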
@@ -363,6 +436,29 @@ class PredictionService:
lower_bound = float(forecast['yhat_lower'].iloc[0])
upper_bound = float(forecast['yhat_upper'].iloc[0])
# Calculate confidence adjustment based on data freshness
current_confidence_level = confidence_level
data_availability_score = enriched_features.get('historical_data_availability_score', 1.0) # Default to 1.0 if not available
# Adjust confidence based on data freshness if historical features were calculated
# Reduce confidence if historical data is significantly old
if data_availability_score < 0.5:
# A score below 0.5 indicates the most recent data is more than 90 days old; scale confidence down
current_confidence_level = max(0.6, confidence_level * data_availability_score)
# Increase confidence interval to reflect uncertainty
adjustment_factor = 1.0 + (0.5 * (1.0 - data_availability_score)) # Up to 50% wider interval
adjusted_lower_bound = prediction_value - (prediction_value - lower_bound) * adjustment_factor
adjusted_upper_bound = prediction_value + (upper_bound - prediction_value) * adjustment_factor
logger.info("Adjusted weather prediction confidence due to stale historical data",
original_confidence=confidence_level,
adjusted_confidence=current_confidence_level,
data_availability_score=data_availability_score)
lower_bound = max(0, adjusted_lower_bound)
upper_bound = adjusted_upper_bound
# Apply weather-based adjustments (business rules)
adjusted_prediction = self._apply_weather_adjustments(
prediction_value,
@@ -375,7 +471,8 @@ class PredictionService:
"prediction": max(0, adjusted_prediction),
"lower_bound": max(0, lower_bound),
"upper_bound": max(0, upper_bound),
"confidence_level": confidence_level,
"confidence_level": current_confidence_level,
"data_freshness_score": data_availability_score, # Include data freshness in result
"weather": {
"temperature": enriched_features['temperature'],
"precipitation": enriched_features['precipitation'],
@@ -567,6 +664,8 @@ class PredictionService:
) -> pd.Series:
"""
Fetch historical sales data for calculating lagged and rolling features.
Enhanced to handle cases where recent data is not available by extending
the search for the most recent data if needed.
Args:
tenant_id: Tenant UUID
@@ -578,7 +677,7 @@ class PredictionService:
pandas Series with sales quantities indexed by date
"""
try:
# Calculate date range
# Calculate initial date range for recent data
end_date = forecast_date - pd.Timedelta(days=1) # Day before forecast
start_date = end_date - pd.Timedelta(days=days_back)
@@ -589,7 +688,7 @@ class PredictionService:
end_date=end_date.date(),
days_back=days_back)
# Fetch sales data from sales service
# First, try to fetch sales data from the recent period
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=start_date.strftime("%Y-%m-%d"),
@@ -598,15 +697,72 @@ class PredictionService:
aggregation="daily"
)
# If no recent data found, search for the most recent available data
if not sales_data:
logger.warning("No historical sales data found",
logger.info("No recent sales data found, expanding search to find most recent data",
tenant_id=tenant_id,
product_id=inventory_product_id)
# Search for available data in larger time windows (up to 2 years back)
search_windows = [365, 730] # 1 year, 2 years
for window_days in search_windows:
extended_start_date = forecast_date - pd.Timedelta(days=window_days)
logger.debug("Expanding search window for historical data",
start_date=extended_start_date.date(),
end_date=end_date.date(),
window_days=window_days)
sales_data = await self.sales_client.get_sales_data(
tenant_id=tenant_id,
start_date=extended_start_date.strftime("%Y-%m-%d"),
end_date=end_date.strftime("%Y-%m-%d"),
product_id=inventory_product_id,
aggregation="daily"
)
if sales_data:
logger.info("Found historical data in expanded search window",
tenant_id=tenant_id,
product_id=inventory_product_id,
data_start=sales_data[0].get('sale_date', 'unknown'),
data_end=sales_data[-1].get('sale_date', 'unknown'),
window_days=window_days)
break
if not sales_data:
logger.warning("No historical sales data found in any search window",
tenant_id=tenant_id,
product_id=inventory_product_id)
return pd.Series(dtype=float)
# Convert to pandas Series indexed by date
# Convert to pandas DataFrame and check if it has the expected structure
df = pd.DataFrame(sales_data)
df['sale_date'] = pd.to_datetime(df['sale_date'])
# Bail out early if the API returned no rows
if df.empty:
logger.warning("No historical sales data returned from API")
return pd.Series(dtype=float)
# Check for available columns and find date column
available_columns = list(df.columns)
logger.debug(f"Available sales data columns: {available_columns}")
# Check for alternative date column names
date_columns = ['sale_date', 'date', 'forecast_date', 'datetime', 'timestamp']
date_column = None
for col in date_columns:
if col in df.columns:
date_column = col
break
if date_column is None:
logger.error(f"Sales data missing expected date column. Available columns: {available_columns}")
logger.debug(f"Sample of sales data: {df.head()}")
return pd.Series(dtype=float)
df['sale_date'] = pd.to_datetime(df[date_column])
df = df.set_index('sale_date')
# Extract quantity column (could be 'quantity' or 'total_quantity')
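The widening retry above generalizes to a small loop. A sketch assuming the same sales_client.get_sales_data signature used in this method and pandas imported as pd (the helper name is made up):

async def fetch_with_expanding_windows(sales_client, tenant_id, product_id,
                                       forecast_date, end_date,
                                       windows=(90, 365, 730)):
    # Try progressively larger lookback windows until any data is found.
    for days in windows:
        start = (forecast_date - pd.Timedelta(days=days)).strftime("%Y-%m-%d")
        data = await sales_client.get_sales_data(
            tenant_id=tenant_id,
            start_date=start,
            end_date=end_date.strftime("%Y-%m-%d"),
            product_id=product_id,
            aggregation="daily",
        )
        if data:
            return data, days
    return [], None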
@@ -639,6 +795,10 @@ class PredictionService:
) -> Dict[str, float]:
"""
Calculate lagged, rolling, and trend features from historical sales data.
Enhanced to handle cases where recent data is not available by using
available historical data with appropriate temporal adjustments.
Now uses the shared feature calculator for consistency with the training service.
Args:
historical_sales: Series of sales quantities indexed by date
@@ -647,117 +807,26 @@ class PredictionService:
Returns:
Dictionary of calculated features
"""
features = {}
try:
if len(historical_sales) == 0:
logger.warning("No historical data available, using default values")
# Return all features with default values (0.0)
return {
# Lagged features
'lag_1_day': 0.0,
'lag_7_day': 0.0,
'lag_14_day': 0.0,
# Rolling statistics (7-day window)
'rolling_mean_7d': 0.0,
'rolling_std_7d': 0.0,
'rolling_max_7d': 0.0,
'rolling_min_7d': 0.0,
# Rolling statistics (14-day window)
'rolling_mean_14d': 0.0,
'rolling_std_14d': 0.0,
'rolling_max_14d': 0.0,
'rolling_min_14d': 0.0,
# Rolling statistics (30-day window)
'rolling_mean_30d': 0.0,
'rolling_std_30d': 0.0,
'rolling_max_30d': 0.0,
'rolling_min_30d': 0.0,
# Trend features
'days_since_start': 0,
'momentum_1_7': 0.0,
'trend_7_30': 0.0,
'velocity_week': 0.0,
}
# Use shared feature calculator for consistency
from shared.ml.feature_calculator import HistoricalFeatureCalculator
# Calculate lagged features
features['lag_1_day'] = float(historical_sales.iloc[-1]) if len(historical_sales) >= 1 else 0.0
features['lag_7_day'] = float(historical_sales.iloc[-7]) if len(historical_sales) >= 7 else features['lag_1_day']
features['lag_14_day'] = float(historical_sales.iloc[-14]) if len(historical_sales) >= 14 else features['lag_7_day']
calculator = HistoricalFeatureCalculator()
# Calculate rolling statistics (7-day window)
if len(historical_sales) >= 7:
window_7d = historical_sales.iloc[-7:]
features['rolling_mean_7d'] = float(window_7d.mean())
features['rolling_std_7d'] = float(window_7d.std())
features['rolling_max_7d'] = float(window_7d.max())
features['rolling_min_7d'] = float(window_7d.min())
else:
features['rolling_mean_7d'] = features['lag_1_day']
features['rolling_std_7d'] = 0.0
features['rolling_max_7d'] = features['lag_1_day']
features['rolling_min_7d'] = features['lag_1_day']
# Calculate all features using shared calculator
features = calculator.calculate_all_features(
sales_data=historical_sales,
reference_date=forecast_date,
mode='prediction'
)
# Calculate rolling statistics (14-day window)
if len(historical_sales) >= 14:
window_14d = historical_sales.iloc[-14:]
features['rolling_mean_14d'] = float(window_14d.mean())
features['rolling_std_14d'] = float(window_14d.std())
features['rolling_max_14d'] = float(window_14d.max())
features['rolling_min_14d'] = float(window_14d.min())
else:
features['rolling_mean_14d'] = features['rolling_mean_7d']
features['rolling_std_14d'] = features['rolling_std_7d']
features['rolling_max_14d'] = features['rolling_max_7d']
features['rolling_min_14d'] = features['rolling_min_7d']
# Calculate rolling statistics (30-day window)
if len(historical_sales) >= 30:
window_30d = historical_sales.iloc[-30:]
features['rolling_mean_30d'] = float(window_30d.mean())
features['rolling_std_30d'] = float(window_30d.std())
features['rolling_max_30d'] = float(window_30d.max())
features['rolling_min_30d'] = float(window_30d.min())
else:
features['rolling_mean_30d'] = features['rolling_mean_14d']
features['rolling_std_30d'] = features['rolling_std_14d']
features['rolling_max_30d'] = features['rolling_max_14d']
features['rolling_min_30d'] = features['rolling_min_14d']
# Calculate trend features
if len(historical_sales) > 0:
# Days since first sale
features['days_since_start'] = (forecast_date - historical_sales.index[0]).days
# Momentum (difference between recent lag_1_day and lag_7_day)
if len(historical_sales) >= 7:
features['momentum_1_7'] = features['lag_1_day'] - features['lag_7_day']
else:
features['momentum_1_7'] = 0.0
# Trend (difference between recent 7-day and 30-day averages)
if len(historical_sales) >= 30:
features['trend_7_30'] = features['rolling_mean_7d'] - features['rolling_mean_30d']
else:
features['trend_7_30'] = 0.0
# Velocity (rate of change over the last week)
if len(historical_sales) >= 7:
week_change = historical_sales.iloc[-1] - historical_sales.iloc[-7]
features['velocity_week'] = float(week_change / 7.0)
else:
features['velocity_week'] = 0.0
else:
features['days_since_start'] = 0
features['momentum_1_7'] = 0.0
features['trend_7_30'] = 0.0
features['velocity_week'] = 0.0
logger.debug("Historical features calculated",
lag_1_day=features['lag_1_day'],
rolling_mean_7d=features['rolling_mean_7d'],
rolling_mean_30d=features['rolling_mean_30d'],
momentum=features['momentum_1_7'])
logger.debug("Historical features calculated (using shared calculator)",
lag_1_day=features.get('lag_1_day', 0.0),
rolling_mean_7d=features.get('rolling_mean_7d', 0.0),
rolling_mean_30d=features.get('rolling_mean_30d', 0.0),
momentum=features.get('momentum_1_7', 0.0),
days_since_last_sale=features.get('days_since_last_sale', 0),
data_availability_score=features.get('historical_data_availability_score', 0.0))
return features
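Illustrative use of the shared calculator with the exact call shape shown above (the synthetic series and dates are made up):

import pandas as pd
from shared.ml.feature_calculator import HistoricalFeatureCalculator

sales = pd.Series(
    [12.0, 15.0, 9.0, 14.0],
    index=pd.to_datetime(["2025-11-08", "2025-11-09", "2025-11-10", "2025-11-11"]),
)
calculator = HistoricalFeatureCalculator()
features = calculator.calculate_all_features(
    sales_data=sales,
    reference_date=pd.Timestamp("2025-11-14"),
    mode="prediction",
)
print(features.get("lag_1_day"),
      features.get("historical_data_availability_score"))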
@@ -770,8 +839,9 @@ class PredictionService:
'rolling_mean_7d', 'rolling_std_7d', 'rolling_max_7d', 'rolling_min_7d',
'rolling_mean_14d', 'rolling_std_14d', 'rolling_max_14d', 'rolling_min_14d',
'rolling_mean_30d', 'rolling_std_30d', 'rolling_max_30d', 'rolling_min_30d',
'momentum_1_7', 'trend_7_30', 'velocity_week'
]} | {'days_since_start': 0}
'momentum_1_7', 'trend_7_30', 'velocity_week',
'days_since_last_sale', 'historical_data_availability_score'
]}
def _prepare_prophet_features(self, features: Dict[str, Any]) -> pd.DataFrame:
"""Convert features to Prophet-compatible DataFrame - COMPLETE FEATURE MATCHING"""
@@ -962,6 +1032,9 @@ class PredictionService:
'momentum_1_7': float(features.get('momentum_1_7', 0.0)),
'trend_7_30': float(features.get('trend_7_30', 0.0)),
'velocity_week': float(features.get('velocity_week', 0.0)),
# Data freshness metrics to help model understand data recency
'days_since_last_sale': int(features.get('days_since_last_sale', 0)),
'historical_data_availability_score': float(features.get('historical_data_availability_score', 0.0)),
}
# Calculate interaction features
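For the two new columns to influence predictions, the model must have been trained with matching regressors; Prophet ignores unknown columns at predict time. A hedged sketch of the training side (column names come from this commit; everything else is illustrative, and this diff does not show the training code):

from prophet import Prophet

model = Prophet()
# Regressors must be registered before fit(); names must match the columns
# produced by _prepare_prophet_features at prediction time.
model.add_regressor("days_since_last_sale")
model.add_regressor("historical_data_availability_score")
# model.fit(training_df)  # training_df needs ds, y, and both regressor columns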