Add POI feature and improve the overall backend implementation
@@ -116,20 +116,22 @@ class EnhancedBakeryDataProcessor:
                                      weather_data: pd.DataFrame,
                                      traffic_data: pd.DataFrame,
                                      inventory_product_id: str,
                                      poi_features: Dict[str, Any] = None,
                                      tenant_id: str = None,
                                      job_id: str = None,
                                      session=None) -> pd.DataFrame:
        """
        Prepare comprehensive training data for a specific product with repository logging.

        Args:
            sales_data: Historical sales data for the product
            weather_data: Weather data
            traffic_data: Traffic data
            inventory_product_id: Inventory product UUID for logging
            poi_features: POI features (location-based, static)
            tenant_id: Optional tenant ID for tracking
            job_id: Optional job ID for tracking

        Returns:
            DataFrame ready for Prophet training with 'ds' and 'y' columns plus features
        """
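Illustrative only (not part of this commit): a minimal call sketch for the signature above. Whether the method is async is an assumption here (its def line falls outside the hunk), and all argument values are placeholders.

train_df = await processor.prepare_training_data(
    sales_data=sales_df,
    weather_data=weather_df,
    traffic_data=traffic_df,
    inventory_product_id="123e4567-e89b-12d3-a456-426614174000",  # placeholder UUID
    poi_features={"poi_has_transit_nearby": 1},                   # placeholder POI feature
    tenant_id=None,
    job_id=None,
)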
@@ -250,6 +252,18 @@ class EnhancedBakeryDataProcessor:
                         inventory_product_id=inventory_product_id,
                         total_features=len(daily_sales.columns))

            logger.debug("Starting Step 8b: Add POI features",
                         inventory_product_id=inventory_product_id)
            # Step 8b: Add POI features (static, location-based)
            if poi_features:
                daily_sales = self._add_poi_features(daily_sales, poi_features)
                logger.debug("Step 8b completed: Add POI features",
                             inventory_product_id=inventory_product_id,
                             poi_feature_count=len(poi_features))
            else:
                logger.debug("Step 8b skipped: No POI features available",
                             inventory_product_id=inventory_product_id)

            logger.debug("Starting Step 9: Handle missing values",
                         inventory_product_id=inventory_product_id)
            # Step 9: Handle missing values
@@ -331,6 +345,7 @@ class EnhancedBakeryDataProcessor:
                                    future_dates: pd.DatetimeIndex,
                                    weather_forecast: pd.DataFrame = None,
                                    traffic_forecast: pd.DataFrame = None,
                                    poi_features: Dict[str, Any] = None,
                                    historical_data: pd.DataFrame = None) -> pd.DataFrame:
        """
        Create features for future predictions with proper date handling.
@@ -339,6 +354,7 @@ class EnhancedBakeryDataProcessor:
            future_dates: Future dates to predict
            weather_forecast: Weather forecast data
            traffic_forecast: Traffic forecast data
            poi_features: POI features (location-based, static)
            historical_data: Historical data for creating lagged and rolling features

        Returns:
@@ -390,6 +406,10 @@ class EnhancedBakeryDataProcessor:
            logger.warning("No historical data provided, lagged features will be NaN")
        future_df = self._add_advanced_features(future_df)

        # Add POI features (static, location-based)
        if poi_features:
            future_df = self._add_poi_features(future_df, poi_features)

        future_df = future_df.rename(columns={'date': 'ds'})

        # Handle missing values in future data
@@ -1171,7 +1191,42 @@ class EnhancedBakeryDataProcessor:
                df[col] = df[col].fillna(default_value)

        return df

    def _add_poi_features(self, df: pd.DataFrame, poi_features: Dict[str, Any]) -> pd.DataFrame:
        """
        Add POI features to the training dataframe.

        POI features are static (location-based, not time-varying),
        so they are broadcast to all rows in the dataframe.

        Args:
            df: Training dataframe
            poi_features: Dictionary of POI ML features

        Returns:
            Dataframe with POI features added as columns
        """
        if not poi_features:
            logger.warning("No POI features to add")
            return df

        logger.info(f"Adding {len(poi_features)} POI features to dataframe")

        # Add each POI feature as a column with a constant value
        for feature_name, feature_value in poi_features.items():
            # Convert booleans to int for ML compatibility
            if isinstance(feature_value, bool):
                feature_value = 1 if feature_value else 0
            df[feature_name] = feature_value

        logger.info(
            "POI features added successfully",
            feature_count=len(poi_features),
            feature_names=list(poi_features.keys())[:5]  # Log first 5 for brevity
        )

        return df

    def _prepare_prophet_format(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare data in Prophet format with enhanced validation"""
        prophet_df = df.copy()
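Illustrative only (not part of this commit): a small sketch of how _add_poi_features broadcasts static values, assuming an EnhancedBakeryDataProcessor instance and made-up feature names.

import pandas as pd

processor = EnhancedBakeryDataProcessor()  # assumed default constructor
poi_features = {"poi_has_transit_nearby": True, "poi_competitor_count_500m": 2}  # hypothetical keys
df = pd.DataFrame({"ds": pd.date_range("2024-01-01", periods=3), "y": [10, 12, 9]})
df = processor._add_poi_features(df, poi_features)
# Every row now carries the same constant columns:
#   poi_has_transit_nearby == 1 (bool converted to int), poi_competitor_count_500m == 2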
services/training/app/ml/poi_feature_integrator.py (new file, 204 lines)
@@ -0,0 +1,204 @@
"""
POI Feature Integrator

Integrates POI features into the ML training pipeline.
Fetches POI context from the External service and merges the features into training data.
"""

import httpx
from typing import Dict, Any, Optional, List
import structlog
import pandas as pd

logger = structlog.get_logger()


class POIFeatureIntegrator:
    """
    POI feature integration for ML training.

    Fetches POI context from the External service and adds features
    to training dataframes for location-based demand forecasting.
    """

    def __init__(self, external_service_url: str = "http://external-service:8000"):
        """
        Initialize the POI feature integrator.

        Args:
            external_service_url: Base URL for the external service
        """
        self.external_service_url = external_service_url.rstrip("/")
        self.poi_context_endpoint = f"{self.external_service_url}/poi-context"
    async def fetch_poi_features(
        self,
        tenant_id: str,
        latitude: float,
        longitude: float,
        force_refresh: bool = False
    ) -> Optional[Dict[str, Any]]:
        """
        Fetch POI features for a tenant location.

        First checks whether a POI context already exists; if not, triggers detection.

        Args:
            tenant_id: Tenant UUID
            latitude: Bakery latitude
            longitude: Bakery longitude
            force_refresh: Force re-detection

        Returns:
            Dictionary with POI features, or None if detection fails
        """
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                # Try to get the existing POI context first
                if not force_refresh:
                    try:
                        response = await client.get(
                            f"{self.poi_context_endpoint}/{tenant_id}"
                        )
                        response.raise_for_status()  # raise on 4xx/5xx so a missing context (404) is handled below
                        if response.status_code == 200:
                            data = response.json()
                            poi_context = data.get("poi_context", {})

                            # Check if stale
                            if not data.get("is_stale", False):
                                logger.info(
                                    "Using existing POI context",
                                    tenant_id=tenant_id
                                )
                                return poi_context.get("ml_features", {})
                            else:
                                logger.info(
                                    "POI context is stale, refreshing",
                                    tenant_id=tenant_id
                                )
                                force_refresh = True
                    except httpx.HTTPStatusError as e:
                        if e.response.status_code != 404:
                            raise
                        logger.info(
                            "No existing POI context, will detect",
                            tenant_id=tenant_id
                        )

                # Detect or refresh POIs
                logger.info(
                    "Detecting POIs for tenant",
                    tenant_id=tenant_id,
                    location=(latitude, longitude)
                )

                response = await client.post(
                    f"{self.poi_context_endpoint}/{tenant_id}/detect",
                    params={
                        "latitude": latitude,
                        "longitude": longitude,
                        "force_refresh": force_refresh
                    }
                )
                response.raise_for_status()

                result = response.json()
                poi_context = result.get("poi_context", {})
                ml_features = poi_context.get("ml_features", {})

                logger.info(
                    "POI detection completed",
                    tenant_id=tenant_id,
                    total_pois=poi_context.get("total_pois_detected", 0),
                    feature_count=len(ml_features)
                )

                return ml_features

        except httpx.HTTPError as e:
            logger.error(
                "Failed to fetch POI features",
                tenant_id=tenant_id,
                error=str(e),
                exc_info=True
            )
            return None
        except Exception as e:
            logger.error(
                "Unexpected error fetching POI features",
                tenant_id=tenant_id,
                error=str(e),
                exc_info=True
            )
            return None
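Illustrative only (not part of this commit): the response shape fetch_poi_features relies on. The keys are exactly those the method reads; the concrete values and feature names are made up. The POST .../detect response is assumed to reuse the same "poi_context" object without "is_stale".

example_get_response = {
    "poi_context": {
        "total_pois_detected": 12,
        "ml_features": {
            "poi_has_transit_nearby": True,     # hypothetical feature name
            "poi_competitor_count_500m": 2,     # hypothetical feature name
        },
    },
    "is_stale": False,  # checked before reusing an existing context
}
# fetch_poi_features ultimately returns example_get_response["poi_context"]["ml_features"]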
    def add_poi_features_to_dataframe(
        self,
        df: pd.DataFrame,
        poi_features: Dict[str, Any]
    ) -> pd.DataFrame:
        """
        Add POI features to a training dataframe.

        POI features are static (they don't vary by date), so they are
        broadcast to all rows in the dataframe.

        Args:
            df: Training dataframe
            poi_features: Dictionary of POI ML features

        Returns:
            Dataframe with POI features added as columns
        """
        if not poi_features:
            logger.warning("No POI features to add")
            return df

        logger.info(
            "Adding POI features to dataframe",
            feature_count=len(poi_features),
            dataframe_rows=len(df)
        )

        # Add each POI feature as a column with a constant value
        for feature_name, feature_value in poi_features.items():
            df[feature_name] = feature_value

        logger.info(
            "POI features added successfully",
            new_columns=list(poi_features.keys())
        )

        return df
    def get_poi_feature_names(self, poi_features: Dict[str, Any]) -> List[str]:
        """
        Get the list of POI feature names for model registration.

        Args:
            poi_features: Dictionary of POI ML features

        Returns:
            List of feature names
        """
        return list(poi_features.keys()) if poi_features else []

    async def check_poi_service_health(self) -> bool:
        """
        Check whether the POI service is accessible.

        Returns:
            True if the service is healthy, False otherwise
        """
        try:
            async with httpx.AsyncClient(timeout=5.0) as client:
                response = await client.get(
                    f"{self.poi_context_endpoint}/health"
                )
                return response.status_code == 200
        except Exception as e:
            logger.error(
                "POI service health check failed",
                error=str(e)
            )
            return False
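Illustrative only (not part of this commit): a sketch of using POIFeatureIntegrator end to end. The tenant ID and coordinates are placeholders, and the enrichment helper is hypothetical.

import asyncio

async def enrich_training_frame(df):
    integrator = POIFeatureIntegrator()  # defaults to http://external-service:8000
    if not await integrator.check_poi_service_health():
        return df  # train without POI features if the service is unreachable
    features = await integrator.fetch_poi_features(
        tenant_id="00000000-0000-0000-0000-000000000000",  # placeholder tenant UUID
        latitude=48.137,    # placeholder bakery coordinates
        longitude=11.575,
    )
    if features:
        df = integrator.add_poi_features_to_dataframe(df, features)
        regressor_names = integrator.get_poi_feature_names(features)  # e.g. to register Prophet regressors
    return df

# asyncio.run(enrich_training_frame(train_df))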
@@ -210,9 +210,23 @@ class EnhancedBakeryMLTrainer:
        # Process data for each product using enhanced processor
        logger.info("Processing data using enhanced processor")
        processed_data = await self._process_all_products_enhanced(
-           sales_df, weather_df, traffic_df, products, tenant_id, job_id, session
+           sales_df, weather_df, traffic_df, products, tenant_id, job_id, training_dataset.poi_features, session
        )

        # Validate that we have processed data
        if not processed_data or len(processed_data) == 0:
            error_msg = f"No products could be processed successfully. Found {len(products)} products in sales data but all failed during processing."
            logger.error("Training aborted - no processed data",
                         tenant_id=tenant_id,
                         job_id=job_id,
                         products_found=len(products),
                         products_processed=0)
            raise ValueError(error_msg)

        logger.info(f"Successfully processed {len(processed_data)} out of {len(products)} products",
                    products_processed=len(processed_data),
                    products_found=len(products))

        # Categorize all products for category-specific forecasting
        logger.info("Categorizing products for optimized forecasting")
        product_categories = await self._categorize_all_products(
@@ -491,6 +505,7 @@ class EnhancedBakeryMLTrainer:
                                              products: List[str],
                                              tenant_id: str,
                                              job_id: str,
                                              poi_features: Dict[str, Any],
                                              session=None) -> Dict[str, pd.DataFrame]:
        """Process data for all products using enhanced processor with repository tracking"""
        processed_data = {}
@@ -515,6 +530,7 @@ class EnhancedBakeryMLTrainer:
                    weather_data=weather_df,
                    traffic_data=traffic_df,
                    inventory_product_id=inventory_product_id,
                    poi_features=poi_features,  # POI features for location-based forecasting
                    tenant_id=tenant_id,
                    job_id=job_id,
                    session=session  # Pass the session to avoid creating new ones
@@ -1132,6 +1148,7 @@ class EnhancedBakeryMLTrainer:
                    weather_data=test_weather_df,
                    traffic_data=test_traffic_df,
                    inventory_product_id=inventory_product_id,
                    poi_features=test_dataset.poi_features,  # POI features for location-based forecasting
                    tenant_id=tenant_id
                )
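Illustrative only (not part of this commit): one way training_dataset.poi_features could be populated before _process_all_products_enhanced runs. The attribute name comes from the diff above; the integrator call and the bakery_location object are assumptions.

integrator = POIFeatureIntegrator()
poi_features = await integrator.fetch_poi_features(
    tenant_id=tenant_id,
    latitude=bakery_location.latitude,    # assumed tenant location object
    longitude=bakery_location.longitude,
)
training_dataset.poi_features = poi_features or {}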