Fix issues: move sales-data pre-flight validation into the training orchestrator, switch traffic collection to the cache-first unified fetch, and drop the dedicated stored-traffic retraining method

Urtzi Alfaro
2025-08-17 10:28:58 +02:00
parent 8914786973
commit 109961ef6e
10 changed files with 450 additions and 176 deletions


@@ -73,14 +73,47 @@ class TrainingDataOrchestrator:
logger.info(f"Starting comprehensive training data preparation for tenant {tenant_id}, job {job_id}")
try:
# Step 1: Fetch and validate sales data (unified approach)
sales_data = await self.data_client.fetch_sales_data(tenant_id, fetch_all=True)
sales_data = await self.data_client.fetch_sales_data(tenant_id)
# Pre-flight validation moved here to eliminate duplicate fetching
if not sales_data or len(sales_data) == 0:
error_msg = f"No sales data available for tenant {tenant_id}. Please import sales data before starting training."
logger.error("Training aborted - no sales data", tenant_id=tenant_id, job_id=job_id)
raise ValueError(error_msg)
# Step 1: Extract and validate sales data date range
# Debug: Analyze the sales data structure to understand product distribution
sales_df_debug = pd.DataFrame(sales_data)
if 'inventory_product_id' in sales_df_debug.columns:
unique_products_found = sales_df_debug['inventory_product_id'].unique()
product_counts = sales_df_debug['inventory_product_id'].value_counts().to_dict()
logger.info("Sales data analysis (moved from pre-flight)",
tenant_id=tenant_id,
job_id=job_id,
total_sales_records=len(sales_data),
unique_products_count=len(unique_products_found),
unique_products=unique_products_found.tolist(),
records_per_product=product_counts)
if len(unique_products_found) == 1:
logger.warning("POTENTIAL ISSUE: Only ONE unique product found in all sales data",
tenant_id=tenant_id,
single_product=unique_products_found[0],
record_count=len(sales_data))
else:
logger.warning("No 'inventory_product_id' column found in sales data",
tenant_id=tenant_id,
columns=list(sales_df_debug.columns))
logger.info(f"Sales data validation passed: {len(sales_data)} sales records found",
tenant_id=tenant_id, job_id=job_id)
# Step 2: Extract and validate sales data date range
sales_date_range = self._extract_sales_date_range(sales_data)
logger.info(f"Sales data range detected: {sales_date_range.start} to {sales_date_range.end}")
# Step 2: Apply date alignment across all data sources
# Step 3: Apply date alignment across all data sources
aligned_range = self.date_alignment_service.validate_and_align_dates(
user_sales_range=sales_date_range,
requested_start=requested_start,
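
The helper _extract_sales_date_range is called above but not shown in this commit. A minimal sketch of what it plausibly does, assuming each sales record carries an ISO-format "date" field; only the .start/.end attributes of the returned range are confirmed by the logging in this hunk:

from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List

@dataclass
class DateRange:
    start: datetime
    end: datetime

def extract_sales_date_range(sales_data: List[Dict[str, Any]]) -> DateRange:
    # Parse each record's timestamp; the "date" field name is an assumption
    dates = [datetime.fromisoformat(record["date"]) for record in sales_data]
    return DateRange(start=min(dates), end=max(dates))
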
@@ -91,21 +124,21 @@ class TrainingDataOrchestrator:
             if aligned_range.constraints:
                 logger.info(f"Applied constraints: {aligned_range.constraints}")
-            # Step 3: Filter sales data to aligned date range
+            # Step 4: Filter sales data to aligned date range
             filtered_sales = self._filter_sales_data(sales_data, aligned_range)
-            # Step 4: Collect external data sources concurrently
+            # Step 5: Collect external data sources concurrently
             logger.info("Collecting external data sources...")
             weather_data, traffic_data = await self._collect_external_data(
                 aligned_range, bakery_location, tenant_id
             )
-            # Step 5: Validate data quality
+            # Step 6: Validate data quality
             data_quality_results = self._validate_data_sources(
                 filtered_sales, weather_data, traffic_data, aligned_range
             )
-            # Step 6: Create comprehensive training dataset
+            # Step 7: Create comprehensive training dataset
             training_dataset = TrainingDataSet(
                 sales_data=filtered_sales,
                 weather_data=weather_data,
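
The TrainingDataSet constructor call is truncated by the hunk boundary above. A sketch of the container's likely shape: sales_data, weather_data, and metadata are visible in this diff, while traffic_data is an assumption inferred from Step 5:

from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class TrainingDataSet:
    sales_data: List[Dict[str, Any]]
    weather_data: List[Dict[str, Any]]
    traffic_data: List[Dict[str, Any]]          # assumed from Step 5
    metadata: Dict[str, Any] = field(default_factory=dict)
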
@@ -126,7 +159,7 @@ class TrainingDataOrchestrator:
                 }
             )
-            # Step 7: Final validation
+            # Step 8: Final validation
             final_validation = self.validate_training_data_quality(training_dataset)
             training_dataset.metadata["final_validation"] = final_validation
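
validate_training_data_quality is likewise not part of this diff. A hedged sketch of such a final check, reusing the TrainingDataSet sketch above; the record-count threshold is purely illustrative:

from typing import Any, Dict

def validate_training_data_quality(dataset: TrainingDataSet) -> Dict[str, Any]:
    # Flag obviously unusable datasets before training starts
    issues = []
    if len(dataset.sales_data) < 30:              # threshold is illustrative
        issues.append("fewer than 30 sales records")
    if not dataset.weather_data:
        issues.append("no weather data collected")
    return {"passed": not issues, "issues": issues}
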
@@ -375,14 +408,16 @@ class TrainingDataOrchestrator:
         start_date_str = aligned_range.start.isoformat()
         end_date_str = aligned_range.end.isoformat()
-        # Enhanced: Fetch traffic data using new abstracted service
+        # Enhanced: Fetch traffic data using unified cache-first method
         # This automatically detects the appropriate city and uses the right client
-        traffic_data = await self.data_client.fetch_traffic_data(
+        traffic_data = await self.data_client.fetch_traffic_data_unified(
             tenant_id=tenant_id,
             start_date=start_date_str,
             end_date=end_date_str,
             latitude=lat,
-            longitude=lon)
+            longitude=lon,
+            force_refresh=False  # Use cache-first strategy
+        )
         # Enhanced validation including pedestrian inference data
         if self._validate_traffic_data_enhanced(traffic_data):
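
The body of fetch_traffic_data_unified is not included in this commit. A generic cache-first sketch of the force_refresh contract, with hypothetical cache_lookup/cache_store/fetch_from_api helpers standing in for whatever the client actually uses:

from typing import Any, Dict, List

async def fetch_traffic_data_unified(client, tenant_id: str, start_date: str,
                                     end_date: str, latitude: float,
                                     longitude: float,
                                     force_refresh: bool = False) -> List[Dict[str, Any]]:
    # Cache-first: serve stored records unless the caller forces a refresh
    if not force_refresh:
        cached = await client.cache_lookup(tenant_id, start_date, end_date,
                                           latitude, longitude)  # hypothetical helper
        if cached:
            return cached
    fresh = await client.fetch_from_api(tenant_id, start_date, end_date,
                                        latitude, longitude)     # hypothetical helper
    await client.cache_store(tenant_id, fresh)                   # hypothetical helper
    return fresh
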
@@ -461,54 +496,6 @@ class TrainingDataOrchestrator:
         minimal_traffic_data = [{"city": "madrid", "source": "legacy"}] * min(record_count, 1)
         self._log_enhanced_traffic_data_storage(lat, lon, aligned_range, record_count, minimal_traffic_data)
-    async def retrieve_stored_traffic_for_retraining(
-        self,
-        bakery_location: Tuple[float, float],
-        start_date: datetime,
-        end_date: datetime,
-        tenant_id: str
-    ) -> List[Dict[str, Any]]:
-        """
-        Retrieve previously stored traffic data for model re-training
-        This method specifically accesses the stored traffic data without making new API calls
-        """
-        lat, lon = bakery_location
-        try:
-            # Use the dedicated stored traffic data method for training
-            stored_traffic_data = await self.data_client.fetch_stored_traffic_data_for_training(
-                tenant_id=tenant_id,
-                start_date=start_date.isoformat(),
-                end_date=end_date.isoformat(),
-                latitude=lat,
-                longitude=lon
-            )
-            if stored_traffic_data:
-                logger.info(
-                    f"Retrieved {len(stored_traffic_data)} stored traffic records for re-training",
-                    location=f"{lat:.4f},{lon:.4f}",
-                    date_range=f"{start_date.isoformat()} to {end_date.isoformat()}",
-                    tenant_id=tenant_id
-                )
-                return stored_traffic_data
-            else:
-                logger.warning(
-                    "No stored traffic data found for re-training",
-                    location=f"{lat:.4f},{lon:.4f}",
-                    date_range=f"{start_date.isoformat()} to {end_date.isoformat()}"
-                )
-                return []
-        except Exception as e:
-            logger.error(
-                f"Failed to retrieve stored traffic data for re-training: {e}",
-                location=f"{lat:.4f},{lon:.4f}",
-                tenant_id=tenant_id
-            )
-            return []
     def _validate_weather_data(self, weather_data: List[Dict[str, Any]]) -> bool:
         """Validate weather data quality"""
         if not weather_data:
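
The removed retrieve_stored_traffic_for_retraining method appears superseded by the cache-first path above: with force_refresh=False, the unified fetch should serve stored records without new API calls. A sketch of the equivalent retraining call under that assumption, using the signature as invoked in this commit:

from datetime import datetime
from typing import Any, Dict, List

async def stored_traffic_for_retraining(data_client, tenant_id: str,
                                        lat: float, lon: float,
                                        start: datetime, end: datetime) -> List[Dict[str, Any]]:
    # force_refresh=False prefers cached/stored records, matching the old
    # method's "no new API calls" intent (assumption, not confirmed here)
    return await data_client.fetch_traffic_data_unified(
        tenant_id=tenant_id,
        start_date=start.isoformat(),
        end_date=end.isoformat(),
        latitude=lat,
        longitude=lon,
        force_refresh=False,
    )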