Fix new services implementation 5

This commit is contained in:
Urtzi Alfaro
2025-08-15 17:53:59 +02:00
parent 03b4d4185d
commit f7de9115d1
43 changed files with 1714 additions and 891 deletions

View File

@@ -574,13 +574,14 @@ class TrainingDataOrchestrator:
if city_count >= 1: # At least some city awareness
city_aware_records += 1
# Record is valid if it has basic requirements
if record_score >= 2:
# Record is valid if it has basic requirements (date + any traffic field)
# Lowered requirement from >= 2 to >= 1 to accept records with just date or traffic data
if record_score >= 1:
valid_records += 1
total_records = len(traffic_data)
validity_threshold = 0.3
enhancement_threshold = 0.2 # Lower threshold for enhanced features
validity_threshold = 0.1 # Reduced from 0.3 to 0.1 - accept if 10% of records are valid
enhancement_threshold = 0.1 # Reduced threshold for enhanced features
basic_validity = (valid_records / total_records) >= validity_threshold
has_enhancements = (enhanced_records / total_records) >= enhancement_threshold

View File

@@ -141,6 +141,30 @@ class EnhancedTrainingService:
logger.error("Training aborted - no sales data", tenant_id=tenant_id, job_id=job_id)
raise ValueError(error_msg)
# Debug: Analyze the sales data structure to understand product distribution
sales_df_debug = pd.DataFrame(sales_data)
if 'inventory_product_id' in sales_df_debug.columns:
unique_products_found = sales_df_debug['inventory_product_id'].unique()
product_counts = sales_df_debug['inventory_product_id'].value_counts().to_dict()
logger.info("Pre-flight sales data analysis",
tenant_id=tenant_id,
job_id=job_id,
total_sales_records=len(sales_data),
unique_products_count=len(unique_products_found),
unique_products=unique_products_found.tolist(),
records_per_product=product_counts)
if len(unique_products_found) == 1:
logger.warning("POTENTIAL ISSUE: Only ONE unique product found in all sales data",
tenant_id=tenant_id,
single_product=unique_products_found[0],
record_count=len(sales_data))
else:
logger.warning("No 'inventory_product_id' column found in sales data",
tenant_id=tenant_id,
columns=list(sales_df_debug.columns))
logger.info(f"Pre-flight check passed: {len(sales_data)} sales records found",
tenant_id=tenant_id, job_id=job_id)
@@ -536,18 +560,69 @@ class EnhancedTrainingService:
progress: int = None,
current_step: str = None,
error_message: str = None,
results: Dict = None):
results: Dict = None,
tenant_id: str = None):
"""Update job status using repository pattern"""
try:
async with self.database_manager.get_session() as session:
await self._init_repositories(session)
await self.training_log_repo.update_log_progress(
job_id=job_id,
progress=progress,
current_step=current_step,
status=status
)
# Check if log exists, create if not
existing_log = await self.training_log_repo.get_log_by_job_id(job_id)
if not existing_log:
# Create initial log entry
if not tenant_id:
# Extract tenant_id from job_id if not provided
# Format: enhanced_training_{tenant_id}_{job_suffix}
try:
parts = job_id.split('_')
if len(parts) >= 3 and parts[0] == 'enhanced' and parts[1] == 'training':
tenant_id = parts[2]
except Exception:
logger.warning(f"Could not extract tenant_id from job_id {job_id}")
if tenant_id:
log_data = {
"job_id": job_id,
"tenant_id": tenant_id,
"status": status or "pending",
"progress": progress or 0,
"current_step": current_step or "initializing",
"start_time": datetime.utcnow()
}
if error_message:
log_data["error_message"] = error_message
if results:
log_data["results"] = results
await self.training_log_repo.create_training_log(log_data)
logger.info("Created initial training log", job_id=job_id, tenant_id=tenant_id)
else:
logger.error("Cannot create training log without tenant_id", job_id=job_id)
return
else:
# Update existing log
await self.training_log_repo.update_log_progress(
job_id=job_id,
progress=progress,
current_step=current_step,
status=status
)
# Update additional fields if provided
if error_message or results:
update_data = {}
if error_message:
update_data["error_message"] = error_message
if results:
update_data["results"] = results
if status in ["completed", "failed"]:
update_data["end_time"] = datetime.utcnow()
if update_data:
await self.training_log_repo.update(existing_log.id, update_data)
except Exception as e:
logger.error("Failed to update job status using repository",