Improve the event messaging for the training service

This commit is contained in:
Urtzi Alfaro
2025-07-30 21:21:02 +02:00
parent 5e3fbc5493
commit 923b2d48d2
3 changed files with 428 additions and 79 deletions

View File

@@ -16,7 +16,13 @@ import pandas as pd
from app.services.data_client import DataClient
from app.services.date_alignment_service import DateAlignmentService, DateRange, DataSourceType, AlignedDateRange
from app.services.messaging import publish_job_progress, publish_job_failed
from app.services.messaging import (
publish_job_progress,
publish_data_validation_started,
publish_data_validation_completed,
publish_job_step_completed,
publish_job_failed
)
logger = logging.getLogger(__name__)
@@ -69,16 +75,19 @@ class TrainingDataOrchestrator:
try:
#publish_job_progress(job_id, tenant_id, 5, "Extraer datos de venta")
await publish_job_progress(job_id, tenant_id, 5, "Extrayendo datos de ventas",
step_details="Conectando con servicio de datos")
sales_data = await self.data_client.fetch_sales_data(tenant_id)
# Step 1: Extract and validate sales data date range
#publish_job_progress(job_id, tenant_id, 10, "Extraer y validar las fechas de de los datos de venta")
await publish_job_progress(job_id, tenant_id, 10, "Validando fechas de datos de venta",
step_details="Aplicando restricciones de fuentes de datos")
sales_date_range = self._extract_sales_date_range(sales_data)
logger.info(f"Sales data range detected: {sales_date_range.start} to {sales_date_range.end}")
# Step 2: Apply date alignment across all data sources
#publish_job_progress(job_id, tenant_id, 15, "Aplicar la alineación de fechas en todas las fuentes de datos")
await publish_job_progress(job_id, tenant_id, 15, "Alinear el rango de fechas",
step_details="Aplicar la alineación de fechas en todas las fuentes de datos")
aligned_range = self.date_alignment_service.validate_and_align_dates(
user_sales_range=sales_date_range,
requested_start=requested_start,
@@ -90,18 +99,21 @@ class TrainingDataOrchestrator:
logger.info(f"Applied constraints: {aligned_range.constraints}")
# Step 3: Filter sales data to aligned date range
#publish_job_progress(job_id, tenant_id, 20, "Aplicar la alineación de fechas en todas las fuentes de datos")
await publish_job_progress(job_id, tenant_id, 20, "Alinear el rango de las ventas",
step_details="Aplicar la alineación de fechas de las ventas")
filtered_sales = self._filter_sales_data(sales_data, aligned_range)
# Step 4: Collect external data sources concurrently
logger.info("Collecting external data sources...")
#publish_job_progress(job_id, tenant_id, 25, "Recopilación de fuentes de datos externas")
await publish_job_progress(job_id, tenant_id, 25, "Recopilación de fuentes de datos externas",
step_details="Recopilación de fuentes de datos externas")
weather_data, traffic_data = await self._collect_external_data(
aligned_range, bakery_location, tenant_id
)
# Step 5: Validate data quality
#publish_job_progress(job_id, tenant_id, 30, "Validando la calidad de los datos")
await publish_job_progress(job_id, tenant_id, 30, "Validando la calidad de los datos",
step_details="Validando la calidad de los datos")
data_quality_results = self._validate_data_sources(
filtered_sales, weather_data, traffic_data, aligned_range
)
@@ -128,7 +140,8 @@ class TrainingDataOrchestrator:
)
# Step 7: Final validation
#publish_job_progress(job_id, tenant_id, 35, "Validancion final de los datos")
await publish_job_progress(job_id, tenant_id, 35, "Validancion final de los datos",
step_details="Validancion final de los datos")
final_validation = self.validate_training_data_quality(training_dataset)
training_dataset.metadata["final_validation"] = final_validation
@@ -141,7 +154,7 @@ class TrainingDataOrchestrator:
return training_dataset
except Exception as e:
#publish_job_failed(job_id, tenant_id, str(e))
publish_job_failed(job_id, tenant_id, str(e))
logger.error(f"Training data preparation failed: {str(e)}")
raise ValueError(f"Failed to prepare training data: {str(e)}")
@@ -546,6 +559,7 @@ class TrainingDataOrchestrator:
return synthetic_data
def validate_training_data_quality(self, dataset: TrainingDataSet) -> Dict[str, Any]:
"""Enhanced validation of training data quality"""
validation_results = {
"is_valid": True,