Improve the UI and training

This commit is contained in:
Urtzi Alfaro
2025-11-15 15:20:10 +01:00
parent c349b845a6
commit 843cd2bf5c
19 changed files with 2073 additions and 233 deletions


@@ -158,6 +158,56 @@ async def start_training_job(
# Continue with job creation but log the error
try:
# CRITICAL FIX: Check for existing running jobs before starting new one
# This prevents duplicate tenant-level training jobs
async with enhanced_training_service.database_manager.get_session() as check_session:
from app.repositories.training_log_repository import TrainingLogRepository
log_repo = TrainingLogRepository(check_session)
# Check for active jobs (running or pending)
active_jobs = await log_repo.get_active_jobs(tenant_id=tenant_id)
pending_jobs = await log_repo.get_logs_by_tenant(
tenant_id=tenant_id,
status="pending",
limit=10
)
all_active = active_jobs + pending_jobs
if all_active:
# Training job already in progress, return existing job info
existing_job = all_active[0]
logger.info("Training job already in progress, returning existing job",
existing_job_id=existing_job.job_id,
tenant_id=tenant_id,
status=existing_job.status)
return TrainingJobResponse(
job_id=existing_job.job_id,
tenant_id=tenant_id,
status=existing_job.status,
message=f"Training job already in progress (started {existing_job.created_at.isoformat() if existing_job.created_at else 'recently'})",
created_at=existing_job.created_at or datetime.now(timezone.utc),
estimated_duration_minutes=existing_job.config.get("estimated_duration_minutes", 15) if existing_job.config else 15,
training_results={
"total_products": 0,
"successful_trainings": 0,
"failed_trainings": 0,
"products": [],
"overall_training_time_seconds": 0.0
},
data_summary=None,
completed_at=None,
error_details=None,
processing_metadata={
"background_task": True,
"async_execution": True,
"existing_job": True,
"deduplication": True
}
)
# No existing job, proceed with creating new one
# Generate enhanced job ID
job_id = f"enhanced_training_{tenant_id}_{uuid.uuid4().hex[:8]}"
@@ -407,6 +457,7 @@ async def start_single_product_training(
request: SingleProductTrainingRequest,
tenant_id: str = Path(..., description="Tenant ID"),
inventory_product_id: str = Path(..., description="Inventory product UUID"),
background_tasks: BackgroundTasks = BackgroundTasks(),
request_obj: Request = None,
current_user: Dict[str, Any] = Depends(get_current_user_dep),
enhanced_training_service: EnhancedTrainingService = Depends(get_enhanced_training_service)
@@ -421,6 +472,7 @@ async def start_single_product_training(
- Enhanced error handling and validation
- Metrics tracking
- Transactional operations
- Background execution to prevent blocking
"""
metrics = get_metrics_collector(request_obj)
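The metrics hooks used throughout this file (get_metrics_collector plus increment_counter, always guarded by "if metrics:") are likewise not defined in this commit. A minimal sketch of an interface that would satisfy those call sites; every name below is an assumption:

# Hypothetical sketch -- the real collector behind get_metrics_collector is not in this diff.
from collections import Counter
from typing import Optional

from fastapi import Request


class MetricsCollector:
    def __init__(self) -> None:
        self._counters: Counter = Counter()

    def increment_counter(self, name: str, value: int = 1) -> None:
        self._counters[name] += value


def get_metrics_collector(request: Optional[Request]) -> Optional[MetricsCollector]:
    # Returning None when there is no request context is why every caller
    # guards with "if metrics:" before incrementing.
    if request is None:
        return None
    return getattr(request.app.state, "metrics", None)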
@@ -429,6 +481,53 @@ async def start_single_product_training(
inventory_product_id=inventory_product_id,
tenant_id=tenant_id)
# CRITICAL FIX: Check if this product is currently being trained
# This prevents duplicate training from rapid-click scenarios
async with enhanced_training_service.database_manager.get_session() as check_session:
from app.repositories.training_log_repository import TrainingLogRepository
log_repo = TrainingLogRepository(check_session)
# Check for active jobs for this specific product
active_jobs = await log_repo.get_active_jobs(tenant_id=tenant_id)
pending_jobs = await log_repo.get_logs_by_tenant(
tenant_id=tenant_id,
status="pending",
limit=20
)
all_active = active_jobs + pending_jobs
# Filter for jobs that include this specific product
product_jobs = [
job for job in all_active
if job.config and (
# Single product job for this product
job.config.get("product_id") == inventory_product_id or
# Tenant-wide job that would include this product
job.config.get("job_type") == "tenant_training"
)
]
if product_jobs:
existing_job = product_jobs[0]
logger.warning("Product training already in progress, rejecting duplicate request",
existing_job_id=existing_job.job_id,
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
status=existing_job.status)
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail={
"error": "Product training already in progress",
"message": f"Product {inventory_product_id} is currently being trained in job {existing_job.job_id}",
"existing_job_id": existing_job.job_id,
"status": existing_job.status,
"started_at": existing_job.created_at.isoformat() if existing_job.created_at else None
}
)
# No existing job, proceed with training
# Record metrics
if metrics:
metrics.increment_counter("enhanced_single_product_training_total")
@@ -436,22 +535,60 @@ async def start_single_product_training(
# Generate enhanced job ID
job_id = f"enhanced_single_{tenant_id}_{inventory_product_id}_{uuid.uuid4().hex[:8]}"
# CRITICAL FIX: Add initial training log entry
await enhanced_training_service._update_job_status_repository(
job_id=job_id,
status="pending",
progress=0,
current_step="Initializing single product training",
tenant_id=tenant_id
)
# Add enhanced background task for single product training
background_tasks.add_task(
execute_single_product_training_background,
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
job_id=job_id,
bakery_location=request.bakery_location or (40.4168, -3.7038),
database_manager=enhanced_training_service.database_manager
)
if metrics:
metrics.increment_counter("enhanced_single_product_training_success_total")
# Return immediate response with job info
response_data = {
"job_id": job_id,
"tenant_id": tenant_id,
"status": "pending",
"message": "Enhanced single product training started successfully",
"created_at": datetime.now(timezone.utc),
"estimated_duration_minutes": 15, # Default estimate for single product
"training_results": {
"total_products": 1,
"successful_trainings": 0,
"failed_trainings": 0,
"products": [],
"overall_training_time_seconds": 0.0
},
"data_summary": None,
"completed_at": None,
"error_details": None,
"processing_metadata": {
"background_task": True,
"async_execution": True,
"enhanced_features": True,
"repository_pattern": True,
"dependency_injection": True
}
}
logger.info("Enhanced single product training completed",
logger.info("Enhanced single product training queued successfully",
inventory_product_id=inventory_product_id,
job_id=job_id)
if metrics:
metrics.increment_counter("enhanced_single_product_training_queued_total")
return TrainingJobResponse(**response_data)
except ValueError as e:
if metrics:
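The reason the endpoint can return a pending TrainingJobResponse immediately is FastAPI's BackgroundTasks: callables registered with add_task run only after the response has been sent. A stripped-down illustration of the same queue-then-respond pattern; the names below are illustrative, not taken from this commit:

# Standalone illustration of the queue-then-respond pattern; not part of the real service.
import asyncio

from fastapi import BackgroundTasks, FastAPI

app = FastAPI()


async def run_training(job_id: str) -> None:
    await asyncio.sleep(5)  # stand-in for the actual training work
    print(f"job {job_id} finished")


@app.post("/demo/train")
async def demo_train(background_tasks: BackgroundTasks):
    job_id = "demo_job"
    # add_task only schedules run_training; it executes after this response is returned.
    background_tasks.add_task(run_training, job_id)
    return {"job_id": job_id, "status": "pending"}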
@@ -475,6 +612,74 @@ async def start_single_product_training(
)
async def execute_single_product_training_background(
tenant_id: str,
inventory_product_id: str,
job_id: str,
bakery_location: tuple,
database_manager
):
"""
Enhanced background task that executes single product training using the repository pattern.
Uses a separate service instance to avoid session conflicts.
"""
logger.info("Enhanced background single product training started",
job_id=job_id,
tenant_id=tenant_id,
inventory_product_id=inventory_product_id)
# Create a new service instance with a fresh database session to avoid conflicts
from app.services.training_service import EnhancedTrainingService
fresh_training_service = EnhancedTrainingService(database_manager)
try:
# Update job status to running
await fresh_training_service._update_job_status_repository(
job_id=job_id,
status="running",
progress=0,
current_step="Starting single product training",
tenant_id=tenant_id
)
# Execute the enhanced single product training with repository pattern
result = await fresh_training_service.start_single_product_training(
tenant_id=tenant_id,
inventory_product_id=inventory_product_id,
job_id=job_id,
bakery_location=bakery_location
)
logger.info("Enhanced background single product training completed successfully",
job_id=job_id,
inventory_product_id=inventory_product_id)
except Exception as training_error:
logger.error("Enhanced single product training failed",
job_id=job_id,
inventory_product_id=inventory_product_id,
error=str(training_error))
try:
await fresh_training_service._update_job_status_repository(
job_id=job_id,
status="failed",
progress=0,
current_step="Single product training failed",
error_message=str(training_error),
tenant_id=tenant_id
)
except Exception as status_error:
logger.error("Failed to update job status after training error",
job_id=job_id,
status_error=str(status_error))
finally:
logger.info("Enhanced background single product training cleanup completed",
job_id=job_id,
inventory_product_id=inventory_product_id)
@router.get("/health")
async def health_check():
"""Health check endpoint for the training operations"""