Fix multiple critical bugs in onboarding training step
This commit addresses all identified bugs and issues in the training code path: ## Critical Fixes: - Add get_start_time() method to TrainingLogRepository and fix non-existent method call - Remove duplicate training.started event from API endpoint (trainer publishes the accurate one) - Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone" ## High Priority Fixes: - Fix division by zero risk in time estimation with double-check and max() safety - Remove unreachable exception handler in training_operations.py - Simplify WebSocket token refresh logic to only reconnect on actual user session changes ## Medium Priority Fixes: - Fix auto-start training effect with useRef to prevent duplicate starts - Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket - Extract all magic numbers to centralized constants files: - Backend: services/training/app/core/training_constants.py - Frontend: frontend/src/constants/training.ts - Standardize error logging with exc_info=True on critical errors ## Code Quality Improvements: - All progress percentages now use named constants - All timeouts and intervals now use named constants - Improved code maintainability and readability - Better separation of concerns ## Files Changed: - Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py - Backend: training_operations.py, training_log_repository.py, training_constants.py (new) - Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new) All training progress events now properly flow from 0% to 100% with no gaps.
This commit is contained in:
@@ -135,6 +135,61 @@ async def publish_data_analysis(
|
||||
return success
|
||||
|
||||
|
||||
async def publish_training_progress(
|
||||
job_id: str,
|
||||
tenant_id: str,
|
||||
progress: int,
|
||||
current_step: str,
|
||||
step_details: Optional[str] = None,
|
||||
estimated_time_remaining_seconds: Optional[int] = None,
|
||||
estimated_completion_time: Optional[str] = None
|
||||
) -> bool:
|
||||
"""
|
||||
Generic Training Progress Event (for any progress percentage)
|
||||
|
||||
Args:
|
||||
job_id: Training job identifier
|
||||
tenant_id: Tenant identifier
|
||||
progress: Progress percentage (0-100)
|
||||
current_step: Current step name
|
||||
step_details: Details about the current step
|
||||
estimated_time_remaining_seconds: Estimated time remaining in seconds
|
||||
estimated_completion_time: ISO timestamp of estimated completion
|
||||
"""
|
||||
event_data = {
|
||||
"service_name": "training-service",
|
||||
"event_type": "training.progress",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"data": {
|
||||
"job_id": job_id,
|
||||
"tenant_id": tenant_id,
|
||||
"progress": progress,
|
||||
"current_step": current_step,
|
||||
"step_details": step_details or current_step,
|
||||
"estimated_time_remaining_seconds": estimated_time_remaining_seconds,
|
||||
"estimated_completion_time": estimated_completion_time
|
||||
}
|
||||
}
|
||||
|
||||
success = await training_publisher.publish_event(
|
||||
exchange_name="training.events",
|
||||
routing_key="training.progress",
|
||||
event_data=event_data
|
||||
)
|
||||
|
||||
if success:
|
||||
logger.info("Published training progress event",
|
||||
job_id=job_id,
|
||||
progress=progress,
|
||||
current_step=current_step)
|
||||
else:
|
||||
logger.error("Failed to publish training progress event",
|
||||
job_id=job_id,
|
||||
progress=progress)
|
||||
|
||||
return success
|
||||
|
||||
|
||||
async def publish_product_training_completed(
|
||||
job_id: str,
|
||||
tenant_id: str,
|
||||
|
||||
Reference in New Issue
Block a user