Fix multiple critical bugs in onboarding training step
This commit addresses all identified bugs and issues in the training code path:

## Critical Fixes:
- Add get_start_time() method to TrainingLogRepository and fix the call to a non-existent method
- Remove duplicate training.started event from API endpoint (trainer publishes the accurate one)
- Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone"

## High Priority Fixes:
- Fix division by zero risk in time estimation with double-check and max() safety
- Remove unreachable exception handler in training_operations.py
- Simplify WebSocket token refresh logic to only reconnect on actual user session changes

## Medium Priority Fixes:
- Fix auto-start training effect with useRef to prevent duplicate starts
- Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket
- Extract all magic numbers to centralized constants files:
  - Backend: services/training/app/core/training_constants.py
  - Frontend: frontend/src/constants/training.ts
- Standardize error logging with exc_info=True on critical errors

## Code Quality Improvements:
- All progress percentages now use named constants
- All timeouts and intervals now use named constants
- Improved code maintainability and readability
- Better separation of concerns

## Files Changed:
- Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py
- Backend: training_operations.py, training_log_repository.py, training_constants.py (new)
- Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new)

All training progress events now properly flow from 0% to 100% with no gaps.
This commit is contained in:
@@ -5,6 +5,11 @@ import { Button } from '../../../ui/Button';
|
||||
import { useCurrentTenant } from '../../../../stores/tenant.store';
|
||||
import { useCreateTrainingJob, useTrainingWebSocket, useTrainingJobStatus } from '../../../../api/hooks/training';
|
||||
import { Info } from 'lucide-react';
|
||||
import {
|
||||
TRAINING_SKIP_OPTION_DELAY_MS,
|
||||
TRAINING_COMPLETION_DELAY_MS,
|
||||
SKIP_TIMER_CHECK_INTERVAL_MS
|
||||
} from '../../../../constants/training';
|
||||
|
||||
interface MLTrainingStepProps {
|
||||
onNext: () => void;
|
||||
@@ -38,16 +43,16 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
|
||||
const currentTenant = useCurrentTenant();
|
||||
const createTrainingJob = useCreateTrainingJob();
|
||||
|
||||
// Check if training has been running for more than 2 minutes
|
||||
// Check if training has been running for more than the skip delay threshold
|
||||
useEffect(() => {
|
||||
if (trainingStartTime && isTraining && !showSkipOption) {
|
||||
const checkTimer = setInterval(() => {
|
||||
const elapsedTime = (Date.now() - trainingStartTime) / 1000; // in seconds
|
||||
if (elapsedTime > 120) { // 2 minutes
|
||||
if (elapsedTime > TRAINING_SKIP_OPTION_DELAY_MS / 1000) {
|
||||
setShowSkipOption(true);
|
||||
clearInterval(checkTimer);
|
||||
}
|
||||
}, 5000); // Check every 5 seconds
|
||||
}, SKIP_TIMER_CHECK_INTERVAL_MS);
|
||||
|
||||
return () => clearInterval(checkTimer);
|
||||
}
|
||||
@@ -72,14 +77,14 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
|
||||
message: 'Entrenamiento completado exitosamente'
|
||||
});
|
||||
setIsTraining(false);
|
||||
|
||||
|
||||
setTimeout(() => {
|
||||
onComplete({
|
||||
jobId: jobId,
|
||||
success: true,
|
||||
message: 'Modelo entrenado correctamente'
|
||||
});
|
||||
}, 2000);
|
||||
}, TRAINING_COMPLETION_DELAY_MS);
|
||||
}, [onComplete, jobId]);
|
||||
|
||||
const handleError = useCallback((data: any) => {
|
||||
@@ -147,7 +152,7 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
|
||||
message: 'Modelo entrenado correctamente',
|
||||
detectedViaPolling: true
|
||||
});
|
||||
}, 2000);
|
||||
}, TRAINING_COMPLETION_DELAY_MS);
|
||||
} else if (jobStatus.status === 'failed') {
|
||||
console.log(`❌ Training failure detected (source: ${isConnected ? 'WebSocket' : 'HTTP polling'})`);
|
||||
setError('Error detectado durante el entrenamiento (verificación de estado)');
|
||||
@@ -169,13 +174,15 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
|
||||
}
|
||||
}, [jobStatus, jobId, trainingProgress?.stage, onComplete, isConnected]);
|
||||
|
||||
// Auto-trigger training when component mounts
|
||||
// Auto-trigger training when component mounts (run once)
|
||||
const hasAutoStarted = React.useRef(false);
|
||||
useEffect(() => {
|
||||
if (currentTenant?.id && !isTraining && !trainingProgress && !error) {
|
||||
if (currentTenant?.id && !hasAutoStarted.current && !isTraining && !trainingProgress && !error) {
|
||||
console.log('🚀 Auto-starting ML training for tenant:', currentTenant.id);
|
||||
hasAutoStarted.current = true;
|
||||
handleStartTraining();
|
||||
}
|
||||
}, [currentTenant?.id]); // Only run when tenant is available
|
||||
}, [currentTenant?.id, isTraining, trainingProgress, error]); // Include all checked dependencies
|
||||
|
||||
const handleStartTraining = async () => {
|
||||
if (!currentTenant?.id) {
|
||||
|
||||
Reference in New Issue
Block a user