Fix multiple critical bugs in onboarding training step

This commit addresses all identified bugs and issues in the training code path:

## Critical Fixes:
- Add get_start_time() method to TrainingLogRepository and fix the call to a non-existent method
- Remove duplicate training.started event from API endpoint (trainer publishes the accurate one)
- Add missing progress events for the 80-100% range (85%, 92%, 94%) to eliminate the progress "dead zone"

## High Priority Fixes:
- Fix division-by-zero risk in time estimation with a double-check and a max() safeguard
- Remove unreachable exception handler in training_operations.py
- Simplify WebSocket token refresh logic to only reconnect on actual user session changes

## Medium Priority Fixes:
- Fix auto-start training effect with useRef to prevent duplicate starts
- Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket
- Extract all magic numbers to centralized constants files:
  - Backend: services/training/app/core/training_constants.py
  - Frontend: frontend/src/constants/training.ts
- Standardize error logging with exc_info=True on critical errors

## Code Quality Improvements:
- All progress percentages now use named constants
- All timeouts and intervals now use named constants
- Improved code maintainability and readability
- Better separation of concerns

## Files Changed:
- Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py
- Backend: training_operations.py, training_log_repository.py, training_constants.py (new)
- Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new)

All training progress events now properly flow from 0% to 100% with no gaps.
This commit is contained in:
Claude
2025-11-05 13:02:39 +00:00
parent e3ea92640b
commit 5a84be83d6
10 changed files with 291 additions and 106 deletions

View File

@@ -5,6 +5,11 @@ import { Button } from '../../../ui/Button';
import { useCurrentTenant } from '../../../../stores/tenant.store';
import { useCreateTrainingJob, useTrainingWebSocket, useTrainingJobStatus } from '../../../../api/hooks/training';
import { Info } from 'lucide-react';
import {
TRAINING_SKIP_OPTION_DELAY_MS,
TRAINING_COMPLETION_DELAY_MS,
SKIP_TIMER_CHECK_INTERVAL_MS
} from '../../../../constants/training';
interface MLTrainingStepProps {
onNext: () => void;
@@ -38,16 +43,16 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
const currentTenant = useCurrentTenant();
const createTrainingJob = useCreateTrainingJob();
// Check if training has been running for more than 2 minutes
// Check if training has been running for more than the skip delay threshold
useEffect(() => {
if (trainingStartTime && isTraining && !showSkipOption) {
const checkTimer = setInterval(() => {
const elapsedTime = (Date.now() - trainingStartTime) / 1000; // in seconds
if (elapsedTime > 120) { // 2 minutes
if (elapsedTime > TRAINING_SKIP_OPTION_DELAY_MS / 1000) {
setShowSkipOption(true);
clearInterval(checkTimer);
}
}, 5000); // Check every 5 seconds
}, SKIP_TIMER_CHECK_INTERVAL_MS);
return () => clearInterval(checkTimer);
}
@@ -72,14 +77,14 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
message: 'Entrenamiento completado exitosamente'
});
setIsTraining(false);
setTimeout(() => {
onComplete({
jobId: jobId,
success: true,
message: 'Modelo entrenado correctamente'
});
}, 2000);
}, TRAINING_COMPLETION_DELAY_MS);
}, [onComplete, jobId]);
const handleError = useCallback((data: any) => {
@@ -147,7 +152,7 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
message: 'Modelo entrenado correctamente',
detectedViaPolling: true
});
}, 2000);
}, TRAINING_COMPLETION_DELAY_MS);
} else if (jobStatus.status === 'failed') {
console.log(`❌ Training failure detected (source: ${isConnected ? 'WebSocket' : 'HTTP polling'})`);
setError('Error detectado durante el entrenamiento (verificación de estado)');
@@ -169,13 +174,15 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
}
}, [jobStatus, jobId, trainingProgress?.stage, onComplete, isConnected]);
// Auto-trigger training when component mounts
// Auto-trigger training when component mounts (run once)
const hasAutoStarted = React.useRef(false);
useEffect(() => {
if (currentTenant?.id && !isTraining && !trainingProgress && !error) {
if (currentTenant?.id && !hasAutoStarted.current && !isTraining && !trainingProgress && !error) {
console.log('🚀 Auto-starting ML training for tenant:', currentTenant.id);
hasAutoStarted.current = true;
handleStartTraining();
}
}, [currentTenant?.id]); // Only run when tenant is available
}, [currentTenant?.id, isTraining, trainingProgress, error]); // Include all checked dependencies
const handleStartTraining = async () => {
if (!currentTenant?.id) {