Fix multiple critical bugs in onboarding training step

This commit addresses all identified bugs and issues in the training code path:

## Critical Fixes:
- Add get_start_time() method to TrainingLogRepository and fix non-existent method call
- Remove duplicate training.started event from API endpoint (trainer publishes the accurate one)
- Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone"

## High Priority Fixes:
- Fix division by zero risk in time estimation with double-check and max() safety
- Remove unreachable exception handler in training_operations.py
- Simplify WebSocket token refresh logic to only reconnect on actual user session changes

## Medium Priority Fixes:
- Fix auto-start training effect with useRef to prevent duplicate starts
- Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket
- Extract all magic numbers to centralized constants files:
  - Backend: services/training/app/core/training_constants.py
  - Frontend: frontend/src/constants/training.ts
- Standardize error logging with exc_info=True on critical errors

## Code Quality Improvements:
- All progress percentages now use named constants
- All timeouts and intervals now use named constants
- Improved code maintainability and readability
- Better separation of concerns

## Files Changed:
- Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py
- Backend: training_operations.py, training_log_repository.py, training_constants.py (new)
- Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new)

All training progress events now properly flow from 0% to 100% with no gaps.
2025-11-05 13:02:39 +00:00
parent e3ea92640b
commit 5a84be83d6
10 changed files with 291 additions and 106 deletions
--- a/frontend/src/components/domain/onboarding/steps/MLTrainingStep.tsx
+++ b/frontend/src/components/domain/onboarding/steps/MLTrainingStep.tsx
@@ -5,6 +5,11 @@ import { Button } from '../../../ui/Button';
 import { useCurrentTenant } from '../../../../stores/tenant.store';
 import { useCreateTrainingJob, useTrainingWebSocket, useTrainingJobStatus } from '../../../../api/hooks/training';
 import { Info } from 'lucide-react';
+import {
+  TRAINING_SKIP_OPTION_DELAY_MS,
+  TRAINING_COMPLETION_DELAY_MS,
+  SKIP_TIMER_CHECK_INTERVAL_MS
+} from '../../../../constants/training';

 interface MLTrainingStepProps {
  onNext: () => void;
@@ -38,16 +43,16 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
  const currentTenant = useCurrentTenant();
  const createTrainingJob = useCreateTrainingJob();

-  // Check if training has been running for more than 2 minutes
+  // Check if training has been running for more than the skip delay threshold
  useEffect(() => {
    if (trainingStartTime && isTraining && !showSkipOption) {
      const checkTimer = setInterval(() => {
        const elapsedTime = (Date.now() - trainingStartTime) / 1000; // in seconds
-        if (elapsedTime > 120) { // 2 minutes
+        if (elapsedTime > TRAINING_SKIP_OPTION_DELAY_MS / 1000) {
          setShowSkipOption(true);
          clearInterval(checkTimer);
        }
-      }, 5000); // Check every 5 seconds
+      }, SKIP_TIMER_CHECK_INTERVAL_MS);

      return () => clearInterval(checkTimer);
    }
@@ -72,14 +77,14 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
      message: 'Entrenamiento completado exitosamente'
    });
    setIsTraining(false);
-    
+
    setTimeout(() => {
      onComplete({
        jobId: jobId,
        success: true,
        message: 'Modelo entrenado correctamente'
      });
-    }, 2000);
+    }, TRAINING_COMPLETION_DELAY_MS);
  }, [onComplete, jobId]);

  const handleError = useCallback((data: any) => {
@@ -147,7 +152,7 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
          message: 'Modelo entrenado correctamente',
          detectedViaPolling: true
        });
-      }, 2000);
+      }, TRAINING_COMPLETION_DELAY_MS);
    } else if (jobStatus.status === 'failed') {
      console.log(`❌ Training failure detected (source: ${isConnected ? 'WebSocket' : 'HTTP polling'})`);
      setError('Error detectado durante el entrenamiento (verificación de estado)');
@@ -169,13 +174,15 @@ export const MLTrainingStep: React.FC<MLTrainingStepProps> = ({
    }
  }, [jobStatus, jobId, trainingProgress?.stage, onComplete, isConnected]);

-  // Auto-trigger training when component mounts
+  // Auto-trigger training when component mounts (run once)
+  const hasAutoStarted = React.useRef(false);
  useEffect(() => {
-    if (currentTenant?.id && !isTraining && !trainingProgress && !error) {
+    if (currentTenant?.id && !hasAutoStarted.current && !isTraining && !trainingProgress && !error) {
      console.log('🚀 Auto-starting ML training for tenant:', currentTenant.id);
+      hasAutoStarted.current = true;
      handleStartTraining();
    }
-  }, [currentTenant?.id]); // Only run when tenant is available
+  }, [currentTenant?.id, isTraining, trainingProgress, error]); // Include all checked dependencies

  const handleStartTraining = async () => {
    if (!currentTenant?.id) {