Fix multiple critical bugs in onboarding training step

This commit addresses all identified bugs and issues in the training code path:

## Critical Fixes:
- Add get_start_time() method to TrainingLogRepository and fix the call to a non-existent method
- Remove duplicate training.started event from API endpoint (trainer publishes the accurate one)
- Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone"

## High Priority Fixes:
- Fix division-by-zero risk in time estimation with a double-check and a max() safeguard
- Remove unreachable exception handler in training_operations.py
- Simplify WebSocket token refresh logic to reconnect only on actual user session changes

## Medium Priority Fixes:
- Fix auto-start training effect with useRef to prevent duplicate starts
- Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket
- Extract all magic numbers to centralized constants files:
  - Backend: services/training/app/core/training_constants.py
  - Frontend: frontend/src/constants/training.ts
- Standardize error logging with exc_info=True on critical errors

## Code Quality Improvements:
- All progress percentages now use named constants
- All timeouts and intervals now use named constants
- Improved code maintainability and readability
- Better separation of concerns

## Files Changed:
- Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py
- Backend: training_operations.py, training_log_repository.py, training_constants.py (new)
- Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new)

All training progress events now properly flow from 0% to 100% with no gaps.
This commit is contained in:
Claude
2025-11-05 13:02:39 +00:00
parent e3ea92640b
commit 5a84be83d6
10 changed files with 291 additions and 106 deletions

View File

@@ -8,6 +8,17 @@ import { useMutation, useQuery, useQueryClient, UseQueryOptions, UseMutationOpti
import { trainingService } from '../services/training';
import { ApiError, apiClient } from '../client/apiClient';
import { useAuthStore } from '../../stores/auth.store';
import {
HTTP_POLLING_INTERVAL_MS,
HTTP_POLLING_DEBOUNCE_MS,
WEBSOCKET_HEARTBEAT_INTERVAL_MS,
WEBSOCKET_MAX_RECONNECT_ATTEMPTS,
WEBSOCKET_RECONNECT_INITIAL_DELAY_MS,
WEBSOCKET_RECONNECT_MAX_DELAY_MS,
PROGRESS_DATA_ANALYSIS,
PROGRESS_TRAINING_RANGE_START,
PROGRESS_TRAINING_RANGE_END
} from '../../constants/training';
import type {
TrainingJobRequest,
TrainingJobResponse,
@@ -53,14 +64,32 @@ export const useTrainingJobStatus = (
}
) => {
const { isWebSocketConnected, ...queryOptions } = options || {};
const [enablePolling, setEnablePolling] = React.useState(false);
// Completely disable the query when WebSocket is connected
const isEnabled = !!tenantId && !!jobId && !isWebSocketConnected;
// Debounce HTTP polling activation: wait after WebSocket disconnects
// This prevents race conditions where both WebSocket and HTTP are briefly active
React.useEffect(() => {
if (!isWebSocketConnected) {
const debounceTimer = setTimeout(() => {
setEnablePolling(true);
console.log(`🔄 HTTP polling enabled after ${HTTP_POLLING_DEBOUNCE_MS}ms debounce (WebSocket disconnected)`);
}, HTTP_POLLING_DEBOUNCE_MS);
return () => clearTimeout(debounceTimer);
} else {
setEnablePolling(false);
console.log('❌ HTTP polling disabled (WebSocket connected)');
}
}, [isWebSocketConnected]);
// Completely disable the query when WebSocket is connected or during debounce period
const isEnabled = !!tenantId && !!jobId && !isWebSocketConnected && enablePolling;
console.log('🔄 Training status query:', {
tenantId: !!tenantId,
jobId: !!jobId,
isWebSocketConnected,
enablePolling,
queryEnabled: isEnabled
});
@@ -85,8 +114,8 @@ export const useTrainingJobStatus = (
return false; // Stop polling when training is done
}
console.log('📊 HTTP fallback polling active (WebSocket disconnected) - 5s interval');
return 5000; // Poll every 5 seconds while training (fallback when WebSocket unavailable)
console.log(`📊 HTTP fallback polling active (WebSocket disconnected) - ${HTTP_POLLING_INTERVAL_MS}ms interval`);
return HTTP_POLLING_INTERVAL_MS; // Poll while training (fallback when WebSocket unavailable)
} : false, // Completely disable interval when WebSocket connected
staleTime: 1000, // Consider data stale after 1 second
retry: (failureCount, error) => {
@@ -298,7 +327,7 @@ export const useTrainingWebSocket = (
let reconnectTimer: NodeJS.Timeout | null = null;
let isManuallyDisconnected = false;
let reconnectAttempts = 0;
const maxReconnectAttempts = 3;
const maxReconnectAttempts = WEBSOCKET_MAX_RECONNECT_ATTEMPTS;
const connect = async () => {
try {
@@ -349,70 +378,49 @@ export const useTrainingWebSocket = (
console.warn('Failed to request status on connection:', e);
}
// Helper function to check if tokens represent different auth sessions
const isTokenSessionDifferent = (oldToken: string, newToken: string): boolean => {
// Helper function to check if tokens represent different auth users/sessions
const isNewAuthSession = (oldToken: string, newToken: string): boolean => {
if (!oldToken || !newToken) return !!oldToken !== !!newToken;
try {
const oldPayload = JSON.parse(atob(oldToken.split('.')[1]));
const newPayload = JSON.parse(atob(newToken.split('.')[1]));
// Compare by issued timestamp (iat) - different iat means new auth session
return oldPayload.iat !== newPayload.iat;
// Compare by user ID - different user means new auth session
// If user_id is same, it's just a token refresh, no need to reconnect
return oldPayload.user_id !== newPayload.user_id ||
oldPayload.sub !== newPayload.sub;
} catch (e) {
console.warn('Failed to parse token for session comparison, falling back to string comparison:', e);
return oldToken !== newToken;
console.warn('Failed to parse token for session comparison:', e);
// On parse error, don't reconnect (assume same session)
return false;
}
};
// Set up periodic ping and intelligent token refresh detection
// Set up periodic ping and check for auth session changes
const heartbeatInterval = setInterval(async () => {
if (ws?.readyState === WebSocket.OPEN && !isManuallyDisconnected) {
try {
// Check token validity (this may refresh if needed)
const currentToken = await apiClient.ensureValidToken();
// Enhanced token change detection with detailed logging
const tokenStringChanged = currentToken !== effectiveToken;
const tokenSessionChanged = currentToken && effectiveToken ?
isTokenSessionDifferent(effectiveToken, currentToken) : tokenStringChanged;
console.log('🔍 WebSocket token validation check:', {
hasCurrentToken: !!currentToken,
hasEffectiveToken: !!effectiveToken,
tokenStringChanged,
tokenSessionChanged,
currentTokenPreview: currentToken ? `${currentToken.slice(0, 20)}...${currentToken.slice(-10)}` : 'null',
effectiveTokenPreview: effectiveToken ? `${effectiveToken.slice(0, 20)}...${effectiveToken.slice(-10)}` : 'null'
});
// Only reconnect if we have a genuine session change (different iat)
if (tokenSessionChanged) {
console.log('🔄 Token session changed - reconnecting WebSocket with new session token');
console.log('📊 Session change details:', {
reason: !currentToken ? 'token removed' :
!effectiveToken ? 'token added' : 'new auth session',
oldTokenIat: effectiveToken ? (() => {
try { return JSON.parse(atob(effectiveToken.split('.')[1])).iat; } catch { return 'parse-error'; }
})() : 'N/A',
newTokenIat: currentToken ? (() => {
try { return JSON.parse(atob(currentToken.split('.')[1])).iat; } catch { return 'parse-error'; }
})() : 'N/A'
});
// Close current connection and trigger reconnection with new token
ws?.close(1000, 'Token session changed - reconnecting');
// Only reconnect if user changed (new auth session)
if (currentToken && effectiveToken && isNewAuthSession(effectiveToken, currentToken)) {
console.log('🔄 Auth session changed (different user) - reconnecting WebSocket');
ws?.close(1000, 'Auth session changed - reconnecting');
clearInterval(heartbeatInterval);
return;
} else if (tokenStringChanged) {
console.log(' Token string changed but same session - continuing with current connection');
// Update effective token reference for future comparisons
}
// Token may have been refreshed but it's the same user - continue
if (currentToken && currentToken !== effectiveToken) {
console.log(' Token refreshed (same user) - updating reference');
effectiveToken = currentToken;
}
console.log('✅ Token validated during heartbeat - same session');
// Send ping
ws?.send('ping');
console.log('💓 Sent ping to server (token session validated)');
console.log('💓 Sent ping to server');
} catch (e) {
console.warn('Failed to send ping or validate token:', e);
clearInterval(heartbeatInterval);
@@ -420,7 +428,7 @@ export const useTrainingWebSocket = (
} else {
clearInterval(heartbeatInterval);
}
}, 30000); // Check every 30 seconds for token refresh and send ping
}, WEBSOCKET_HEARTBEAT_INTERVAL_MS); // Check for auth changes and send ping
// Store interval for cleanup
(ws as any).heartbeatInterval = heartbeatInterval;
@@ -449,7 +457,8 @@ export const useTrainingWebSocket = (
if (initialData.type === 'product_completed') {
const productsCompleted = initialEventData.products_completed || 0;
const totalProducts = initialEventData.total_products || 1;
initialProgress = 20 + Math.floor((productsCompleted / totalProducts) * 60);
const trainingRangeWidth = PROGRESS_TRAINING_RANGE_END - PROGRESS_DATA_ANALYSIS;
initialProgress = PROGRESS_DATA_ANALYSIS + Math.floor((productsCompleted / totalProducts) * trainingRangeWidth);
console.log('📦 Product training completed in initial state',
`${productsCompleted}/${totalProducts}`,
`progress: ${initialProgress}%`);
@@ -486,8 +495,9 @@ export const useTrainingWebSocket = (
const productsCompleted = eventData.products_completed || 0;
const totalProducts = eventData.total_products || 1;
// Calculate progress: 20% base + (completed/total * 60%)
progress = 20 + Math.floor((productsCompleted / totalProducts) * 60);
// Calculate progress: DATA_ANALYSIS% base + (completed/total * (TRAINING_RANGE_END - DATA_ANALYSIS)%)
const trainingRangeWidth = PROGRESS_TRAINING_RANGE_END - PROGRESS_DATA_ANALYSIS;
progress = PROGRESS_DATA_ANALYSIS + Math.floor((productsCompleted / totalProducts) * trainingRangeWidth);
console.log('📦 Product training completed',
`${productsCompleted}/${totalProducts}`,
@@ -585,8 +595,8 @@ export const useTrainingWebSocket = (
// Detailed logging for different close codes
switch (event.code) {
case 1000:
if (event.reason === 'Token refreshed - reconnecting') {
console.log('🔄 WebSocket closed for token refresh - will reconnect immediately');
if (event.reason === 'Auth session changed - reconnecting') {
console.log('🔄 WebSocket closed for auth session change - will reconnect immediately');
} else {
console.log('🔒 WebSocket closed normally');
}
@@ -604,18 +614,21 @@ export const useTrainingWebSocket = (
console.log(`❓ WebSocket closed with code ${event.code}`);
}
// Handle token refresh reconnection (immediate reconnect)
if (event.code === 1000 && event.reason === 'Token refreshed - reconnecting') {
console.log('🔄 Reconnecting immediately due to token refresh...');
// Handle auth session change reconnection (immediate reconnect)
if (event.code === 1000 && event.reason === 'Auth session changed - reconnecting') {
console.log('🔄 Reconnecting immediately due to auth session change...');
reconnectTimer = setTimeout(() => {
connect(); // Reconnect immediately with fresh token
}, 1000); // Short delay to allow cleanup
connect(); // Reconnect immediately with new session token
}, WEBSOCKET_RECONNECT_INITIAL_DELAY_MS); // Short delay to allow cleanup
return;
}
// Try to reconnect if not manually disconnected and haven't exceeded max attempts
if (!isManuallyDisconnected && event.code !== 1000 && reconnectAttempts < maxReconnectAttempts) {
const delay = Math.min(1000 * Math.pow(2, reconnectAttempts), 10000); // Exponential backoff, max 10s
const delay = Math.min(
WEBSOCKET_RECONNECT_INITIAL_DELAY_MS * Math.pow(2, reconnectAttempts),
WEBSOCKET_RECONNECT_MAX_DELAY_MS
); // Exponential backoff
console.log(`🔄 Attempting to reconnect WebSocket in ${delay/1000}s... (attempt ${reconnectAttempts + 1}/${maxReconnectAttempts})`);
reconnectTimer = setTimeout(() => {