This commit addresses all identified bugs and issues in the training code path:

## Critical Fixes:
- Add get_start_time() method to TrainingLogRepository and fix non-existent method call
- Remove duplicate training.started event from API endpoint (trainer publishes the accurate one)
- Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone"

## High Priority Fixes:
- Fix division by zero risk in time estimation with double-check and max() safety
- Remove unreachable exception handler in training_operations.py
- Simplify WebSocket token refresh logic to only reconnect on actual user session changes

## Medium Priority Fixes:
- Fix auto-start training effect with useRef to prevent duplicate starts
- Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket
- Extract all magic numbers to centralized constants files:
  - Backend: services/training/app/core/training_constants.py
  - Frontend: frontend/src/constants/training.ts
- Standardize error logging with exc_info=True on critical errors

## Code Quality Improvements:
- All progress percentages now use named constants
- All timeouts and intervals now use named constants
- Improved code maintainability and readability
- Better separation of concerns

## Files Changed:
- Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py
- Backend: training_operations.py, training_log_repository.py, training_constants.py (new)
- Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new)

All training progress events now properly flow from 0% to 100% with no gaps.
331 lines
10 KiB
Python
331 lines
10 KiB
Python
"""
|
|
Training Progress Events Publisher
|
|
Simple, clean event publisher for the 4 main training steps
|
|
"""
|
|
|
|
import structlog
|
|
from datetime import datetime
|
|
from typing import Dict, Any, Optional
|
|
from shared.messaging.rabbitmq import RabbitMQClient
|
|
from app.core.config import settings
|
|
|
|
# Module-wide structured logger used by the publisher functions in this file.
logger = structlog.get_logger()

# Single global publisher instance: created once when the module is imported
# and shared by every coroutine in this file.
training_publisher = RabbitMQClient(
    settings.RABBITMQ_URL,
    "training-service",
)
|
|
|
|
|
|
async def setup_messaging():
    """Initialize messaging.

    Awaits ``connect()`` on the module-level ``training_publisher`` and logs
    the outcome: an info line when the result is truthy, a warning otherwise.

    Returns:
        The value returned by ``training_publisher.connect()``, unchanged.
    """
    connected = await training_publisher.connect()

    # Handle the failure path first so the happy path reads straight through.
    if not connected:
        logger.warning("Training messaging failed to initialize")
        return connected

    logger.info("Training messaging initialized")
    return connected
|
|
|
|
|
|
async def cleanup_messaging():
    """Cleanup messaging.

    Awaits ``disconnect()`` on the module-level ``training_publisher`` and
    then logs that cleanup finished. Nothing is returned.
    """
    await training_publisher.disconnect()
    # Logged only after disconnect() has returned without raising.
    logger.info("Training messaging cleaned up")
|
|
|
|
|
|
# ==========================================
|
|
# 4 MAIN TRAINING PROGRESS EVENTS
|
|
# ==========================================
|
|
|
|
async def publish_training_started(
    job_id: str,
    tenant_id: str,
    total_products: int,
    estimated_duration_minutes: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 1: Training Started (0% progress)

    Builds a ``training.started`` event and publishes it on the
    ``training.events`` exchange with routing key ``training.started``.
    The outcome is logged (info on success, error on failure).

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        total_products: Number of products to train
        estimated_duration_minutes: Estimated time to completion in minutes
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (truthy when the
        event was published).
    """
    # Compare against None instead of relying on truthiness: an estimate of
    # 0 minutes must produce 0 seconds remaining, not None, so the payload
    # stays consistent with "estimated_duration_minutes".
    if estimated_duration_minutes is not None:
        estimated_time_remaining_seconds = estimated_duration_minutes * 60
    else:
        estimated_time_remaining_seconds = None

    event_data = {
        "service_name": "training-service",
        "event_type": "training.started",
        "timestamp": datetime.now().isoformat(),
        "data": {
            "job_id": job_id,
            "tenant_id": tenant_id,
            "progress": 0,
            "current_step": "Training Started",
            "step_details": f"Starting training for {total_products} products",
            "total_products": total_products,
            "estimated_duration_minutes": estimated_duration_minutes,
            "estimated_completion_time": estimated_completion_time,
            "estimated_time_remaining_seconds": estimated_time_remaining_seconds
        }
    }

    success = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.started",
        event_data=event_data
    )

    if success:
        logger.info("Published training started event",
                   job_id=job_id,
                   tenant_id=tenant_id,
                   total_products=total_products,
                   estimated_duration_minutes=estimated_duration_minutes)
    else:
        logger.error("Failed to publish training started event", job_id=job_id)

    return success
|
|
|
|
|
|
async def publish_data_analysis(
    job_id: str,
    tenant_id: str,
    analysis_details: Optional[str] = None,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 2: Data Analysis (20% progress)

    Publishes a ``training.progress`` event with a fixed progress of 20 on
    the ``training.events`` exchange and logs whether publishing succeeded.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        analysis_details: Details about the analysis; a default description
            is used when this is empty or None
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event``.
    """
    emitted_at = datetime.now().isoformat()

    # Fall back to the generic description for any falsy value.
    if analysis_details:
        details = analysis_details
    else:
        details = "Analyzing sales, weather, and traffic data"

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": 20,
        "current_step": "Data Analysis",
        "step_details": details,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.progress",
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.progress",
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish data analysis event", job_id=job_id)
        return published

    logger.info("Published data analysis event", job_id=job_id, progress=20)
    return published
|
|
|
|
|
|
async def publish_training_progress(
    job_id: str,
    tenant_id: str,
    progress: int,
    current_step: str,
    step_details: Optional[str] = None,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Generic Training Progress Event (for any progress percentage)

    Publishes a ``training.progress`` event carrying the caller-supplied
    progress value and step name on the ``training.events`` exchange. The
    progress value is forwarded as given; it is not validated here.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        progress: Progress percentage (0-100)
        current_step: Current step name
        step_details: Details about the current step; ``current_step`` is
            used when this is empty or None
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event``.
    """
    emitted_at = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": progress,
        "current_step": current_step,
        # Reuse the step name when no extra detail was supplied.
        "step_details": step_details if step_details else current_step,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.progress",
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.progress",
        event_data=envelope,
    )

    if not published:
        logger.error(
            "Failed to publish training progress event",
            job_id=job_id,
            progress=progress,
        )
        return published

    logger.info(
        "Published training progress event",
        job_id=job_id,
        progress=progress,
        current_step=current_step,
    )
    return published
|
|
|
|
|
|
async def publish_product_training_completed(
    job_id: str,
    tenant_id: str,
    product_name: str,
    products_completed: int,
    total_products: int,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 3: Product Training Completed (contributes to 20-80% progress)

    Published once per finished product. The payload carries the raw
    counters only, with no ``progress`` field; the frontend/consumer is
    expected to derive the percentage as:
        progress = 20 + (products_completed / total_products) * 60

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        product_name: Name of the product that was trained
        products_completed: Number of products completed so far
        total_products: Total number of products
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event``.
    """
    emitted_at = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "product_name": product_name,
        "products_completed": products_completed,
        "total_products": total_products,
        "current_step": "Model Training",
        "step_details": f"Completed training for {product_name} ({products_completed}/{total_products})",
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.product.completed",
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.product.completed",
        event_data=envelope,
    )

    if not published:
        logger.error(
            "Failed to publish product training completed event",
            job_id=job_id,
        )
        return published

    logger.info(
        "Published product training completed event",
        job_id=job_id,
        product_name=product_name,
        products_completed=products_completed,
        total_products=total_products,
    )
    return published
|
|
|
|
|
|
async def publish_training_completed(
    job_id: str,
    tenant_id: str,
    successful_trainings: int,
    failed_trainings: int,
    total_duration_seconds: float
) -> bool:
    """
    Event 4: Training Completed (100% progress)

    Publishes a ``training.completed`` event with progress fixed at 100 and
    a summary of successful/failed trainings on the ``training.events``
    exchange, then logs whether publishing succeeded.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        successful_trainings: Count reported as successful in the summary
        failed_trainings: Count reported as failed in the summary
        total_duration_seconds: Duration value forwarded in the payload

    Returns:
        The result of ``training_publisher.publish_event``.
    """
    emitted_at = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": 100,
        "current_step": "Training Completed",
        "step_details": f"Training completed: {successful_trainings} successful, {failed_trainings} failed",
        "successful_trainings": successful_trainings,
        "failed_trainings": failed_trainings,
        "total_duration_seconds": total_duration_seconds,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.completed",
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.completed",
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish training completed event", job_id=job_id)
        return published

    logger.info(
        "Published training completed event",
        job_id=job_id,
        successful_trainings=successful_trainings,
        failed_trainings=failed_trainings,
    )
    return published
|
|
|
|
|
|
async def publish_training_failed(
    job_id: str,
    tenant_id: str,
    error_message: str
) -> bool:
    """
    Event: Training Failed

    Publishes a ``training.failed`` event containing the error message on
    the ``training.events`` exchange. The payload has no ``progress`` field.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        error_message: Error text forwarded in the payload and the log line

    Returns:
        The result of ``training_publisher.publish_event``.
    """
    emitted_at = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "current_step": "Training Failed",
        "error_message": error_message,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.failed",
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.failed",
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish training failed event", job_id=job_id)
        return published

    logger.info(
        "Published training failed event",
        job_id=job_id,
        error=error_message,
    )
    return published
|