2025-10-09 14:11:02 +02:00
|
|
|
"""
|
|
|
|
|
Training Progress Events Publisher
|
|
|
|
|
Simple, clean event publisher for the four main training steps, plus a generic progress event and a failure event
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import structlog
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from typing import Dict, Any, Optional
|
2025-12-05 20:07:01 +01:00
|
|
|
from shared.messaging import RabbitMQClient
|
2025-10-09 14:11:02 +02:00
|
|
|
from app.core.config import settings
|
|
|
|
|
|
|
|
|
|
# Structured logger used by every publisher helper in this module.
logger = structlog.get_logger()

# One publisher instance for the whole module: created at import time from the
# configured broker URL and shared by all publish_* coroutines.
training_publisher = RabbitMQClient(settings.RABBITMQ_URL, "training-service")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def setup_messaging():
    """Connect the shared training publisher and log the outcome.

    Returns:
        The value returned by ``training_publisher.connect()``; it is treated
        here as a truthy/falsy success flag.
    """
    connected = await training_publisher.connect()

    # Handle the failure path first so the happy path reads straight through.
    if not connected:
        logger.warning("Training messaging failed to initialize")
        return connected

    logger.info("Training messaging initialized")
    return connected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def cleanup_messaging():
    """Disconnect the shared training publisher, then log that cleanup ran."""
    await training_publisher.disconnect()
    logger.info("Training messaging cleaned up")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ==========================================
|
|
|
|
|
# 4 MAIN TRAINING PROGRESS EVENTS
|
|
|
|
|
# ==========================================
|
|
|
|
|
|
|
|
|
|
async def publish_training_started(
    job_id: str,
    tenant_id: str,
    total_products: int,
    estimated_duration_minutes: Optional[int] = None,
    estimated_completion_time: Optional[str] = None,
) -> bool:
    """
    Event 1: Training Started (0% progress)

    Publishes a ``training.started`` event on the ``training.events``
    exchange and logs whether the publish succeeded.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        total_products: Number of products to train
        estimated_duration_minutes: Estimated time to completion in minutes
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (used as a
        success flag).
    """
    # Compare against None explicitly. A truthiness check would turn a valid
    # estimate of 0 minutes into None ("unknown") instead of 0 seconds.
    if estimated_duration_minutes is None:
        estimated_time_remaining_seconds = None
    else:
        estimated_time_remaining_seconds = estimated_duration_minutes * 60

    event_data = {
        "service_name": "training-service",
        "event_type": "training.started",
        # Naive local time (no tz offset) in ISO-8601 form.
        # NOTE(review): confirm consumers do not expect UTC here.
        "timestamp": datetime.now().isoformat(),
        "data": {
            "job_id": job_id,
            "tenant_id": tenant_id,
            "progress": 0,
            "current_step": "Training Started",
            "step_details": f"Starting training for {total_products} products",
            "total_products": total_products,
            "estimated_duration_minutes": estimated_duration_minutes,
            "estimated_completion_time": estimated_completion_time,
            "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        },
    }

    success = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.started",
        event_data=event_data,
    )

    if success:
        logger.info(
            "Published training started event",
            job_id=job_id,
            tenant_id=tenant_id,
            total_products=total_products,
            estimated_duration_minutes=estimated_duration_minutes,
        )
    else:
        logger.error("Failed to publish training started event", job_id=job_id)

    return success
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def publish_data_analysis(
    job_id: str,
    tenant_id: str,
    analysis_details: Optional[str] = None,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 2: Data Analysis (20% progress)

    Sends a ``training.progress`` event with a fixed progress of 20 on the
    ``training.events`` exchange.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        analysis_details: Details about the analysis; a generic description
            is used when this is empty or None
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (used as a
        success flag).
    """
    # Naive local time (no tz offset) in ISO-8601 form.
    # NOTE(review): confirm consumers do not expect UTC here.
    timestamp = datetime.now().isoformat()

    # Any falsy value (None or "") falls back to the stock description.
    if analysis_details:
        step_details = analysis_details
    else:
        step_details = "Analyzing sales, weather, and traffic data"

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": 20,
        "current_step": "Data Analysis",
        "step_details": step_details,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.progress",
        "timestamp": timestamp,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.progress",
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish data analysis event", job_id=job_id)
        return published

    logger.info("Published data analysis event", job_id=job_id, progress=20)
    return published
|
|
|
|
|
|
|
|
|
|
|
Fix multiple critical bugs in onboarding training step
This commit addresses all identified bugs and issues in the training code path:
## Critical Fixes:
- Add get_start_time() method to TrainingLogRepository and fix non-existent method call
- Remove duplicate training.started event from API endpoint (trainer publishes the accurate one)
- Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone"
## High Priority Fixes:
- Fix division by zero risk in time estimation with double-check and max() safety
- Remove unreachable exception handler in training_operations.py
- Simplify WebSocket token refresh logic to only reconnect on actual user session changes
## Medium Priority Fixes:
- Fix auto-start training effect with useRef to prevent duplicate starts
- Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket
- Extract all magic numbers to centralized constants files:
- Backend: services/training/app/core/training_constants.py
- Frontend: frontend/src/constants/training.ts
- Standardize error logging with exc_info=True on critical errors
## Code Quality Improvements:
- All progress percentages now use named constants
- All timeouts and intervals now use named constants
- Improved code maintainability and readability
- Better separation of concerns
## Files Changed:
- Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py
- Backend: training_operations.py, training_log_repository.py, training_constants.py (new)
- Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new)
All training progress events now properly flow from 0% to 100% with no gaps.
2025-11-05 13:02:39 +00:00
|
|
|
async def publish_training_progress(
    job_id: str,
    tenant_id: str,
    progress: int,
    current_step: str,
    step_details: Optional[str] = None,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Generic Training Progress Event (for any progress percentage)

    Sends a ``training.progress`` event carrying the caller-supplied
    percentage and step name. The percentage is forwarded as given; no range
    check is performed here.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        progress: Progress percentage (0-100)
        current_step: Current step name
        step_details: Details about the current step; falls back to
            ``current_step`` when empty or None
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (used as a
        success flag).
    """
    # Naive local time (no tz offset) in ISO-8601 form.
    # NOTE(review): confirm consumers do not expect UTC here.
    timestamp = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": progress,
        "current_step": current_step,
        # Reuse the step name when no separate detail text was supplied.
        "step_details": step_details if step_details else current_step,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.progress",
        "timestamp": timestamp,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.progress",
        event_data=envelope,
    )

    if not published:
        logger.error(
            "Failed to publish training progress event",
            job_id=job_id,
            progress=progress,
        )
        return published

    logger.info(
        "Published training progress event",
        job_id=job_id,
        progress=progress,
        current_step=current_step,
    )
    return published
|
|
|
|
|
|
|
|
|
|
|
2025-10-09 14:11:02 +02:00
|
|
|
async def publish_product_training_completed(
    job_id: str,
    tenant_id: str,
    product_name: str,
    products_completed: int,
    total_products: int,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 3: Product Training Completed (contributes to 20-80% progress)

    Published once per finished product. The event carries the raw counters
    rather than a percentage; the frontend/consumer is expected to derive
    progress as:
        progress = 20 + (products_completed / total_products) * 60

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        product_name: Name of the product that was trained
        products_completed: Number of products completed so far
        total_products: Total number of products
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (used as a
        success flag).
    """
    # Naive local time (no tz offset) in ISO-8601 form.
    # NOTE(review): confirm consumers do not expect UTC here.
    timestamp = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "product_name": product_name,
        "products_completed": products_completed,
        "total_products": total_products,
        "current_step": "Model Training",
        "step_details": f"Completed training for {product_name} ({products_completed}/{total_products})",
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.product.completed",
        "timestamp": timestamp,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.product.completed",
        event_data=envelope,
    )

    if not published:
        logger.error(
            "Failed to publish product training completed event",
            job_id=job_id,
        )
        return published

    logger.info(
        "Published product training completed event",
        job_id=job_id,
        product_name=product_name,
        products_completed=products_completed,
        total_products=total_products,
    )
    return published
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def publish_training_completed(
    job_id: str,
    tenant_id: str,
    successful_trainings: int,
    failed_trainings: int,
    total_duration_seconds: float
) -> bool:
    """
    Event 4: Training Completed (100% progress)

    Sends a ``training.completed`` event with the final success/failure
    counts and the total duration.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        successful_trainings: Count reported as successful in the event
        failed_trainings: Count reported as failed in the event
        total_duration_seconds: Duration value forwarded in the event

    Returns:
        The result of ``training_publisher.publish_event`` (used as a
        success flag).
    """
    # Naive local time (no tz offset) in ISO-8601 form.
    # NOTE(review): confirm consumers do not expect UTC here.
    timestamp = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": 100,
        "current_step": "Training Completed",
        "step_details": f"Training completed: {successful_trainings} successful, {failed_trainings} failed",
        "successful_trainings": successful_trainings,
        "failed_trainings": failed_trainings,
        "total_duration_seconds": total_duration_seconds,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.completed",
        "timestamp": timestamp,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.completed",
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish training completed event", job_id=job_id)
        return published

    logger.info(
        "Published training completed event",
        job_id=job_id,
        successful_trainings=successful_trainings,
        failed_trainings=failed_trainings,
    )
    return published
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def publish_training_failed(
    job_id: str,
    tenant_id: str,
    error_message: str
) -> bool:
    """
    Event: Training Failed

    Sends a ``training.failed`` event carrying the error message. Unlike the
    progress events, the payload has no ``progress`` field.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        error_message: Error text forwarded in the event

    Returns:
        The result of ``training_publisher.publish_event`` (used as a
        success flag).
    """
    # Naive local time (no tz offset) in ISO-8601 form.
    # NOTE(review): confirm consumers do not expect UTC here.
    timestamp = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "current_step": "Training Failed",
        "error_message": error_message,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": "training.failed",
        "timestamp": timestamp,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.failed",
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish training failed event", job_id=job_id)
        return published

    # Info level is intentional: it records that the failure event itself
    # was delivered to the broker.
    logger.info(
        "Published training failed event",
        job_id=job_id,
        error=error_message,
    )
    return published
|