bakery-ia/services/training/app/services/training_events.py

"""
Training Progress Events Publisher
Simple, clean event publisher for the 4 main training steps
"""

import structlog
from datetime import datetime
from typing import Dict, Any, Optional
from shared.messaging import RabbitMQClient
from app.core.config import settings

logger = structlog.get_logger()

# Single global publisher instance
training_publisher = RabbitMQClient(settings.RABBITMQ_URL, "training-service")


async def setup_messaging():
    """Connect the shared training publisher to the message broker.

    Returns:
        The value produced by ``training_publisher.connect()``; a truthy
        value is treated as a successful connection.
    """
    connected = await training_publisher.connect()
    if not connected:
        # Connection problems are reported but not raised.
        logger.warning("Training messaging failed to initialize")
        return connected

    logger.info("Training messaging initialized")
    return connected


async def cleanup_messaging() -> None:
    """Disconnect the shared training publisher and log the teardown.

    Awaits ``training_publisher.disconnect()`` and then emits an info log
    entry. Any exception raised by ``disconnect()`` propagates to the caller.
    """
    await training_publisher.disconnect()
    logger.info("Training messaging cleaned up")


# ==========================================
# 4 MAIN TRAINING PROGRESS EVENTS
# ==========================================

async def publish_training_started(
    job_id: str,
    tenant_id: str,
    total_products: int,
    estimated_duration_minutes: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 1: Training Started (0% progress)

    Publishes a ``training.started`` event on the ``training.events``
    exchange and logs whether publishing succeeded.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        total_products: Number of products to train
        estimated_duration_minutes: Estimated time to completion in minutes.
            ``None`` means no estimate is available; ``0`` is a valid estimate.
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (truthy on success).
    """
    # Distinguish "no estimate" (None) from a genuine zero-minute estimate.
    # A plain truthiness check would report 0 minutes as "unknown" (None)
    # instead of 0 seconds remaining.
    if estimated_duration_minutes is not None:
        estimated_time_remaining_seconds = estimated_duration_minutes * 60
    else:
        estimated_time_remaining_seconds = None

    event_data = {
        "service_name": "training-service",
        "event_type": "training.started",
        "timestamp": datetime.now().isoformat(),
        "data": {
            "job_id": job_id,
            "tenant_id": tenant_id,
            "progress": 0,
            "current_step": "Training Started",
            "step_details": f"Starting training for {total_products} products",
            "total_products": total_products,
            "estimated_duration_minutes": estimated_duration_minutes,
            "estimated_completion_time": estimated_completion_time,
            "estimated_time_remaining_seconds": estimated_time_remaining_seconds
        }
    }

    success = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key="training.started",
        event_data=event_data
    )

    if success:
        logger.info("Published training started event",
                   job_id=job_id,
                   tenant_id=tenant_id,
                   total_products=total_products,
                   estimated_duration_minutes=estimated_duration_minutes)
    else:
        logger.error("Failed to publish training started event", job_id=job_id)

    return success


async def publish_data_analysis(
    job_id: str,
    tenant_id: str,
    analysis_details: Optional[str] = None,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 2: Data Analysis (20% progress)

    Sends a ``training.progress`` event with a fixed progress of 20 on the
    ``training.events`` exchange and logs the outcome.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        analysis_details: Optional description of the analysis; a default
            description is used when this is empty or None
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (truthy on success).
    """
    event_name = "training.progress"
    emitted_at = datetime.now().isoformat()

    # Fall back to a generic description when none (or an empty one) is given.
    if analysis_details:
        details = analysis_details
    else:
        details = "Analyzing sales, weather, and traffic data"

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": 20,
        "current_step": "Data Analysis",
        "step_details": details,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": event_name,
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key=event_name,
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish data analysis event", job_id=job_id)
        return published

    logger.info("Published data analysis event", job_id=job_id, progress=20)
    return published


async def publish_training_progress(
    job_id: str,
    tenant_id: str,
    progress: int,
    current_step: str,
    step_details: Optional[str] = None,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Generic Training Progress Event (for any progress percentage)

    Sends a ``training.progress`` event on the ``training.events`` exchange
    carrying the caller-supplied progress and step name, then logs the outcome.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        progress: Progress percentage (0-100); forwarded as given
        current_step: Current step name
        step_details: Details about the current step; falls back to
            ``current_step`` when empty or None
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (truthy on success).
    """
    event_name = "training.progress"
    emitted_at = datetime.now().isoformat()

    # Reuse the step name as the detail text when no details were supplied.
    details = step_details if step_details else current_step

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": progress,
        "current_step": current_step,
        "step_details": details,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": event_name,
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key=event_name,
        event_data=envelope,
    )

    if not published:
        logger.error(
            "Failed to publish training progress event",
            job_id=job_id,
            progress=progress,
        )
        return published

    logger.info(
        "Published training progress event",
        job_id=job_id,
        progress=progress,
        current_step=current_step,
    )
    return published


async def publish_product_training_completed(
    job_id: str,
    tenant_id: str,
    product_name: str,
    products_completed: int,
    total_products: int,
    estimated_time_remaining_seconds: Optional[int] = None,
    estimated_completion_time: Optional[str] = None
) -> bool:
    """
    Event 3: Product Training Completed (contributes to 20-80% progress)

    Emitted once per finished product. No ``progress`` field is included in
    the payload; the frontend/consumer is expected to derive it as:
    progress = 20 + (products_completed / total_products) * 60

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        product_name: Name of the product that was trained
        products_completed: Number of products completed so far
        total_products: Total number of products
        estimated_time_remaining_seconds: Estimated time remaining in seconds
        estimated_completion_time: ISO timestamp of estimated completion

    Returns:
        The result of ``training_publisher.publish_event`` (truthy on success).
    """
    event_name = "training.product.completed"
    emitted_at = datetime.now().isoformat()
    summary = f"Completed training for {product_name} ({products_completed}/{total_products})"

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "product_name": product_name,
        "products_completed": products_completed,
        "total_products": total_products,
        "current_step": "Model Training",
        "step_details": summary,
        "estimated_time_remaining_seconds": estimated_time_remaining_seconds,
        "estimated_completion_time": estimated_completion_time,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": event_name,
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key=event_name,
        event_data=envelope,
    )

    if not published:
        logger.error(
            "Failed to publish product training completed event",
            job_id=job_id,
        )
        return published

    logger.info(
        "Published product training completed event",
        job_id=job_id,
        product_name=product_name,
        products_completed=products_completed,
        total_products=total_products,
    )
    return published


async def publish_training_completed(
    job_id: str,
    tenant_id: str,
    successful_trainings: int,
    failed_trainings: int,
    total_duration_seconds: float
) -> bool:
    """
    Event 4: Training Completed (100% progress)

    Sends a ``training.completed`` event on the ``training.events`` exchange
    with the success/failure counts and total duration, then logs the outcome.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        successful_trainings: Number of trainings that succeeded
        failed_trainings: Number of trainings that failed
        total_duration_seconds: Total duration of the job in seconds

    Returns:
        The result of ``training_publisher.publish_event`` (truthy on success).
    """
    event_name = "training.completed"
    emitted_at = datetime.now().isoformat()
    summary = f"Training completed: {successful_trainings} successful, {failed_trainings} failed"

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "progress": 100,
        "current_step": "Training Completed",
        "step_details": summary,
        "successful_trainings": successful_trainings,
        "failed_trainings": failed_trainings,
        "total_duration_seconds": total_duration_seconds,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": event_name,
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key=event_name,
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish training completed event", job_id=job_id)
        return published

    logger.info(
        "Published training completed event",
        job_id=job_id,
        successful_trainings=successful_trainings,
        failed_trainings=failed_trainings,
    )
    return published


async def publish_training_failed(
    job_id: str,
    tenant_id: str,
    error_message: str
) -> bool:
    """
    Event: Training Failed

    Sends a ``training.failed`` event on the ``training.events`` exchange
    carrying the error message, then logs whether publishing succeeded.

    Args:
        job_id: Training job identifier
        tenant_id: Tenant identifier
        error_message: Description of why the training failed

    Returns:
        The result of ``training_publisher.publish_event`` (truthy on success).
    """
    event_name = "training.failed"
    emitted_at = datetime.now().isoformat()

    payload = {
        "job_id": job_id,
        "tenant_id": tenant_id,
        "current_step": "Training Failed",
        "error_message": error_message,
    }
    envelope = {
        "service_name": "training-service",
        "event_type": event_name,
        "timestamp": emitted_at,
        "data": payload,
    }

    published = await training_publisher.publish_event(
        exchange_name="training.events",
        routing_key=event_name,
        event_data=envelope,
    )

    if not published:
        logger.error("Failed to publish training failed event", job_id=job_id)
        return published

    # Info level: this records that the failure notification itself went out.
    logger.info(
        "Published training failed event",
        job_id=job_id,
        error=error_message,
    )
    return published
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`"""`
			`Training Progress Events Publisher`
			`Simple, clean event publisher for the 4 main training steps`
			`"""`

			`import structlog`
			`from datetime import datetime`
			`from typing import Dict, Any, Optional`
New alert service 2025-12-05 20:07:01 +01:00			`from shared.messaging import RabbitMQClient`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`from app.core.config import settings`

			`logger = structlog.get_logger()`

			`# Single global publisher instance`
			`training_publisher = RabbitMQClient(settings.RABBITMQ_URL, "training-service")`


			`async def setup_messaging():`
			`"""Initialize messaging"""`
			`success = await training_publisher.connect()`
			`if success:`
			`logger.info("Training messaging initialized")`
			`else:`
			`logger.warning("Training messaging failed to initialize")`
			`return success`


			`async def cleanup_messaging():`
			`"""Cleanup messaging"""`
			`await training_publisher.disconnect()`
			`logger.info("Training messaging cleaned up")`


			`# ==========================================`
			`# 4 MAIN TRAINING PROGRESS EVENTS`
			`# ==========================================`

			`async def publish_training_started(`
			`job_id: str,`
			`tenant_id: str,`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`total_products: int,`
			`estimated_duration_minutes: Optional[int] = None,`
			`estimated_completion_time: Optional[str] = None`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`) -> bool:`
			`"""`
			`Event 1: Training Started (0% progress)`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00
			`Args:`
			`job_id: Training job identifier`
			`tenant_id: Tenant identifier`
			`total_products: Number of products to train`
			`estimated_duration_minutes: Estimated time to completion in minutes`
			`estimated_completion_time: ISO timestamp of estimated completion`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`"""`
			`event_data = {`
			`"service_name": "training-service",`
			`"event_type": "training.started",`
			`"timestamp": datetime.now().isoformat(),`
			`"data": {`
			`"job_id": job_id,`
			`"tenant_id": tenant_id,`
			`"progress": 0,`
			`"current_step": "Training Started",`
			`"step_details": f"Starting training for {total_products} products",`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`"total_products": total_products,`
			`"estimated_duration_minutes": estimated_duration_minutes,`
			`"estimated_completion_time": estimated_completion_time,`
			`"estimated_time_remaining_seconds": estimated_duration_minutes * 60 if estimated_duration_minutes else None`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`}`
			`}`

			`success = await training_publisher.publish_event(`
			`exchange_name="training.events",`
			`routing_key="training.started",`
			`event_data=event_data`
			`)`

			`if success:`
			`logger.info("Published training started event",`
			`job_id=job_id,`
			`tenant_id=tenant_id,`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`total_products=total_products,`
			`estimated_duration_minutes=estimated_duration_minutes)`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`else:`
			`logger.error("Failed to publish training started event", job_id=job_id)`

			`return success`


			`async def publish_data_analysis(`
			`job_id: str,`
			`tenant_id: str,`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`analysis_details: Optional[str] = None,`
Improve the sales import 2025-10-15 21:09:42 +02:00			`estimated_time_remaining_seconds: Optional[int] = None,`
			`estimated_completion_time: Optional[str] = None`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`) -> bool:`
			`"""`
			`Event 2: Data Analysis (20% progress)`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00
			`Args:`
			`job_id: Training job identifier`
			`tenant_id: Tenant identifier`
			`analysis_details: Details about the analysis`
			`estimated_time_remaining_seconds: Estimated time remaining in seconds`
Improve the sales import 2025-10-15 21:09:42 +02:00			`estimated_completion_time: ISO timestamp of estimated completion`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`"""`
			`event_data = {`
			`"service_name": "training-service",`
			`"event_type": "training.progress",`
			`"timestamp": datetime.now().isoformat(),`
			`"data": {`
			`"job_id": job_id,`
			`"tenant_id": tenant_id,`
			`"progress": 20,`
			`"current_step": "Data Analysis",`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`"step_details": analysis_details or "Analyzing sales, weather, and traffic data",`
Improve the sales import 2025-10-15 21:09:42 +02:00			`"estimated_time_remaining_seconds": estimated_time_remaining_seconds,`
			`"estimated_completion_time": estimated_completion_time`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`}`
			`}`

			`success = await training_publisher.publish_event(`
			`exchange_name="training.events",`
			`routing_key="training.progress",`
			`event_data=event_data`
			`)`

			`if success:`
			`logger.info("Published data analysis event",`
			`job_id=job_id,`
			`progress=20)`
			`else:`
			`logger.error("Failed to publish data analysis event", job_id=job_id)`

			`return success`


Fix multiple critical bugs in onboarding training step This commit addresses all identified bugs and issues in the training code path: ## Critical Fixes: - Add get_start_time() method to TrainingLogRepository and fix non-existent method call - Remove duplicate training.started event from API endpoint (trainer publishes the accurate one) - Add missing progress events for 80-100% range (85%, 92%, 94%) to eliminate progress "dead zone" ## High Priority Fixes: - Fix division by zero risk in time estimation with double-check and max() safety - Remove unreachable exception handler in training_operations.py - Simplify WebSocket token refresh logic to only reconnect on actual user session changes ## Medium Priority Fixes: - Fix auto-start training effect with useRef to prevent duplicate starts - Add HTTP polling debounce delay (5s) to prevent race conditions with WebSocket - Extract all magic numbers to centralized constants files: - Backend: services/training/app/core/training_constants.py - Frontend: frontend/src/constants/training.ts - Standardize error logging with exc_info=True on critical errors ## Code Quality Improvements: - All progress percentages now use named constants - All timeouts and intervals now use named constants - Improved code maintainability and readability - Better separation of concerns ## Files Changed: - Backend: training_service.py, trainer.py, training_events.py, progress_tracker.py - Backend: training_operations.py, training_log_repository.py, training_constants.py (new) - Frontend: training.ts (hooks), MLTrainingStep.tsx, training.ts (constants, new) All training progress events now properly flow from 0% to 100% with no gaps. 2025-11-05 13:02:39 +00:00			`async def publish_training_progress(`
			`job_id: str,`
			`tenant_id: str,`
			`progress: int,`
			`current_step: str,`
			`step_details: Optional[str] = None,`
			`estimated_time_remaining_seconds: Optional[int] = None,`
			`estimated_completion_time: Optional[str] = None`
			`) -> bool:`
			`"""`
			`Generic Training Progress Event (for any progress percentage)`

			`Args:`
			`job_id: Training job identifier`
			`tenant_id: Tenant identifier`
			`progress: Progress percentage (0-100)`
			`current_step: Current step name`
			`step_details: Details about the current step`
			`estimated_time_remaining_seconds: Estimated time remaining in seconds`
			`estimated_completion_time: ISO timestamp of estimated completion`
			`"""`
			`event_data = {`
			`"service_name": "training-service",`
			`"event_type": "training.progress",`
			`"timestamp": datetime.now().isoformat(),`
			`"data": {`
			`"job_id": job_id,`
			`"tenant_id": tenant_id,`
			`"progress": progress,`
			`"current_step": current_step,`
			`"step_details": step_details or current_step,`
			`"estimated_time_remaining_seconds": estimated_time_remaining_seconds,`
			`"estimated_completion_time": estimated_completion_time`
			`}`
			`}`

			`success = await training_publisher.publish_event(`
			`exchange_name="training.events",`
			`routing_key="training.progress",`
			`event_data=event_data`
			`)`

			`if success:`
			`logger.info("Published training progress event",`
			`job_id=job_id,`
			`progress=progress,`
			`current_step=current_step)`
			`else:`
			`logger.error("Failed to publish training progress event",`
			`job_id=job_id,`
			`progress=progress)`

			`return success`


REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`async def publish_product_training_completed(`
			`job_id: str,`
			`tenant_id: str,`
			`product_name: str,`
			`products_completed: int,`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`total_products: int,`
Improve the sales import 2025-10-15 21:09:42 +02:00			`estimated_time_remaining_seconds: Optional[int] = None,`
			`estimated_completion_time: Optional[str] = None`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`) -> bool:`
			`"""`
			`Event 3: Product Training Completed (contributes to 20-80% progress)`

			`This event is published each time a product training completes.`
			`The frontend/consumer will calculate the progress as:`
			`progress = 20 + (products_completed / total_products) * 60`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00
			`Args:`
			`job_id: Training job identifier`
			`tenant_id: Tenant identifier`
			`product_name: Name of the product that was trained`
			`products_completed: Number of products completed so far`
			`total_products: Total number of products`
			`estimated_time_remaining_seconds: Estimated time remaining in seconds`
Improve the sales import 2025-10-15 21:09:42 +02:00			`estimated_completion_time: ISO timestamp of estimated completion`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`"""`
			`event_data = {`
			`"service_name": "training-service",`
			`"event_type": "training.product.completed",`
			`"timestamp": datetime.now().isoformat(),`
			`"data": {`
			`"job_id": job_id,`
			`"tenant_id": tenant_id,`
			`"product_name": product_name,`
			`"products_completed": products_completed,`
			`"total_products": total_products,`
			`"current_step": "Model Training",`
Add role-based filtering and imporve code 2025-10-15 16:12:49 +02:00			`"step_details": f"Completed training for {product_name} ({products_completed}/{total_products})",`
Improve the sales import 2025-10-15 21:09:42 +02:00			`"estimated_time_remaining_seconds": estimated_time_remaining_seconds,`
			`"estimated_completion_time": estimated_completion_time`
REFACTOR external service and improve websocket training 2025-10-09 14:11:02 +02:00			`}`
			`}`

			`success = await training_publisher.publish_event(`
			`exchange_name="training.events",`
			`routing_key="training.product.completed",`
			`event_data=event_data`
			`)`

			`if success:`
			`logger.info("Published product training completed event",`
			`job_id=job_id,`
			`product_name=product_name,`
			`products_completed=products_completed,`
			`total_products=total_products)`
			`else:`
			`logger.error("Failed to publish product training completed event",`
			`job_id=job_id)`

			`return success`


			`async def publish_training_completed(`
			`job_id: str,`
			`tenant_id: str,`
			`successful_trainings: int,`
			`failed_trainings: int,`
			`total_duration_seconds: float`
			`) -> bool:`
			`"""`
			`Event 4: Training Completed (100% progress)`
			`"""`
			`event_data = {`
			`"service_name": "training-service",`
			`"event_type": "training.completed",`
			`"timestamp": datetime.now().isoformat(),`
			`"data": {`
			`"job_id": job_id,`
			`"tenant_id": tenant_id,`
			`"progress": 100,`
			`"current_step": "Training Completed",`
			`"step_details": f"Training completed: {successful_trainings} successful, {failed_trainings} failed",`
			`"successful_trainings": successful_trainings,`
			`"failed_trainings": failed_trainings,`
			`"total_duration_seconds": total_duration_seconds`
			`}`
			`}`

			`success = await training_publisher.publish_event(`
			`exchange_name="training.events",`
			`routing_key="training.completed",`
			`event_data=event_data`
			`)`

			`if success:`
			`logger.info("Published training completed event",`
			`job_id=job_id,`
			`successful_trainings=successful_trainings,`
			`failed_trainings=failed_trainings)`
			`else:`
			`logger.error("Failed to publish training completed event", job_id=job_id)`

			`return success`


			`async def publish_training_failed(`
			`job_id: str,`
			`tenant_id: str,`
			`error_message: str`
			`) -> bool:`
			`"""`
			`Event: Training Failed`
			`"""`
			`event_data = {`
			`"service_name": "training-service",`
			`"event_type": "training.failed",`
			`"timestamp": datetime.now().isoformat(),`
			`"data": {`
			`"job_id": job_id,`
			`"tenant_id": tenant_id,`
			`"current_step": "Training Failed",`
			`"error_message": error_message`
			`}`
			`}`

			`success = await training_publisher.publish_event(`
			`exchange_name="training.events",`
			`routing_key="training.failed",`
			`event_data=event_data`
			`)`

			`if success:`
			`logger.info("Published training failed event",`
			`job_id=job_id,`
			`error=error_message)`
			`else:`
			`logger.error("Failed to publish training failed event", job_id=job_id)`

			`return success`