New alert service

Urtzi Alfaro
2025-12-05 20:07:01 +01:00
parent 1fe3a73549
commit 667e6e0404
393 changed files with 26002 additions and 61033 deletions


@@ -0,0 +1,435 @@
"""
Training Event Consumer
Processes ML model retraining requests from RabbitMQ
Queues training jobs and manages model lifecycle
"""
import json
import structlog
from typing import Dict, Any, Optional
from datetime import datetime
from uuid import UUID
from shared.messaging import RabbitMQClient
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
logger = structlog.get_logger()
class TrainingEventConsumer:
"""
Consumes model retraining events and queues ML training jobs
Ensures no duplicate training jobs and manages priorities
"""
def __init__(self, db_session: AsyncSession):
self.db_session = db_session
async def consume_training_events(
self,
rabbitmq_client: RabbitMQClient
):
"""
Start consuming training events from RabbitMQ
"""
async def process_message(message):
"""Process a single training event message"""
try:
async with message.process():
# Parse event data
event_data = json.loads(message.body.decode())
logger.info(
"Received training event",
event_id=event_data.get('event_id'),
event_type=event_data.get('event_type'),
tenant_id=event_data.get('tenant_id')
)
# Process the event
await self.process_training_event(event_data)
except Exception as e:
logger.error(
"Error processing training event",
error=str(e),
exc_info=True
)
# Start consuming events
await rabbitmq_client.consume_events(
exchange_name="training.events",
queue_name="training.retraining.queue",
routing_key="training.retrain.*",
callback=process_message
)
logger.info("Started consuming training events")
async def process_training_event(self, event_data: Dict[str, Any]) -> bool:
"""
Process a training event based on type
Args:
event_data: Full event payload from RabbitMQ
Returns:
bool: True if processed successfully
"""
try:
event_type = event_data.get('event_type')
data = event_data.get('data', {})
tenant_id = event_data.get('tenant_id')
if not tenant_id:
logger.warning("Training event missing tenant_id", event_data=event_data)
return False
# Route to appropriate handler
if event_type == 'training.retrain.requested':
success = await self._handle_retrain_requested(tenant_id, data, event_data)
elif event_type == 'training.retrain.scheduled':
success = await self._handle_retrain_scheduled(tenant_id, data)
else:
logger.warning("Unknown training event type", event_type=event_type)
success = True # Mark as processed to avoid retry
if success:
logger.info(
"Training event processed successfully",
event_type=event_type,
tenant_id=tenant_id
)
else:
logger.error(
"Training event processing failed",
event_type=event_type,
tenant_id=tenant_id
)
return success
except Exception as e:
logger.error(
"Error in process_training_event",
error=str(e),
event_id=event_data.get('event_id'),
exc_info=True
)
return False
async def _handle_retrain_requested(
self,
tenant_id: str,
data: Dict[str, Any],
event_data: Dict[str, Any]
) -> bool:
"""
Handle retraining request event
Validates model, checks for existing jobs, queues training job
Args:
tenant_id: Tenant ID
data: Retraining request data
event_data: Full event payload
Returns:
bool: True if handled successfully
"""
try:
model_id = data.get('model_id')
product_id = data.get('product_id')
trigger_reason = data.get('trigger_reason', 'unknown')
priority = data.get('priority', 'normal')
event_id = event_data.get('event_id')
if not model_id:
logger.warning("Retraining request missing model_id", data=data)
return False
# Validate model exists
from app.models import TrainedModel
stmt = select(TrainedModel).where(
TrainedModel.id == UUID(model_id),
TrainedModel.tenant_id == UUID(tenant_id)
)
result = await self.db_session.execute(stmt)
model = result.scalar_one_or_none()
if not model:
logger.error(
"Model not found for retraining",
model_id=model_id,
tenant_id=tenant_id
)
return False
# Check if model is already in training
if model.status in ['training', 'retraining_queued']:
logger.info(
"Model already in training, skipping duplicate request",
model_id=model_id,
current_status=model.status
)
return True # Consider successful (idempotent)
# Check for existing job in queue
from app.models import TrainingJobQueue
existing_job_stmt = select(TrainingJobQueue).where(
TrainingJobQueue.model_id == UUID(model_id),
TrainingJobQueue.status.in_(['pending', 'running'])
)
existing_job_result = await self.db_session.execute(existing_job_stmt)
existing_job = existing_job_result.scalar_one_or_none()
if existing_job:
logger.info(
"Training job already queued, skipping duplicate",
model_id=model_id,
job_id=str(existing_job.id)
)
return True # Idempotent
# Queue training job
job_id = await self._queue_training_job(
tenant_id=tenant_id,
model_id=model_id,
product_id=product_id,
trigger_reason=trigger_reason,
priority=priority,
event_id=event_id,
metadata=data
)
if not job_id:
logger.error("Failed to queue training job", model_id=model_id)
return False
# Update model status
model.status = 'retraining_queued'
model.updated_at = datetime.utcnow()
await self.db_session.commit()
# Publish job queued event
await self._publish_job_queued_event(
tenant_id=tenant_id,
model_id=model_id,
job_id=job_id,
priority=priority
)
logger.info(
"Retraining job queued successfully",
model_id=model_id,
job_id=job_id,
trigger_reason=trigger_reason,
priority=priority
)
return True
except Exception as e:
await self.db_session.rollback()
logger.error(
"Error handling retrain requested",
error=str(e),
model_id=data.get('model_id'),
exc_info=True
)
return False
async def _handle_retrain_scheduled(
self,
tenant_id: str,
data: Dict[str, Any]
) -> bool:
"""
Handle scheduled retraining event
Similar to retrain_requested but for scheduled/batch retraining
Args:
tenant_id: Tenant ID
data: Scheduled retraining data
Returns:
bool: True if handled successfully
"""
try:
# Similar logic to _handle_retrain_requested
# but may have different priority or batching logic
logger.info(
"Handling scheduled retraining",
tenant_id=tenant_id,
model_count=len(data.get('models', []))
)
# For now, redirect to retrain_requested handler
success_count = 0
for model_data in data.get('models', []):
if await self._handle_retrain_requested(
tenant_id,
model_data,
{'event_id': data.get('schedule_id'), 'tenant_id': tenant_id}
):
success_count += 1
logger.info(
"Scheduled retraining processed",
tenant_id=tenant_id,
successful=success_count,
total=len(data.get('models', []))
)
return success_count > 0
except Exception as e:
logger.error(
"Error handling retrain scheduled",
error=str(e),
tenant_id=tenant_id,
exc_info=True
)
return False
async def _queue_training_job(
self,
tenant_id: str,
model_id: str,
product_id: str,
trigger_reason: str,
priority: str,
event_id: str,
metadata: Dict[str, Any]
) -> Optional[str]:
"""
Queue a training job in the database
Args:
tenant_id: Tenant ID
model_id: Model ID to retrain
product_id: Product ID
trigger_reason: Why retraining was triggered
priority: Job priority (low, normal, high)
event_id: Originating event ID
metadata: Additional job metadata
Returns:
Job ID if successful, None otherwise
"""
try:
from app.models import TrainingJobQueue
import uuid
# Map priority to numeric value for sorting
priority_map = {
'low': 1,
'normal': 2,
'high': 3,
'critical': 4
}
job = TrainingJobQueue(
id=uuid.uuid4(),
tenant_id=UUID(tenant_id),
model_id=UUID(model_id),
product_id=UUID(product_id) if product_id else None,
job_type='retrain',
status='pending',
priority=priority,
priority_score=priority_map.get(priority, 2),
trigger_reason=trigger_reason,
event_id=event_id,
metadata=metadata,
created_at=datetime.utcnow(),
scheduled_at=datetime.utcnow()
)
self.db_session.add(job)
await self.db_session.commit()
logger.info(
"Training job created",
job_id=str(job.id),
model_id=model_id,
priority=priority,
trigger_reason=trigger_reason
)
return str(job.id)
except Exception as e:
await self.db_session.rollback()
logger.error(
"Failed to queue training job",
model_id=model_id,
error=str(e),
exc_info=True
)
return None
async def _publish_job_queued_event(
self,
tenant_id: str,
model_id: str,
job_id: str,
priority: str
):
"""
Publish event that training job was queued
Args:
tenant_id: Tenant ID
model_id: Model ID
job_id: Training job ID
priority: Job priority
"""
try:
from shared.messaging import get_rabbitmq_client
import uuid
rabbitmq_client = get_rabbitmq_client()
if not rabbitmq_client:
logger.warning("RabbitMQ client not available for event publishing")
return
event_payload = {
"event_id": str(uuid.uuid4()),
"event_type": "training.retrain.queued",
"timestamp": datetime.utcnow().isoformat(),
"tenant_id": tenant_id,
"data": {
"job_id": job_id,
"model_id": model_id,
"priority": priority,
"status": "queued"
}
}
await rabbitmq_client.publish_event(
exchange_name="training.events",
routing_key="training.retrain.queued",
event_data=event_payload
)
logger.info(
"Published job queued event",
job_id=job_id,
event_id=event_payload["event_id"]
)
except Exception as e:
logger.error(
"Failed to publish job queued event",
job_id=job_id,
error=str(e)
)
# Don't fail the main operation if event publishing fails
# Factory function for creating consumer instance
def create_training_event_consumer(db_session: AsyncSession) -> TrainingEventConsumer:
"""Create training event consumer instance"""
return TrainingEventConsumer(db_session)
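# Minimal usage sketch (hypothetical wiring; `get_db_session` is an assumed
# helper -- only RabbitMQClient and this module's own names come from the code above):
#
#     async def start_consumer(rabbitmq_client: RabbitMQClient):
#         async with get_db_session() as session:
#             consumer = create_training_event_consumer(session)
#             await consumer.consume_training_events(rabbitmq_client)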


@@ -340,27 +340,85 @@ class ArtifactRepository(TrainingBaseRepository):
}
async def verify_artifact_integrity(self, artifact_id: int) -> Dict[str, Any]:
"""Verify artifact file integrity (placeholder for file system checks)"""
"""Verify artifact file integrity with actual file system checks"""
try:
import os
import hashlib
artifact = await self.get_by_id(artifact_id)
if not artifact:
return {"exists": False, "error": "Artifact not found"}
# Check if file exists
file_exists = os.path.exists(artifact.file_path)
if not file_exists:
return {
"artifact_id": artifact_id,
"file_path": artifact.file_path,
"exists": False,
"checksum_valid": False,
"size_valid": False,
"storage_location": artifact.storage_location,
"last_verified": datetime.now().isoformat(),
"error": "File does not exist on disk"
}
# Verify file size
actual_size = os.path.getsize(artifact.file_path)
size_valid = True
if artifact.file_size_bytes:
size_valid = (actual_size == artifact.file_size_bytes)
# Verify checksum if stored
checksum_valid = True
actual_checksum = None
if artifact.checksum:
# Calculate checksum of actual file
sha256_hash = hashlib.sha256()
try:
with open(artifact.file_path, "rb") as f:
# Read file in chunks to handle large files
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
actual_checksum = sha256_hash.hexdigest()
checksum_valid = (actual_checksum == artifact.checksum)
except Exception as checksum_error:
logger.error(f"Failed to calculate checksum: {checksum_error}")
checksum_valid = False
actual_checksum = None
# Overall integrity status
integrity_valid = file_exists and size_valid and checksum_valid
result = {
"artifact_id": artifact_id,
"file_path": artifact.file_path,
"exists": True, # Would check actual file existence
"checksum_valid": True, # Would verify actual checksum
"size_valid": True, # Would verify actual file size
"exists": file_exists,
"checksum_valid": checksum_valid,
"size_valid": size_valid,
"integrity_valid": integrity_valid,
"storage_location": artifact.storage_location,
"last_verified": datetime.now().isoformat()
"last_verified": datetime.now().isoformat(),
"details": {
"stored_size_bytes": artifact.file_size_bytes,
"actual_size_bytes": actual_size if file_exists else None,
"stored_checksum": artifact.checksum,
"actual_checksum": actual_checksum
}
}
}
if not integrity_valid:
issues = []
if not file_exists:
issues.append("file_missing")
if not size_valid:
issues.append("size_mismatch")
if not checksum_valid:
issues.append("checksum_mismatch")
result["issues"] = issues
return result
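# Abridged example of a failing verification result (illustrative values; the
# keys mirror the dict constructed above):
# {
#     "artifact_id": 42,
#     "exists": True,
#     "checksum_valid": False,
#     "size_valid": True,
#     "integrity_valid": False,
#     "issues": ["checksum_mismatch"],
#     "details": {"stored_checksum": "ab12...", "actual_checksum": "9f3c..."}
# }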
except Exception as e:
logger.error("Failed to verify artifact integrity",
artifact_id=artifact_id,
@@ -374,55 +432,124 @@ class ArtifactRepository(TrainingBaseRepository):
self,
from_location: str,
to_location: str,
tenant_id: str = None,
copy_only: bool = False,
verify: bool = True
) -> Dict[str, Any]:
"""Migrate artifacts from one storage location to another (placeholder)"""
"""Migrate artifacts from one storage location to another with actual file operations"""
try:
import os
import shutil
import hashlib
# Get artifacts to migrate
artifacts = await self.get_artifacts_by_storage_location(from_location, tenant_id)
migrated_count = 0
failed_count = 0
failed_artifacts = []
verified_count = 0
for artifact in artifacts:
try:
# Determine new file path
new_file_path = artifact.file_path.replace(from_location, to_location, 1)
# Create destination directory if it doesn't exist
dest_dir = os.path.dirname(new_file_path)
os.makedirs(dest_dir, exist_ok=True)
# Check if source file exists
if not os.path.exists(artifact.file_path):
logger.warning(f"Source file not found: {artifact.file_path}")
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": artifact.file_path,
"reason": "source_file_not_found"
})
continue
# Copy or move file
if copy_only:
shutil.copy2(artifact.file_path, new_file_path)
logger.debug(f"Copied file from {artifact.file_path} to {new_file_path}")
else:
shutil.move(artifact.file_path, new_file_path)
logger.debug(f"Moved file from {artifact.file_path} to {new_file_path}")
# Verify file was copied/moved successfully
if verify and os.path.exists(new_file_path):
# Verify file size
new_size = os.path.getsize(new_file_path)
if artifact.file_size_bytes and new_size != artifact.file_size_bytes:
logger.warning(f"File size mismatch after migration: {new_file_path}")
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": new_file_path,
"reason": "size_mismatch_after_migration"
})
continue
# Verify checksum if available
if artifact.checksum:
sha256_hash = hashlib.sha256()
with open(new_file_path, "rb") as f:
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
new_checksum = sha256_hash.hexdigest()
if new_checksum != artifact.checksum:
logger.warning(f"Checksum mismatch after migration: {new_file_path}")
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": new_file_path,
"reason": "checksum_mismatch_after_migration"
})
continue
verified_count += 1
# Update database with new location
await self.update(artifact.id, {
"storage_location": to_location,
"file_path": new_file_path
})
migrated_count += 1
except Exception as migration_error:
logger.error("Failed to migrate artifact",
artifact_id=artifact.id,
error=str(migration_error))
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": artifact.file_path,
"reason": str(migration_error)
})
logger.info("Artifact migration completed",
from_location=from_location,
to_location=to_location,
migrated_count=migrated_count,
failed_count=failed_count,
verified_count=verified_count)
return {
"from_location": from_location,
"to_location": to_location,
"total_artifacts": len(artifacts),
"migrated_count": migrated_count,
"failed_count": failed_count,
"success_rate": round((migrated_count / len(artifacts)) * 100, 2) if artifacts else 100
"verified_count": verified_count if verify else None,
"success_rate": round((migrated_count / len(artifacts)) * 100, 2) if artifacts else 100,
"copy_only": copy_only,
"failed_artifacts": failed_artifacts if failed_artifacts else None
}
except Exception as e:
logger.error("Failed to migrate artifacts",
from_location=from_location,

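A minimal call sketch for the storage migration method above (its actual name sits
outside this excerpt, so `migrate_artifacts_by_location` is a stand-in; the
repository handle and path prefixes are illustrative, only the parameters come
from the signature shown):

    result = await artifact_repo.migrate_artifacts_by_location(  # hypothetical name
        from_location="/data/artifacts/local",
        to_location="/mnt/shared/artifacts",
        copy_only=True,   # copy first; source files kept until verification passes
        verify=True
    )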

@@ -135,26 +135,45 @@ class TrainingTenantDeletionService(BaseTenantDataDeletionService):
result = TenantDataDeletionResult(tenant_id=tenant_id, service_name=self.service_name)
try:
import os
# Step 1: Delete model artifacts (references models)
logger.info("training.tenant_deletion.deleting_artifacts", tenant_id=tenant_id)
# Delete physical files from storage before deleting DB records
artifacts = await self.db.execute(
select(ModelArtifact).where(ModelArtifact.tenant_id == tenant_id)
)
deleted_files = 0
failed_files = 0
for artifact in artifacts.scalars():
try:
if artifact.file_path and os.path.exists(artifact.file_path):
os.remove(artifact.file_path)
deleted_files += 1
logger.info("Deleted artifact file",
path=artifact.file_path,
artifact_id=artifact.id)
except Exception as e:
failed_files += 1
logger.warning("Failed to delete artifact file",
path=artifact.file_path,
artifact_id=artifact.id if hasattr(artifact, 'id') else 'unknown',
error=str(e))
logger.info("Artifact files deletion complete",
deleted_files=deleted_files,
failed_files=failed_files)
# Now delete DB records
artifacts_result = await self.db.execute(
delete(ModelArtifact).where(
ModelArtifact.tenant_id == tenant_id
)
)
result.deleted_counts["model_artifacts"] = artifacts_result.rowcount
result.deleted_counts["artifact_files_deleted"] = deleted_files
result.deleted_counts["artifact_files_failed"] = failed_files
logger.info(
"training.tenant_deletion.artifacts_deleted",
tenant_id=tenant_id,
@@ -206,26 +225,54 @@ class TrainingTenantDeletionService(BaseTenantDataDeletionService):
# Step 5: Delete trained models (parent records)
logger.info("training.tenant_deletion.deleting_models", tenant_id=tenant_id)
# Delete physical model files (.pkl) before deleting DB records
models = await self.db.execute(
select(TrainedModel).where(TrainedModel.tenant_id == tenant_id)
)
deleted_model_files = 0
failed_model_files = 0
for model in models.scalars():
try:
# Delete .pkl file
if hasattr(model, 'model_path') and model.model_path and os.path.exists(model.model_path):
os.remove(model.model_path)
deleted_model_files += 1
logger.info("Deleted model file",
path=model.model_path,
model_id=model.id)
# Delete model_file_path if it exists
if hasattr(model, 'model_file_path') and model.model_file_path and os.path.exists(model.model_file_path):
os.remove(model.model_file_path)
deleted_model_files += 1
logger.info("Deleted model file",
path=model.model_file_path,
model_id=model.id)
# Delete metadata file if exists
if hasattr(model, 'metadata_path') and model.metadata_path and os.path.exists(model.metadata_path):
os.remove(model.metadata_path)
logger.info("Deleted metadata file",
path=model.metadata_path,
model_id=model.id)
except Exception as e:
failed_model_files += 1
logger.warning("Failed to delete model file",
path=getattr(model, 'model_path', getattr(model, 'model_file_path', 'unknown')),
model_id=model.id if hasattr(model, 'id') else 'unknown',
error=str(e))
logger.info("Model files deletion complete",
deleted_files=deleted_model_files,
failed_files=failed_model_files)
# Now delete DB records
models_result = await self.db.execute(
delete(TrainedModel).where(
TrainedModel.tenant_id == tenant_id
)
)
result.deleted_counts["trained_models"] = models_result.rowcount
result.deleted_counts["model_files_deleted"] = deleted_model_files
result.deleted_counts["model_files_failed"] = failed_model_files
logger.info(
"training.tenant_deletion.models_deleted",
tenant_id=tenant_id,


@@ -6,7 +6,7 @@ Simple, clean event publisher for the 4 main training steps
import structlog
from datetime import datetime
from typing import Dict, Any, Optional
from shared.messaging import RabbitMQClient
from app.core.config import settings
logger = structlog.get_logger()