New alert service

Urtzi Alfaro
2025-12-05 20:07:01 +01:00
parent 1fe3a73549
commit 667e6e0404
393 changed files with 26002 additions and 61033 deletions


@@ -0,0 +1,435 @@
"""
Training Event Consumer
Processes ML model retraining requests from RabbitMQ
Queues training jobs and manages model lifecycle
"""
import json
import structlog
from typing import Dict, Any, Optional
from datetime import datetime
from uuid import UUID
from shared.messaging import RabbitMQClient
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select
logger = structlog.get_logger()
class TrainingEventConsumer:
"""
Consumes model retraining events and queues ML training jobs
Ensures no duplicate training jobs and manages priorities
"""
def __init__(self, db_session: AsyncSession):
self.db_session = db_session
async def consume_training_events(
self,
rabbitmq_client: RabbitMQClient
):
"""
Start consuming training events from RabbitMQ
"""
async def process_message(message):
"""Process a single training event message"""
try:
async with message.process():
# Parse event data
event_data = json.loads(message.body.decode())
logger.info(
"Received training event",
event_id=event_data.get('event_id'),
event_type=event_data.get('event_type'),
tenant_id=event_data.get('tenant_id')
)
# Process the event
await self.process_training_event(event_data)
except Exception as e:
logger.error(
"Error processing training event",
error=str(e),
exc_info=True
)
# Start consuming events
await rabbitmq_client.consume_events(
exchange_name="training.events",
queue_name="training.retraining.queue",
routing_key="training.retrain.*",
callback=process_message
)
logger.info("Started consuming training events")
async def process_training_event(self, event_data: Dict[str, Any]) -> bool:
"""
Process a training event based on type
Args:
event_data: Full event payload from RabbitMQ
Returns:
bool: True if processed successfully
"""
try:
event_type = event_data.get('event_type')
data = event_data.get('data', {})
tenant_id = event_data.get('tenant_id')
if not tenant_id:
logger.warning("Training event missing tenant_id", event_data=event_data)
return False
# Route to appropriate handler
if event_type == 'training.retrain.requested':
success = await self._handle_retrain_requested(tenant_id, data, event_data)
elif event_type == 'training.retrain.scheduled':
success = await self._handle_retrain_scheduled(tenant_id, data)
else:
logger.warning("Unknown training event type", event_type=event_type)
success = True # Mark as processed to avoid retry
if success:
logger.info(
"Training event processed successfully",
event_type=event_type,
tenant_id=tenant_id
)
else:
logger.error(
"Training event processing failed",
event_type=event_type,
tenant_id=tenant_id
)
return success
except Exception as e:
logger.error(
"Error in process_training_event",
error=str(e),
event_id=event_data.get('event_id'),
exc_info=True
)
return False
async def _handle_retrain_requested(
self,
tenant_id: str,
data: Dict[str, Any],
event_data: Dict[str, Any]
) -> bool:
"""
Handle retraining request event
Validates model, checks for existing jobs, queues training job
Args:
tenant_id: Tenant ID
data: Retraining request data
event_data: Full event payload
Returns:
bool: True if handled successfully
"""
try:
model_id = data.get('model_id')
product_id = data.get('product_id')
trigger_reason = data.get('trigger_reason', 'unknown')
priority = data.get('priority', 'normal')
event_id = event_data.get('event_id')
if not model_id:
logger.warning("Retraining request missing model_id", data=data)
return False
# Validate model exists
from app.models import TrainedModel
stmt = select(TrainedModel).where(
TrainedModel.id == UUID(model_id),
TrainedModel.tenant_id == UUID(tenant_id)
)
result = await self.db_session.execute(stmt)
model = result.scalar_one_or_none()
if not model:
logger.error(
"Model not found for retraining",
model_id=model_id,
tenant_id=tenant_id
)
return False
# Check if model is already in training
if model.status in ['training', 'retraining_queued']:
logger.info(
"Model already in training, skipping duplicate request",
model_id=model_id,
current_status=model.status
)
return True # Consider successful (idempotent)
# Check for existing job in queue
from app.models import TrainingJobQueue
existing_job_stmt = select(TrainingJobQueue).where(
TrainingJobQueue.model_id == UUID(model_id),
TrainingJobQueue.status.in_(['pending', 'running'])
)
existing_job_result = await self.db_session.execute(existing_job_stmt)
existing_job = existing_job_result.scalar_one_or_none()
if existing_job:
logger.info(
"Training job already queued, skipping duplicate",
model_id=model_id,
job_id=str(existing_job.id)
)
return True # Idempotent
# Queue training job
job_id = await self._queue_training_job(
tenant_id=tenant_id,
model_id=model_id,
product_id=product_id,
trigger_reason=trigger_reason,
priority=priority,
event_id=event_id,
metadata=data
)
if not job_id:
logger.error("Failed to queue training job", model_id=model_id)
return False
# Update model status
model.status = 'retraining_queued'
model.updated_at = datetime.utcnow()
await self.db_session.commit()
# Publish job queued event
await self._publish_job_queued_event(
tenant_id=tenant_id,
model_id=model_id,
job_id=job_id,
priority=priority
)
logger.info(
"Retraining job queued successfully",
model_id=model_id,
job_id=job_id,
trigger_reason=trigger_reason,
priority=priority
)
return True
except Exception as e:
await self.db_session.rollback()
logger.error(
"Error handling retrain requested",
error=str(e),
model_id=data.get('model_id'),
exc_info=True
)
return False
async def _handle_retrain_scheduled(
self,
tenant_id: str,
data: Dict[str, Any]
) -> bool:
"""
Handle scheduled retraining event
Similar to retrain_requested but for scheduled/batch retraining
Args:
tenant_id: Tenant ID
data: Scheduled retraining data
Returns:
bool: True if handled successfully
"""
try:
# Similar logic to _handle_retrain_requested
# but may have different priority or batching logic
logger.info(
"Handling scheduled retraining",
tenant_id=tenant_id,
model_count=len(data.get('models', []))
)
# For now, redirect to retrain_requested handler
success_count = 0
for model_data in data.get('models', []):
if await self._handle_retrain_requested(
tenant_id,
model_data,
{'event_id': data.get('schedule_id'), 'tenant_id': tenant_id}
):
success_count += 1
logger.info(
"Scheduled retraining processed",
tenant_id=tenant_id,
successful=success_count,
total=len(data.get('models', []))
)
return success_count > 0
except Exception as e:
logger.error(
"Error handling retrain scheduled",
error=str(e),
tenant_id=tenant_id,
exc_info=True
)
return False
async def _queue_training_job(
self,
tenant_id: str,
model_id: str,
product_id: str,
trigger_reason: str,
priority: str,
event_id: str,
metadata: Dict[str, Any]
) -> Optional[str]:
"""
Queue a training job in the database
Args:
tenant_id: Tenant ID
model_id: Model ID to retrain
product_id: Product ID
trigger_reason: Why retraining was triggered
priority: Job priority (low, normal, high)
event_id: Originating event ID
metadata: Additional job metadata
Returns:
Job ID if successful, None otherwise
"""
try:
from app.models import TrainingJobQueue
import uuid
# Map priority to numeric value for sorting
priority_map = {
'low': 1,
'normal': 2,
'high': 3,
'critical': 4
}
job = TrainingJobQueue(
id=uuid.uuid4(),
tenant_id=UUID(tenant_id),
model_id=UUID(model_id),
product_id=UUID(product_id) if product_id else None,
job_type='retrain',
status='pending',
priority=priority,
priority_score=priority_map.get(priority, 2),
trigger_reason=trigger_reason,
event_id=event_id,
metadata=metadata,
created_at=datetime.utcnow(),
scheduled_at=datetime.utcnow()
)
self.db_session.add(job)
await self.db_session.commit()
logger.info(
"Training job created",
job_id=str(job.id),
model_id=model_id,
priority=priority,
trigger_reason=trigger_reason
)
return str(job.id)
except Exception as e:
await self.db_session.rollback()
logger.error(
"Failed to queue training job",
model_id=model_id,
error=str(e),
exc_info=True
)
return None
async def _publish_job_queued_event(
self,
tenant_id: str,
model_id: str,
job_id: str,
priority: str
):
"""
Publish event that training job was queued
Args:
tenant_id: Tenant ID
model_id: Model ID
job_id: Training job ID
priority: Job priority
"""
try:
from shared.messaging import get_rabbitmq_client
import uuid
rabbitmq_client = get_rabbitmq_client()
if not rabbitmq_client:
logger.warning("RabbitMQ client not available for event publishing")
return
event_payload = {
"event_id": str(uuid.uuid4()),
"event_type": "training.retrain.queued",
"timestamp": datetime.utcnow().isoformat(),
"tenant_id": tenant_id,
"data": {
"job_id": job_id,
"model_id": model_id,
"priority": priority,
"status": "queued"
}
}
await rabbitmq_client.publish_event(
exchange_name="training.events",
routing_key="training.retrain.queued",
event_data=event_payload
)
logger.info(
"Published job queued event",
job_id=job_id,
event_id=event_payload["event_id"]
)
except Exception as e:
logger.error(
"Failed to publish job queued event",
job_id=job_id,
error=str(e)
)
# Don't fail the main operation if event publishing fails
# Factory function for creating consumer instance
def create_training_event_consumer(db_session: AsyncSession) -> TrainingEventConsumer:
"""Create training event consumer instance"""
return TrainingEventConsumer(db_session)
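# Minimal usage sketch (hypothetical wiring; `get_db_session` is an assumed
# helper -- only RabbitMQClient and this module's own names come from the code above):
#
#     async def start_consumer(rabbitmq_client: RabbitMQClient):
#         async with get_db_session() as session:
#             consumer = create_training_event_consumer(session)
#             await consumer.consume_training_events(rabbitmq_client)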


@@ -340,27 +340,85 @@ class ArtifactRepository(TrainingBaseRepository):
}
async def verify_artifact_integrity(self, artifact_id: int) -> Dict[str, Any]:
"""Verify artifact file integrity (placeholder for file system checks)"""
"""Verify artifact file integrity with actual file system checks"""
try:
import os
import hashlib
artifact = await self.get_by_id(artifact_id)
if not artifact:
return {"exists": False, "error": "Artifact not found"}
# Check if file exists
file_exists = os.path.exists(artifact.file_path)
if not file_exists:
return {
"artifact_id": artifact_id,
"file_path": artifact.file_path,
"exists": False,
"checksum_valid": False,
"size_valid": False,
"storage_location": artifact.storage_location,
"last_verified": datetime.now().isoformat(),
"error": "File does not exist on disk"
}
# Verify file size
actual_size = os.path.getsize(artifact.file_path)
size_valid = True
if artifact.file_size_bytes:
size_valid = (actual_size == artifact.file_size_bytes)
# Verify checksum if stored
checksum_valid = True
actual_checksum = None
if artifact.checksum:
# Calculate checksum of actual file
sha256_hash = hashlib.sha256()
try:
with open(artifact.file_path, "rb") as f:
# Read file in chunks to handle large files
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
actual_checksum = sha256_hash.hexdigest()
checksum_valid = (actual_checksum == artifact.checksum)
except Exception as checksum_error:
logger.error(f"Failed to calculate checksum: {checksum_error}")
checksum_valid = False
actual_checksum = None
# Overall integrity status
integrity_valid = file_exists and size_valid and checksum_valid
result = {
"artifact_id": artifact_id,
"file_path": artifact.file_path,
"exists": True, # Would check actual file existence
"checksum_valid": True, # Would verify actual checksum
"size_valid": True, # Would verify actual file size
"exists": file_exists,
"checksum_valid": checksum_valid,
"size_valid": size_valid,
"integrity_valid": integrity_valid,
"storage_location": artifact.storage_location,
"last_verified": datetime.now().isoformat()
"last_verified": datetime.now().isoformat(),
"details": {
"stored_size_bytes": artifact.file_size_bytes,
"actual_size_bytes": actual_size if file_exists else None,
"stored_checksum": artifact.checksum,
"actual_checksum": actual_checksum
}
}
}
if not integrity_valid:
issues = []
if not file_exists:
issues.append("file_missing")
if not size_valid:
issues.append("size_mismatch")
if not checksum_valid:
issues.append("checksum_mismatch")
result["issues"] = issues
return result
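# Abridged example of a failing verification result (illustrative values; the
# keys mirror the dict constructed above):
# {
#     "artifact_id": 42,
#     "exists": True,
#     "checksum_valid": False,
#     "size_valid": True,
#     "integrity_valid": False,
#     "issues": ["checksum_mismatch"],
#     "details": {"stored_checksum": "ab12...", "actual_checksum": "9f3c..."}
# }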
except Exception as e:
logger.error("Failed to verify artifact integrity",
artifact_id=artifact_id,
@@ -374,55 +432,124 @@ class ArtifactRepository(TrainingBaseRepository):
self,
from_location: str,
to_location: str,
tenant_id: str = None,
copy_only: bool = False,
verify: bool = True
) -> Dict[str, Any]:
"""Migrate artifacts from one storage location to another (placeholder)"""
"""Migrate artifacts from one storage location to another with actual file operations"""
try:
import os
import shutil
import hashlib
# Get artifacts to migrate
artifacts = await self.get_artifacts_by_storage_location(from_location, tenant_id)
migrated_count = 0
failed_count = 0
failed_artifacts = []
verified_count = 0
for artifact in artifacts:
try:
# Determine new file path
new_file_path = artifact.file_path.replace(from_location, to_location, 1)
# Create destination directory if it doesn't exist
dest_dir = os.path.dirname(new_file_path)
os.makedirs(dest_dir, exist_ok=True)
# Check if source file exists
if not os.path.exists(artifact.file_path):
logger.warning(f"Source file not found: {artifact.file_path}")
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": artifact.file_path,
"reason": "source_file_not_found"
})
continue
# Copy or move file
if copy_only:
shutil.copy2(artifact.file_path, new_file_path)
logger.debug(f"Copied file from {artifact.file_path} to {new_file_path}")
else:
shutil.move(artifact.file_path, new_file_path)
logger.debug(f"Moved file from {artifact.file_path} to {new_file_path}")
# Verify file was copied/moved successfully
if verify and os.path.exists(new_file_path):
# Verify file size
new_size = os.path.getsize(new_file_path)
if artifact.file_size_bytes and new_size != artifact.file_size_bytes:
logger.warning(f"File size mismatch after migration: {new_file_path}")
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": new_file_path,
"reason": "size_mismatch_after_migration"
})
continue
# Verify checksum if available
if artifact.checksum:
sha256_hash = hashlib.sha256()
with open(new_file_path, "rb") as f:
for byte_block in iter(lambda: f.read(4096), b""):
sha256_hash.update(byte_block)
new_checksum = sha256_hash.hexdigest()
if new_checksum != artifact.checksum:
logger.warning(f"Checksum mismatch after migration: {new_file_path}")
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": new_file_path,
"reason": "checksum_mismatch_after_migration"
})
continue
verified_count += 1
# Update database with new location
await self.update(artifact.id, {
"storage_location": to_location,
"file_path": new_file_path
})
migrated_count += 1
except Exception as migration_error:
logger.error("Failed to migrate artifact",
artifact_id=artifact.id,
error=str(migration_error))
failed_count += 1
failed_artifacts.append({
"artifact_id": artifact.id,
"file_path": artifact.file_path,
"reason": str(migration_error)
})
logger.info("Artifact migration completed",
from_location=from_location,
to_location=to_location,
migrated_count=migrated_count,
failed_count=failed_count,
verified_count=verified_count)
return {
"from_location": from_location,
"to_location": to_location,
"total_artifacts": len(artifacts),
"migrated_count": migrated_count,
"failed_count": failed_count,
"success_rate": round((migrated_count / len(artifacts)) * 100, 2) if artifacts else 100
"verified_count": verified_count if verify else None,
"success_rate": round((migrated_count / len(artifacts)) * 100, 2) if artifacts else 100,
"copy_only": copy_only,
"failed_artifacts": failed_artifacts if failed_artifacts else None
}
except Exception as e:
logger.error("Failed to migrate artifacts",
from_location=from_location,

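A minimal call sketch for the storage migration method above (its actual name sits
outside this excerpt, so `migrate_artifacts_by_location` is a stand-in; the
repository handle and path prefixes are illustrative, only the parameters come
from the signature shown):

    result = await artifact_repo.migrate_artifacts_by_location(  # hypothetical name
        from_location="/data/artifacts/local",
        to_location="/mnt/shared/artifacts",
        copy_only=True,   # copy first; source files kept until verification passes
        verify=True
    )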

@@ -135,26 +135,45 @@ class TrainingTenantDeletionService(BaseTenantDataDeletionService):
result = TenantDataDeletionResult(tenant_id=tenant_id, service_name=self.service_name)
try:
import os
# Step 1: Delete model artifacts (references models)
logger.info("training.tenant_deletion.deleting_artifacts", tenant_id=tenant_id)
# Delete physical files from storage before deleting DB records
artifacts = await self.db.execute(
select(ModelArtifact).where(ModelArtifact.tenant_id == tenant_id)
)
deleted_files = 0
failed_files = 0
for artifact in artifacts.scalars():
try:
if artifact.file_path and os.path.exists(artifact.file_path):
os.remove(artifact.file_path)
deleted_files += 1
logger.info("Deleted artifact file",
path=artifact.file_path,
artifact_id=artifact.id)
except Exception as e:
failed_files += 1
logger.warning("Failed to delete artifact file",
path=artifact.file_path,
artifact_id=artifact.id if hasattr(artifact, 'id') else 'unknown',
error=str(e))
logger.info("Artifact files deletion complete",
deleted_files=deleted_files,
failed_files=failed_files)
# Now delete DB records
artifacts_result = await self.db.execute(
delete(ModelArtifact).where(
ModelArtifact.tenant_id == tenant_id
)
)
result.deleted_counts["model_artifacts"] = artifacts_result.rowcount
result.deleted_counts["artifact_files_deleted"] = deleted_files
result.deleted_counts["artifact_files_failed"] = failed_files
logger.info(
"training.tenant_deletion.artifacts_deleted",
tenant_id=tenant_id,
@@ -206,26 +225,54 @@ class TrainingTenantDeletionService(BaseTenantDataDeletionService):
# Step 5: Delete trained models (parent records)
logger.info("training.tenant_deletion.deleting_models", tenant_id=tenant_id)
# Delete physical model files (.pkl) before deleting DB records
models = await self.db.execute(
select(TrainedModel).where(TrainedModel.tenant_id == tenant_id)
)
deleted_model_files = 0
failed_model_files = 0
for model in models.scalars():
try:
# Delete .pkl file
if hasattr(model, 'model_path') and model.model_path and os.path.exists(model.model_path):
os.remove(model.model_path)
deleted_model_files += 1
logger.info("Deleted model file",
path=model.model_path,
model_id=model.id)
# Delete model_file_path if it exists
if hasattr(model, 'model_file_path') and model.model_file_path and os.path.exists(model.model_file_path):
os.remove(model.model_file_path)
deleted_model_files += 1
logger.info("Deleted model file",
path=model.model_file_path,
model_id=model.id)
# Delete metadata file if exists
if hasattr(model, 'metadata_path') and model.metadata_path and os.path.exists(model.metadata_path):
os.remove(model.metadata_path)
logger.info("Deleted metadata file",
path=model.metadata_path,
model_id=model.id)
except Exception as e:
failed_model_files += 1
logger.warning("Failed to delete model file",
path=getattr(model, 'model_path', getattr(model, 'model_file_path', 'unknown')),
model_id=model.id if hasattr(model, 'id') else 'unknown',
error=str(e))
logger.info("Model files deletion complete",
deleted_files=deleted_model_files,
failed_files=failed_model_files)
# Now delete DB records
models_result = await self.db.execute(
delete(TrainedModel).where(
TrainedModel.tenant_id == tenant_id
)
)
result.deleted_counts["trained_models"] = models_result.rowcount
result.deleted_counts["model_files_deleted"] = deleted_model_files
result.deleted_counts["model_files_failed"] = failed_model_files
logger.info(
"training.tenant_deletion.models_deleted",
tenant_id=tenant_id,


@@ -6,7 +6,7 @@ Simple, clean event publisher for the 4 main training steps
import structlog
from datetime import datetime
from typing import Dict, Any, Optional
from shared.messaging import RabbitMQClient
from app.core.config import settings
logger = structlog.get_logger()