New alert service
435
services/training/app/consumers/training_event_consumer.py
Normal file
@@ -0,0 +1,435 @@
"""
Training Event Consumer
Processes ML model retraining requests from RabbitMQ
Queues training jobs and manages model lifecycle
"""
import json
import structlog
from typing import Dict, Any, Optional
from datetime import datetime
from uuid import UUID

from shared.messaging import RabbitMQClient
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select

logger = structlog.get_logger()


class TrainingEventConsumer:
    """
    Consumes training retraining events and queues ML training jobs
    Ensures no duplicate training jobs and manages priorities
    """

    def __init__(self, db_session: AsyncSession):
        self.db_session = db_session

    async def consume_training_events(
        self,
        rabbitmq_client: RabbitMQClient
    ):
        """
        Start consuming training events from RabbitMQ
        """
        async def process_message(message):
            """Process a single training event message"""
            try:
                async with message.process():
                    # Parse event data
                    event_data = json.loads(message.body.decode())
                    logger.info(
                        "Received training event",
                        event_id=event_data.get('event_id'),
                        event_type=event_data.get('event_type'),
                        tenant_id=event_data.get('tenant_id')
                    )

                    # Process the event
                    await self.process_training_event(event_data)

            except Exception as e:
                logger.error(
                    "Error processing training event",
                    error=str(e),
                    exc_info=True
                )

        # Start consuming events
        await rabbitmq_client.consume_events(
            exchange_name="training.events",
            queue_name="training.retraining.queue",
            routing_key="training.retrain.*",
            callback=process_message
        )

        logger.info("Started consuming training events")

    async def process_training_event(self, event_data: Dict[str, Any]) -> bool:
        """
        Process a training event based on type

        Args:
            event_data: Full event payload from RabbitMQ

        Returns:
            bool: True if processed successfully
        """
        try:
            event_type = event_data.get('event_type')
            data = event_data.get('data', {})
            tenant_id = event_data.get('tenant_id')

            if not tenant_id:
                logger.warning("Training event missing tenant_id", event_data=event_data)
                return False

            # Route to appropriate handler
            if event_type == 'training.retrain.requested':
                success = await self._handle_retrain_requested(tenant_id, data, event_data)
            elif event_type == 'training.retrain.scheduled':
                success = await self._handle_retrain_scheduled(tenant_id, data)
            else:
                logger.warning("Unknown training event type", event_type=event_type)
                success = True  # Mark as processed to avoid retry

            if success:
                logger.info(
                    "Training event processed successfully",
                    event_type=event_type,
                    tenant_id=tenant_id
                )
            else:
                logger.error(
                    "Training event processing failed",
                    event_type=event_type,
                    tenant_id=tenant_id
                )

            return success

        except Exception as e:
            logger.error(
                "Error in process_training_event",
                error=str(e),
                event_id=event_data.get('event_id'),
                exc_info=True
            )
            return False

    async def _handle_retrain_requested(
        self,
        tenant_id: str,
        data: Dict[str, Any],
        event_data: Dict[str, Any]
    ) -> bool:
        """
        Handle retraining request event

        Validates model, checks for existing jobs, queues training job

        Args:
            tenant_id: Tenant ID
            data: Retraining request data
            event_data: Full event payload

        Returns:
            bool: True if handled successfully
        """
        try:
            model_id = data.get('model_id')
            product_id = data.get('product_id')
            trigger_reason = data.get('trigger_reason', 'unknown')
            priority = data.get('priority', 'normal')
            event_id = event_data.get('event_id')

            if not model_id:
                logger.warning("Retraining request missing model_id", data=data)
                return False

            # Validate model exists
            from app.models import TrainedModel

            stmt = select(TrainedModel).where(
                TrainedModel.id == UUID(model_id),
                TrainedModel.tenant_id == UUID(tenant_id)
            )
            result = await self.db_session.execute(stmt)
            model = result.scalar_one_or_none()

            if not model:
                logger.error(
                    "Model not found for retraining",
                    model_id=model_id,
                    tenant_id=tenant_id
                )
                return False

            # Check if model is already in training
            if model.status in ['training', 'retraining_queued']:
                logger.info(
                    "Model already in training, skipping duplicate request",
                    model_id=model_id,
                    current_status=model.status
                )
                return True  # Consider successful (idempotent)

            # Check for existing job in queue
            from app.models import TrainingJobQueue

            existing_job_stmt = select(TrainingJobQueue).where(
                TrainingJobQueue.model_id == UUID(model_id),
                TrainingJobQueue.status.in_(['pending', 'running'])
            )
            existing_job_result = await self.db_session.execute(existing_job_stmt)
            existing_job = existing_job_result.scalar_one_or_none()

            if existing_job:
                logger.info(
                    "Training job already queued, skipping duplicate",
                    model_id=model_id,
                    job_id=str(existing_job.id)
                )
                return True  # Idempotent

            # Queue training job
            job_id = await self._queue_training_job(
                tenant_id=tenant_id,
                model_id=model_id,
                product_id=product_id,
                trigger_reason=trigger_reason,
                priority=priority,
                event_id=event_id,
                metadata=data
            )

            if not job_id:
                logger.error("Failed to queue training job", model_id=model_id)
                return False

            # Update model status
            model.status = 'retraining_queued'
            model.updated_at = datetime.utcnow()
            await self.db_session.commit()

            # Publish job queued event
            await self._publish_job_queued_event(
                tenant_id=tenant_id,
                model_id=model_id,
                job_id=job_id,
                priority=priority
            )

            logger.info(
                "Retraining job queued successfully",
                model_id=model_id,
                job_id=job_id,
                trigger_reason=trigger_reason,
                priority=priority
            )

            return True

        except Exception as e:
            await self.db_session.rollback()
            logger.error(
                "Error handling retrain requested",
                error=str(e),
                model_id=data.get('model_id'),
                exc_info=True
            )
            return False

    async def _handle_retrain_scheduled(
        self,
        tenant_id: str,
        data: Dict[str, Any]
    ) -> bool:
        """
        Handle scheduled retraining event

        Similar to retrain_requested but for scheduled/batch retraining

        Args:
            tenant_id: Tenant ID
            data: Scheduled retraining data

        Returns:
            bool: True if handled successfully
        """
        try:
            # Similar logic to _handle_retrain_requested
            # but may have different priority or batching logic
            logger.info(
                "Handling scheduled retraining",
                tenant_id=tenant_id,
                model_count=len(data.get('models', []))
            )

            # For now, redirect to retrain_requested handler
            success_count = 0
            for model_data in data.get('models', []):
                if await self._handle_retrain_requested(
                    tenant_id,
                    model_data,
                    {'event_id': data.get('schedule_id'), 'tenant_id': tenant_id}
                ):
                    success_count += 1

            logger.info(
                "Scheduled retraining processed",
                tenant_id=tenant_id,
                successful=success_count,
                total=len(data.get('models', []))
            )

            return success_count > 0

        except Exception as e:
            logger.error(
                "Error handling retrain scheduled",
                error=str(e),
                tenant_id=tenant_id,
                exc_info=True
            )
            return False

    async def _queue_training_job(
        self,
        tenant_id: str,
        model_id: str,
        product_id: str,
        trigger_reason: str,
        priority: str,
        event_id: str,
        metadata: Dict[str, Any]
    ) -> Optional[str]:
        """
        Queue a training job in the database

        Args:
            tenant_id: Tenant ID
            model_id: Model ID to retrain
            product_id: Product ID
            trigger_reason: Why retraining was triggered
            priority: Job priority (low, normal, high)
            event_id: Originating event ID
            metadata: Additional job metadata

        Returns:
            Job ID if successful, None otherwise
        """
        try:
            from app.models import TrainingJobQueue
            import uuid

            # Map priority to numeric value for sorting
            priority_map = {
                'low': 1,
                'normal': 2,
                'high': 3,
                'critical': 4
            }

            job = TrainingJobQueue(
                id=uuid.uuid4(),
                tenant_id=UUID(tenant_id),
                model_id=UUID(model_id),
                product_id=UUID(product_id) if product_id else None,
                job_type='retrain',
                status='pending',
                priority=priority,
                priority_score=priority_map.get(priority, 2),
                trigger_reason=trigger_reason,
                event_id=event_id,
                metadata=metadata,
                created_at=datetime.utcnow(),
                scheduled_at=datetime.utcnow()
            )

            self.db_session.add(job)
            await self.db_session.commit()

            logger.info(
                "Training job created",
                job_id=str(job.id),
                model_id=model_id,
                priority=priority,
                trigger_reason=trigger_reason
            )

            return str(job.id)

        except Exception as e:
            await self.db_session.rollback()
            logger.error(
                "Failed to queue training job",
                model_id=model_id,
                error=str(e),
                exc_info=True
            )
            return None

    async def _publish_job_queued_event(
        self,
        tenant_id: str,
        model_id: str,
        job_id: str,
        priority: str
    ):
        """
        Publish event that training job was queued

        Args:
            tenant_id: Tenant ID
            model_id: Model ID
            job_id: Training job ID
            priority: Job priority
        """
        try:
            from shared.messaging import get_rabbitmq_client
            import uuid

            rabbitmq_client = get_rabbitmq_client()
            if not rabbitmq_client:
                logger.warning("RabbitMQ client not available for event publishing")
                return

            event_payload = {
                "event_id": str(uuid.uuid4()),
                "event_type": "training.retrain.queued",
                "timestamp": datetime.utcnow().isoformat(),
                "tenant_id": tenant_id,
                "data": {
                    "job_id": job_id,
                    "model_id": model_id,
                    "priority": priority,
                    "status": "queued"
                }
            }

            await rabbitmq_client.publish_event(
                exchange_name="training.events",
                routing_key="training.retrain.queued",
                event_data=event_payload
            )

            logger.info(
                "Published job queued event",
                job_id=job_id,
                event_id=event_payload["event_id"]
            )

        except Exception as e:
            logger.error(
                "Failed to publish job queued event",
                job_id=job_id,
                error=str(e)
            )
            # Don't fail the main operation if event publishing fails


# Factory function for creating consumer instance
def create_training_event_consumer(db_session: AsyncSession) -> TrainingEventConsumer:
    """Create training event consumer instance"""
    return TrainingEventConsumer(db_session)
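For context, a minimal sketch of how this consumer might be wired in at service startup. The `start_training_consumer` helper and the idea of handing it a long-lived `AsyncSession` are assumptions for illustration, not part of this commit; only `get_rabbitmq_client()`, `create_training_event_consumer()`, and `consume_training_events()` come from the code above.

# Hypothetical wiring sketch, not part of this commit. Assumes the service
# supplies an AsyncSession and that shared.messaging.get_rabbitmq_client()
# returns an already-connected client, as _publish_job_queued_event expects.
from sqlalchemy.ext.asyncio import AsyncSession

from shared.messaging import get_rabbitmq_client
from app.consumers.training_event_consumer import create_training_event_consumer


async def start_training_consumer(db_session: AsyncSession) -> None:
    """Register the retraining consumer at service startup.

    The session must stay open for as long as messages are consumed,
    since the registered callback uses it to queue jobs.
    """
    rabbitmq_client = get_rabbitmq_client()
    if not rabbitmq_client:
        raise RuntimeError("RabbitMQ client is not configured")

    consumer = create_training_event_consumer(db_session)
    # Binds training.retraining.queue to the training.events exchange with
    # routing key training.retrain.* and delivers messages to process_message.
    await consumer.consume_training_events(rabbitmq_client)

In practice this would typically be awaited from the service's startup hook (for example a FastAPI lifespan handler), which is outside the scope of this diff.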
@@ -340,27 +340,85 @@ class ArtifactRepository(TrainingBaseRepository):
         }
 
     async def verify_artifact_integrity(self, artifact_id: int) -> Dict[str, Any]:
-        """Verify artifact file integrity (placeholder for file system checks)"""
+        """Verify artifact file integrity with actual file system checks"""
         try:
+            import os
+            import hashlib
+
             artifact = await self.get_by_id(artifact_id)
             if not artifact:
                 return {"exists": False, "error": "Artifact not found"}
 
-            # This is a placeholder - in a real implementation, you would:
-            # 1. Check if the file exists at artifact.file_path
-            # 2. Calculate current checksum and compare with stored checksum
-            # 3. Verify file size matches stored file_size_bytes
-
-            return {
+            # Check if file exists
+            file_exists = os.path.exists(artifact.file_path)
+            if not file_exists:
+                return {
+                    "artifact_id": artifact_id,
+                    "file_path": artifact.file_path,
+                    "exists": False,
+                    "checksum_valid": False,
+                    "size_valid": False,
+                    "storage_location": artifact.storage_location,
+                    "last_verified": datetime.now().isoformat(),
+                    "error": "File does not exist on disk"
+                }
+
+            # Verify file size
+            actual_size = os.path.getsize(artifact.file_path)
+            size_valid = True
+            if artifact.file_size_bytes:
+                size_valid = (actual_size == artifact.file_size_bytes)
+
+            # Verify checksum if stored
+            checksum_valid = True
+            actual_checksum = None
+            if artifact.checksum:
+                # Calculate checksum of actual file
+                sha256_hash = hashlib.sha256()
+                try:
+                    with open(artifact.file_path, "rb") as f:
+                        # Read file in chunks to handle large files
+                        for byte_block in iter(lambda: f.read(4096), b""):
+                            sha256_hash.update(byte_block)
+                    actual_checksum = sha256_hash.hexdigest()
+                    checksum_valid = (actual_checksum == artifact.checksum)
+                except Exception as checksum_error:
+                    logger.error(f"Failed to calculate checksum: {checksum_error}")
+                    checksum_valid = False
+                    actual_checksum = None
+
+            # Overall integrity status
+            integrity_valid = file_exists and size_valid and checksum_valid
+
+            result = {
                 "artifact_id": artifact_id,
                 "file_path": artifact.file_path,
-                "exists": True,  # Would check actual file existence
-                "checksum_valid": True,  # Would verify actual checksum
-                "size_valid": True,  # Would verify actual file size
+                "exists": file_exists,
+                "checksum_valid": checksum_valid,
+                "size_valid": size_valid,
+                "integrity_valid": integrity_valid,
                 "storage_location": artifact.storage_location,
-                "last_verified": datetime.now().isoformat()
+                "last_verified": datetime.now().isoformat(),
+                "details": {
+                    "stored_size_bytes": artifact.file_size_bytes,
+                    "actual_size_bytes": actual_size if file_exists else None,
+                    "stored_checksum": artifact.checksum,
+                    "actual_checksum": actual_checksum
+                }
             }
 
+            if not integrity_valid:
+                issues = []
+                if not file_exists:
+                    issues.append("file_missing")
+                if not size_valid:
+                    issues.append("size_mismatch")
+                if not checksum_valid:
+                    issues.append("checksum_mismatch")
+                result["issues"] = issues
+
+            return result
+
         except Exception as e:
             logger.error("Failed to verify artifact integrity",
                          artifact_id=artifact_id,
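A hedged usage sketch for the reworked integrity check. The `audit_artifact` helper and constructing `ArtifactRepository` directly from a session are assumptions about surrounding code this diff does not show; the report keys (`exists`, `integrity_valid`, `issues`, `error`) come from the hunk above.

# Hypothetical caller, not part of this commit. Assumes ArtifactRepository can
# be built from an AsyncSession; its real constructor is not shown here.
import structlog
from sqlalchemy.ext.asyncio import AsyncSession

from app.repositories import ArtifactRepository  # assumed import path

logger = structlog.get_logger()


async def audit_artifact(db_session: AsyncSession, artifact_id: int) -> bool:
    """Return True only when the artifact's file passes every integrity check."""
    repo = ArtifactRepository(db_session)
    report = await repo.verify_artifact_integrity(artifact_id)

    if not report.get("exists"):
        logger.warning("Artifact file missing",
                       artifact_id=artifact_id,
                       error=report.get("error"))
        return False

    # "issues" is only populated when integrity_valid is False
    if not report.get("integrity_valid", False):
        logger.warning("Artifact failed integrity check",
                       artifact_id=artifact_id,
                       issues=report.get("issues"))
        return False

    return True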
@@ -374,55 +432,124 @@ class ArtifactRepository(TrainingBaseRepository):
         self,
         from_location: str,
         to_location: str,
-        tenant_id: str = None
+        tenant_id: str = None,
+        copy_only: bool = False,
+        verify: bool = True
     ) -> Dict[str, Any]:
-        """Migrate artifacts from one storage location to another (placeholder)"""
+        """Migrate artifacts from one storage location to another with actual file operations"""
         try:
+            import os
+            import shutil
+            import hashlib
+
             # Get artifacts to migrate
             artifacts = await self.get_artifacts_by_storage_location(from_location, tenant_id)
 
             migrated_count = 0
             failed_count = 0
 
-            # This is a placeholder - in a real implementation, you would:
-            # 1. Copy files from old location to new location
-            # 2. Update file paths in database
-            # 3. Verify successful migration
-            # 4. Clean up old files
-
+            failed_artifacts = []
+            verified_count = 0
+
             for artifact in artifacts:
                 try:
-                    # Placeholder migration logic
-                    new_file_path = artifact.file_path.replace(from_location, to_location)
-
+                    # Determine new file path
+                    new_file_path = artifact.file_path.replace(from_location, to_location, 1)
+
+                    # Create destination directory if it doesn't exist
+                    dest_dir = os.path.dirname(new_file_path)
+                    os.makedirs(dest_dir, exist_ok=True)
+
+                    # Check if source file exists
+                    if not os.path.exists(artifact.file_path):
+                        logger.warning(f"Source file not found: {artifact.file_path}")
+                        failed_count += 1
+                        failed_artifacts.append({
+                            "artifact_id": artifact.id,
+                            "file_path": artifact.file_path,
+                            "reason": "source_file_not_found"
+                        })
+                        continue
+
+                    # Copy or move file
+                    if copy_only:
+                        shutil.copy2(artifact.file_path, new_file_path)
+                        logger.debug(f"Copied file from {artifact.file_path} to {new_file_path}")
+                    else:
+                        shutil.move(artifact.file_path, new_file_path)
+                        logger.debug(f"Moved file from {artifact.file_path} to {new_file_path}")
+
+                    # Verify file was copied/moved successfully
+                    if verify and os.path.exists(new_file_path):
+                        # Verify file size
+                        new_size = os.path.getsize(new_file_path)
+                        if artifact.file_size_bytes and new_size != artifact.file_size_bytes:
+                            logger.warning(f"File size mismatch after migration: {new_file_path}")
+                            failed_count += 1
+                            failed_artifacts.append({
+                                "artifact_id": artifact.id,
+                                "file_path": new_file_path,
+                                "reason": "size_mismatch_after_migration"
+                            })
+                            continue
+
+                        # Verify checksum if available
+                        if artifact.checksum:
+                            sha256_hash = hashlib.sha256()
+                            with open(new_file_path, "rb") as f:
+                                for byte_block in iter(lambda: f.read(4096), b""):
+                                    sha256_hash.update(byte_block)
+                            new_checksum = sha256_hash.hexdigest()
+
+                            if new_checksum != artifact.checksum:
+                                logger.warning(f"Checksum mismatch after migration: {new_file_path}")
+                                failed_count += 1
+                                failed_artifacts.append({
+                                    "artifact_id": artifact.id,
+                                    "file_path": new_file_path,
+                                    "reason": "checksum_mismatch_after_migration"
+                                })
+                                continue
+
+                        verified_count += 1
+
                     # Update database with new location
                     await self.update(artifact.id, {
                         "storage_location": to_location,
                         "file_path": new_file_path
                     })
 
                     migrated_count += 1
 
                 except Exception as migration_error:
                     logger.error("Failed to migrate artifact",
                                  artifact_id=artifact.id,
                                  error=str(migration_error))
                     failed_count += 1
+                    failed_artifacts.append({
+                        "artifact_id": artifact.id,
+                        "file_path": artifact.file_path,
+                        "reason": str(migration_error)
+                    })
 
             logger.info("Artifact migration completed",
                         from_location=from_location,
                         to_location=to_location,
                         migrated_count=migrated_count,
-                        failed_count=failed_count)
+                        failed_count=failed_count,
+                        verified_count=verified_count)
 
             return {
                 "from_location": from_location,
                 "to_location": to_location,
                 "total_artifacts": len(artifacts),
                 "migrated_count": migrated_count,
                 "failed_count": failed_count,
-                "success_rate": round((migrated_count / len(artifacts)) * 100, 2) if artifacts else 100
+                "verified_count": verified_count if verify else None,
+                "success_rate": round((migrated_count / len(artifacts)) * 100, 2) if artifacts else 100,
+                "copy_only": copy_only,
+                "failed_artifacts": failed_artifacts if failed_artifacts else None
             }
 
         except Exception as e:
             logger.error("Failed to migrate artifacts",
                          from_location=from_location,
@@ -135,26 +135,45 @@ class TrainingTenantDeletionService(BaseTenantDataDeletionService):
         result = TenantDataDeletionResult(tenant_id=tenant_id, service_name=self.service_name)
 
         try:
+            import os
+
             # Step 1: Delete model artifacts (references models)
             logger.info("training.tenant_deletion.deleting_artifacts", tenant_id=tenant_id)
 
-            # TODO: Delete physical files from storage before deleting DB records
-            # artifacts = await self.db.execute(
-            #     select(ModelArtifact).where(ModelArtifact.tenant_id == tenant_id)
-            # )
-            # for artifact in artifacts.scalars():
-            #     try:
-            #         os.remove(artifact.file_path)  # Delete physical file
-            #     except Exception as e:
-            #         logger.warning("Failed to delete artifact file",
-            #                        path=artifact.file_path, error=str(e))
+            # Delete physical files from storage before deleting DB records
+            artifacts = await self.db.execute(
+                select(ModelArtifact).where(ModelArtifact.tenant_id == tenant_id)
+            )
+            deleted_files = 0
+            failed_files = 0
+            for artifact in artifacts.scalars():
+                try:
+                    if artifact.file_path and os.path.exists(artifact.file_path):
+                        os.remove(artifact.file_path)
+                        deleted_files += 1
+                        logger.info("Deleted artifact file",
+                                    path=artifact.file_path,
+                                    artifact_id=artifact.id)
+                except Exception as e:
+                    failed_files += 1
+                    logger.warning("Failed to delete artifact file",
+                                   path=artifact.file_path,
+                                   artifact_id=artifact.id if hasattr(artifact, 'id') else 'unknown',
+                                   error=str(e))
+
+            logger.info("Artifact files deletion complete",
+                        deleted_files=deleted_files,
+                        failed_files=failed_files)
+
+            # Now delete DB records
             artifacts_result = await self.db.execute(
                 delete(ModelArtifact).where(
                     ModelArtifact.tenant_id == tenant_id
                 )
             )
             result.deleted_counts["model_artifacts"] = artifacts_result.rowcount
+            result.deleted_counts["artifact_files_deleted"] = deleted_files
+            result.deleted_counts["artifact_files_failed"] = failed_files
             logger.info(
                 "training.tenant_deletion.artifacts_deleted",
                 tenant_id=tenant_id,
@@ -206,26 +225,54 @@ class TrainingTenantDeletionService(BaseTenantDataDeletionService):
             # Step 5: Delete trained models (parent records)
             logger.info("training.tenant_deletion.deleting_models", tenant_id=tenant_id)
 
-            # TODO: Delete physical model files (.pkl) before deleting DB records
-            # models = await self.db.execute(
-            #     select(TrainedModel).where(TrainedModel.tenant_id == tenant_id)
-            # )
-            # for model in models.scalars():
-            #     try:
-            #         if model.model_path:
-            #             os.remove(model.model_path)  # Delete .pkl file
-            #         if model.metadata_path:
-            #             os.remove(model.metadata_path)  # Delete metadata file
-            #     except Exception as e:
-            #         logger.warning("Failed to delete model file",
-            #                        path=model.model_path, error=str(e))
+            # Delete physical model files (.pkl) before deleting DB records
+            models = await self.db.execute(
+                select(TrainedModel).where(TrainedModel.tenant_id == tenant_id)
+            )
+            deleted_model_files = 0
+            failed_model_files = 0
+            for model in models.scalars():
+                try:
+                    # Delete .pkl file
+                    if hasattr(model, 'model_path') and model.model_path and os.path.exists(model.model_path):
+                        os.remove(model.model_path)
+                        deleted_model_files += 1
+                        logger.info("Deleted model file",
+                                    path=model.model_path,
+                                    model_id=model.id)
+                    # Delete model_file_path if it exists
+                    if hasattr(model, 'model_file_path') and model.model_file_path and os.path.exists(model.model_file_path):
+                        os.remove(model.model_file_path)
+                        deleted_model_files += 1
+                        logger.info("Deleted model file",
+                                    path=model.model_file_path,
+                                    model_id=model.id)
+                    # Delete metadata file if exists
+                    if hasattr(model, 'metadata_path') and model.metadata_path and os.path.exists(model.metadata_path):
+                        os.remove(model.metadata_path)
+                        logger.info("Deleted metadata file",
+                                    path=model.metadata_path,
+                                    model_id=model.id)
+                except Exception as e:
+                    failed_model_files += 1
+                    logger.warning("Failed to delete model file",
+                                   path=getattr(model, 'model_path', getattr(model, 'model_file_path', 'unknown')),
+                                   model_id=model.id if hasattr(model, 'id') else 'unknown',
+                                   error=str(e))
+
+            logger.info("Model files deletion complete",
+                        deleted_files=deleted_model_files,
+                        failed_files=failed_model_files)
+
+            # Now delete DB records
             models_result = await self.db.execute(
                 delete(TrainedModel).where(
                     TrainedModel.tenant_id == tenant_id
                 )
             )
             result.deleted_counts["trained_models"] = models_result.rowcount
+            result.deleted_counts["model_files_deleted"] = deleted_model_files
+            result.deleted_counts["model_files_failed"] = failed_model_files
             logger.info(
                 "training.tenant_deletion.models_deleted",
                 tenant_id=tenant_id,
@@ -6,7 +6,7 @@ Simple, clean event publisher for the 4 main training steps
 import structlog
 from datetime import datetime
 from typing import Dict, Any, Optional
-from shared.messaging.rabbitmq import RabbitMQClient
+from shared.messaging import RabbitMQClient
 from app.core.config import settings
 
 logger = structlog.get_logger()