Files
bakery-ia/services/training/app/repositories/job_queue_repository.py

445 lines
17 KiB
Python
Raw Normal View History

2025-08-08 09:08:41 +02:00
"""
Job Queue Repository
Repository for training job queue operations
"""
from typing import Optional, List, Dict, Any
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, and_, text, desc
from datetime import datetime, timedelta
import structlog
from .base import TrainingBaseRepository
from app.models.training import TrainingJobQueue
from shared.database.exceptions import DatabaseError, ValidationError
# Module-level structlog logger; the repository methods below attach context
# (job_id, tenant_id, ...) to each event as keyword arguments.
logger = structlog.get_logger()
class JobQueueRepository(TrainingBaseRepository):
    """Repository for training job queue operations.

    Provides queue-oriented helpers on top of the ``TrainingJobQueue`` model:
    enqueueing, selecting the next job, status transitions (start, complete,
    fail/retry, cancel), statistics, and housekeeping of old or stuck jobs.

    The generic helpers used below (``create``, ``update``, ``count``,
    ``get_multi``, ``get_by_job_id`` and ``_validate_training_data``) are not
    defined in this class; they are presumably inherited from
    ``TrainingBaseRepository``.
    """

    def __init__(self, session: AsyncSession, cache_ttl: Optional[int] = 60):
        """Bind the repository to a database session.

        Args:
            session: Async SQLAlchemy session used for every query.
            cache_ttl: Cache lifetime forwarded to the base repository. The
                queue changes frequently, so the default is deliberately very
                short (one minute).
        """
        super().__init__(TrainingJobQueue, session, cache_ttl)

    async def enqueue_job(self, job_data: Dict[str, Any]) -> TrainingJobQueue:
        """Add a job to the training queue.

        Args:
            job_data: Field values for the new queue entry. ``job_id``,
                ``tenant_id`` and ``job_type`` are required. ``priority``
                (default 1), ``status`` (default ``"queued"``) and
                ``max_retries`` (default 3) are filled in when absent.

        Returns:
            The queue entry returned by ``create``.

        Raises:
            ValidationError: If validation reports the job data as invalid.
            DatabaseError: If anything else goes wrong while creating the entry.
        """
        try:
            validation_result = self._validate_training_data(
                job_data,
                ["job_id", "tenant_id", "job_type"]
            )
            if not validation_result["is_valid"]:
                raise ValidationError(f"Invalid job data: {validation_result['errors']}")

            # Apply defaults on a copy so the caller's dict is not mutated;
            # values supplied by the caller always win over the defaults.
            payload = {
                "priority": 1,
                "status": "queued",
                "max_retries": 3,
                **job_data,
            }

            queued_job = await self.create(payload)

            logger.info("Job enqueued",
                        job_id=queued_job.job_id,
                        tenant_id=queued_job.tenant_id,
                        job_type=queued_job.job_type,
                        priority=queued_job.priority)
            return queued_job
        except ValidationError:
            # Validation problems are the caller's to handle; do not wrap them.
            raise
        except Exception as e:
            logger.error("Failed to enqueue job",
                         job_id=job_data.get("job_id"),
                         error=str(e))
            raise DatabaseError(f"Failed to enqueue job: {str(e)}") from e

    async def get_next_job(
        self, job_types: Optional[List[str]] = None
    ) -> Optional[TrainingJobQueue]:
        """Get the next job to process from the queue.

        A job is eligible when its status is ``"queued"`` and it is either
        unscheduled (``scheduled_at`` is NULL) or its ``scheduled_at`` is not
        in the future. Eligible jobs are ordered by highest ``priority`` first
        and oldest ``created_at`` first within the same priority.

        The same rules apply whether or not ``job_types`` is given, and all
        values are sent to the database as bound parameters rather than being
        interpolated into the SQL text.

        Args:
            job_types: Optional list of job types to restrict the search to.
                ``None`` or an empty list means "any type".

        Returns:
            The next eligible job, or ``None`` when nothing is eligible.

        Raises:
            DatabaseError: If the query fails.
        """
        try:
            model = self.model
            now = datetime.now()

            conditions = [
                model.status == "queued",
                model.scheduled_at.is_(None) | (model.scheduled_at <= now),
            ]
            if job_types:
                # in_() renders one bound parameter per value, so job type
                # strings can never alter the statement itself.
                conditions.append(model.job_type.in_(job_types))

            stmt = (
                select(model)
                .where(*conditions)
                .order_by(desc(model.priority), model.created_at.asc())
                .limit(1)
            )
            result = await self.session.execute(stmt)
            return result.scalars().first()
        except Exception as e:
            logger.error("Failed to get next job from queue",
                         job_types=job_types,
                         error=str(e))
            raise DatabaseError(f"Failed to get next job: {str(e)}") from e

    async def start_job(self, job_id: str) -> Optional[TrainingJobQueue]:
        """Mark a queued job as running.

        Args:
            job_id: Identifier of the job to start.

        Returns:
            ``None`` if the job is not in the queue. The unchanged job if it
            is not currently ``"queued"``. Otherwise the result of the update
            that sets status ``"running"`` and stamps ``started_at``.

        Raises:
            DatabaseError: If the lookup or the update fails.
        """
        try:
            job = await self.get_by_job_id(job_id)
            if not job:
                logger.error(f"Job not found in queue: {job_id}")
                return None

            # Only queued jobs may be started; anything else is returned as-is.
            if job.status != "queued":
                logger.warning(f"Job {job_id} is not queued (status: {job.status})")
                return job

            updated_job = await self.update(job.id, {
                "status": "running",
                "started_at": datetime.now(),
                "updated_at": datetime.now()
            })

            logger.info("Job started",
                        job_id=job_id,
                        job_type=job.job_type)
            return updated_job
        except Exception as e:
            logger.error("Failed to start job",
                         job_id=job_id,
                         error=str(e))
            raise DatabaseError(f"Failed to start job: {str(e)}") from e

    async def complete_job(self, job_id: str) -> Optional[TrainingJobQueue]:
        """Mark a job as completed.

        The current status is not checked before the update.

        Args:
            job_id: Identifier of the job to complete.

        Returns:
            ``None`` if the job is not in the queue, otherwise the result of
            the update that sets status ``"completed"``.

        Raises:
            DatabaseError: If the lookup or the update fails.
        """
        try:
            job = await self.get_by_job_id(job_id)
            if not job:
                logger.error(f"Job not found in queue: {job_id}")
                return None

            updated_job = await self.update(job.id, {
                "status": "completed",
                "updated_at": datetime.now()
            })

            logger.info("Job completed",
                        job_id=job_id,
                        job_type=job.job_type)
            return updated_job
        except Exception as e:
            logger.error("Failed to complete job",
                         job_id=job_id,
                         error=str(e))
            raise DatabaseError(f"Failed to complete job: {str(e)}") from e

    async def fail_job(
        self, job_id: str, error_message: Optional[str] = None
    ) -> Optional[TrainingJobQueue]:
        """Record a job failure, re-queueing it while retries remain.

        The retry counter is incremented on every call. If the new count is
        still below ``max_retries`` the job goes back to ``"queued"`` with
        ``started_at`` cleared; otherwise it is marked ``"failed"``.

        Args:
            job_id: Identifier of the job that failed.
            error_message: Optional failure description. It is only logged,
                not written to the queue row.

        Returns:
            ``None`` if the job is not in the queue, otherwise the result of
            the update.

        Raises:
            DatabaseError: If the lookup or the update fails.
        """
        try:
            job = await self.get_by_job_id(job_id)
            if not job:
                logger.error(f"Job not found in queue: {job_id}")
                return None

            # Treat a missing (NULL) counter as zero instead of raising TypeError.
            new_retry_count = (job.retry_count or 0) + 1

            if new_retry_count < job.max_retries:
                # Retries remain: put the job back in the queue.
                updated_job = await self.update(job.id, {
                    "status": "queued",
                    "retry_count": new_retry_count,
                    "updated_at": datetime.now(),
                    "started_at": None  # Reset started_at for retry
                })
                logger.info("Job failed, queued for retry",
                            job_id=job_id,
                            retry_count=new_retry_count,
                            max_retries=job.max_retries,
                            error_message=error_message)
            else:
                # Retries exhausted: mark as permanently failed.
                updated_job = await self.update(job.id, {
                    "status": "failed",
                    "retry_count": new_retry_count,
                    "updated_at": datetime.now()
                })
                logger.error("Job permanently failed",
                             job_id=job_id,
                             retry_count=new_retry_count,
                             error_message=error_message)
            return updated_job
        except Exception as e:
            logger.error("Failed to handle job failure",
                         job_id=job_id,
                         error=str(e))
            raise DatabaseError(f"Failed to handle job failure: {str(e)}") from e

    async def cancel_job(
        self, job_id: str, cancelled_by: Optional[str] = None
    ) -> Optional[TrainingJobQueue]:
        """Cancel a job that has not already finished.

        Args:
            job_id: Identifier of the job to cancel.
            cancelled_by: Optional value stored in the ``cancelled_by`` field.

        Returns:
            ``None`` if the job is not in the queue. The unchanged job if its
            status is ``"completed"`` or ``"failed"``. Otherwise the result of
            the update that sets status ``"cancelled"``.

        Raises:
            DatabaseError: If the lookup or the update fails.
        """
        try:
            job = await self.get_by_job_id(job_id)
            if not job:
                logger.error(f"Job not found in queue: {job_id}")
                return None

            # Finished jobs cannot be cancelled; return them untouched.
            if job.status in ["completed", "failed"]:
                logger.warning(f"Cannot cancel job {job_id} with status {job.status}")
                return job

            updated_job = await self.update(job.id, {
                "status": "cancelled",
                "cancelled_by": cancelled_by,
                "updated_at": datetime.now()
            })

            logger.info("Job cancelled",
                        job_id=job_id,
                        cancelled_by=cancelled_by)
            return updated_job
        except Exception as e:
            logger.error("Failed to cancel job",
                         job_id=job_id,
                         error=str(e))
            raise DatabaseError(f"Failed to cancel job: {str(e)}") from e

    async def get_queue_status(self, tenant_id: Optional[str] = None) -> Dict[str, Any]:
        """Get queue status and statistics.

        This method is best-effort: on any error it logs the problem and
        returns the same structure filled with zeros instead of raising.

        Args:
            tenant_id: When given, all figures are restricted to this tenant.

        Returns:
            A dict with the keys ``tenant_id``, ``queue_counts`` (per-status
            counts plus ``total``), ``jobs_by_type``,
            ``avg_wait_time_minutes`` (mean of ``started_at - created_at``
            over completed jobs) and ``queue_health``.
        """
        try:
            base_filters = {}
            if tenant_id:
                base_filters["tenant_id"] = tenant_id

            # One count query per status.
            counts = {}
            for status in ("queued", "running", "completed", "failed", "cancelled"):
                counts[status] = await self.count(
                    filters={**base_filters, "status": status}
                )

            # Only this fixed fragment is interpolated into the SQL below;
            # the tenant value itself is always passed as a bound parameter.
            tenant_clause = " AND tenant_id = :tenant_id" if tenant_id else ""
            params = {"tenant_id": tenant_id} if tenant_id else {}

            type_query = text(f"""
                SELECT job_type, COUNT(*) as count
                FROM training_job_queue
                WHERE 1=1
                {tenant_clause}
                GROUP BY job_type
                ORDER BY count DESC
            """)
            result = await self.session.execute(type_query, params)
            jobs_by_type = {row.job_type: row.count for row in result.fetchall()}

            # Average time between creation and start, in minutes.
            wait_time_query = text(f"""
                SELECT
                    AVG(EXTRACT(EPOCH FROM (started_at - created_at))/60) as avg_wait_minutes
                FROM training_job_queue
                WHERE status = 'completed'
                AND started_at IS NOT NULL
                AND created_at IS NOT NULL
                {tenant_clause}
            """)
            wait_result = await self.session.execute(wait_time_query, params)
            wait_row = wait_result.fetchone()
            # AVG over zero rows is NULL; report that as 0.0.
            avg_wait_time = (
                float(wait_row.avg_wait_minutes)
                if wait_row and wait_row.avg_wait_minutes
                else 0.0
            )

            finished = counts["completed"] + counts["failed"]
            return {
                "tenant_id": tenant_id,
                "queue_counts": {
                    "queued": counts["queued"],
                    "running": counts["running"],
                    "completed": counts["completed"],
                    "failed": counts["failed"],
                    "cancelled": counts["cancelled"],
                    "total": sum(counts.values())
                },
                "jobs_by_type": jobs_by_type,
                "avg_wait_time_minutes": round(avg_wait_time, 2),
                "queue_health": {
                    "has_queued_jobs": counts["queued"] > 0,
                    "has_running_jobs": counts["running"] > 0,
                    # max(..., 1) avoids division by zero when nothing has finished.
                    "failure_rate": round((counts["failed"] / max(finished, 1)) * 100, 2)
                }
            }
        except Exception as e:
            logger.error("Failed to get queue status",
                         tenant_id=tenant_id,
                         error=str(e))
            return {
                "tenant_id": tenant_id,
                "queue_counts": {
                    "queued": 0, "running": 0, "completed": 0,
                    "failed": 0, "cancelled": 0, "total": 0
                },
                "jobs_by_type": {},
                "avg_wait_time_minutes": 0.0,
                "queue_health": {
                    "has_queued_jobs": False,
                    "has_running_jobs": False,
                    "failure_rate": 0.0
                }
            }

    async def get_jobs_by_tenant(
        self,
        tenant_id: str,
        status: Optional[str] = None,
        job_type: Optional[str] = None,
        skip: int = 0,
        limit: int = 100
    ) -> List[TrainingJobQueue]:
        """Get jobs for a tenant with optional filtering, newest first.

        Args:
            tenant_id: Tenant whose jobs are listed.
            status: Optional status to filter on.
            job_type: Optional job type to filter on.
            skip: Number of rows to skip (pagination offset).
            limit: Maximum number of rows to return.

        Returns:
            Matching jobs ordered by ``created_at`` descending.

        Raises:
            DatabaseError: If the query fails.
        """
        try:
            filters = {"tenant_id": tenant_id}
            if status:
                filters["status"] = status
            if job_type:
                filters["job_type"] = job_type

            return await self.get_multi(
                filters=filters,
                skip=skip,
                limit=limit,
                order_by="created_at",
                order_desc=True
            )
        except Exception as e:
            logger.error("Failed to get jobs by tenant",
                         tenant_id=tenant_id,
                         error=str(e))
            raise DatabaseError(f"Failed to get tenant jobs: {str(e)}") from e

    async def cleanup_old_jobs(
        self, days_old: int = 30, status_filter: Optional[str] = None
    ) -> int:
        """Delete queue rows created more than ``days_old`` days ago.

        Args:
            days_old: Age threshold in days, measured against ``created_at``.
            status_filter: When given, only rows with exactly this status are
                deleted. When omitted, only finished rows (``completed``,
                ``failed``, ``cancelled``) are deleted.

        Returns:
            The number of rows deleted, as reported by the statement's
            ``rowcount``.

        Raises:
            DatabaseError: If the delete fails.
        """
        try:
            cutoff_date = datetime.now() - timedelta(days=days_old)
            params = {"cutoff_date": cutoff_date}

            if status_filter:
                status_condition = "status = :status"
                params["status"] = status_filter
            else:
                # Fixed literal list: only finished jobs are removed by default.
                status_condition = "status IN ('completed', 'failed', 'cancelled')"

            query_text = f"""
                DELETE FROM training_job_queue
                WHERE created_at < :cutoff_date
                AND {status_condition}
            """
            result = await self.session.execute(text(query_text), params)
            deleted_count = result.rowcount

            logger.info("Cleaned up old queue jobs",
                        deleted_count=deleted_count,
                        days_old=days_old,
                        status_filter=status_filter)
            return deleted_count
        except Exception as e:
            logger.error("Failed to cleanup old queue jobs",
                         error=str(e))
            raise DatabaseError(f"Queue cleanup failed: {str(e)}") from e

    async def get_stuck_jobs(self, hours_stuck: int = 2) -> List[TrainingJobQueue]:
        """Get jobs that have been running for too long.

        This method is best-effort: on any error it logs the problem and
        returns an empty list instead of raising.

        Args:
            hours_stuck: A ``running`` job counts as stuck when its
                ``started_at`` is more than this many hours in the past.

        Returns:
            Stuck jobs, oldest ``started_at`` first. Each one is a new model
            instance built from the raw row values.
        """
        try:
            cutoff_time = datetime.now() - timedelta(hours=hours_stuck)
            query_text = """
                SELECT * FROM training_job_queue
                WHERE status = 'running'
                AND started_at IS NOT NULL
                AND started_at < :cutoff_time
                ORDER BY started_at ASC
            """
            result = await self.session.execute(
                text(query_text), {"cutoff_time": cutoff_time}
            )
            stuck_jobs = [
                self.model(**dict(row._mapping)) for row in result.fetchall()
            ]

            if stuck_jobs:
                logger.warning("Found stuck jobs",
                               count=len(stuck_jobs),
                               hours_stuck=hours_stuck)
            return stuck_jobs
        except Exception as e:
            logger.error("Failed to get stuck jobs",
                         hours_stuck=hours_stuck,
                         error=str(e))
            return []

    async def reset_stuck_jobs(self, hours_stuck: int = 2) -> int:
        """Reset stuck jobs back to queued status.

        Args:
            hours_stuck: Threshold passed to ``get_stuck_jobs``.

        Returns:
            The number of jobs that were reset.

        Raises:
            DatabaseError: If an update fails. A failure while *finding* stuck
                jobs does not raise, because ``get_stuck_jobs`` returns an
                empty list in that case.
        """
        try:
            stuck_jobs = await self.get_stuck_jobs(hours_stuck)
            reset_count = 0

            for job in stuck_jobs:
                # Clear started_at so the job looks freshly queued again.
                await self.update(job.id, {
                    "status": "queued",
                    "started_at": None,
                    "updated_at": datetime.now()
                })
                reset_count += 1

            if reset_count > 0:
                logger.info("Reset stuck jobs",
                            reset_count=reset_count,
                            hours_stuck=hours_stuck)
            return reset_count
        except Exception as e:
            logger.error("Failed to reset stuck jobs",
                         hours_stuck=hours_stuck,
                         error=str(e))
            raise DatabaseError(f"Failed to reset stuck jobs: {str(e)}") from e