Add CI/CD and fix multiple-pod concurrency issues

This commit is contained in:
Urtzi Alfaro
2026-01-18 09:02:27 +01:00
parent 3c4b5c2a06
commit 21d35ea92b
27 changed files with 3779 additions and 73 deletions

View File

@@ -342,4 +342,166 @@ class TrainingLogRepository(TrainingBaseRepository):
logger.error("Failed to get start time",
job_id=job_id,
error=str(e))
return None
return None
async def create_job_atomic(
    self,
    job_id: str,
    tenant_id: str,
    config: Optional[Dict[str, Any]] = None
) -> tuple[Optional[ModelTrainingLog], bool]:
    """
    Atomically create a training job, respecting the unique constraint.

    Uses a check-then-insert strategy backed by the database constraint
    (idx_unique_active_training_per_tenant): if two pods race past the
    initial check, the INSERT of the loser violates the constraint and we
    fall back to fetching the winner's job.

    Args:
        job_id: Unique job identifier.
        tenant_id: Tenant identifier.
        config: Optional job configuration; defaults to an empty dict.

    Returns:
        Tuple of (job, created):
            - (new_job, True) when this call created the job.
            - (existing_job, False) when an active/pending job already exists.

    Raises:
        DatabaseError: On any unexpected persistence failure, or when a
            constraint violation occurs but no active job can be found.
    """
    try:
        # Fast path: an active or pending job already exists for this tenant.
        existing = await self.get_active_jobs(tenant_id=tenant_id)
        pending = await self.get_logs_by_tenant(tenant_id=tenant_id, status="pending", limit=1)
        if existing or pending:
            active_job = existing[0] if existing else pending[0]
            logger.info("Found existing active job, skipping creation",
                        existing_job_id=active_job.job_id,
                        tenant_id=tenant_id,
                        requested_job_id=job_id)
            return (active_job, False)

        # Try to create the new job. If another pod created one between the
        # check above and this INSERT, the unique constraint rejects it.
        log_data = {
            "job_id": job_id,
            "tenant_id": tenant_id,
            "status": "pending",
            "progress": 0,
            "current_step": "initializing",
            "config": config or {}
        }
        try:
            new_job = await self.create_training_log(log_data)
            await self.session.commit()
            logger.info("Created new training job atomically",
                        job_id=job_id,
                        tenant_id=tenant_id)
            return (new_job, True)
        except Exception as create_error:
            # NOTE(review): string matching on the error text is fragile —
            # catching sqlalchemy.exc.IntegrityError would be more robust,
            # assuming the driver raises it here (confirm against the stack).
            error_str = str(create_error).lower()
            if "unique" in error_str or "duplicate" in error_str or "constraint" in error_str:
                await self.session.rollback()
                # Another pod won the race; return its job instead.
                logger.info("Unique constraint hit, fetching existing job",
                            tenant_id=tenant_id,
                            requested_job_id=job_id)
                existing = await self.get_active_jobs(tenant_id=tenant_id)
                pending = await self.get_logs_by_tenant(tenant_id=tenant_id, status="pending", limit=1)
                if existing or pending:
                    active_job = existing[0] if existing else pending[0]
                    return (active_job, False)
                # Constraint fired yet no job is visible — inconsistent state.
                raise DatabaseError(f"Constraint violation but no active job found: {create_error}")
            else:
                raise
    except DatabaseError:
        # Already a domain error; propagate unchanged.
        raise
    except Exception as e:
        logger.error("Failed to create job atomically",
                     job_id=job_id,
                     tenant_id=tenant_id,
                     error=str(e))
        raise DatabaseError(f"Failed to create training job atomically: {str(e)}")
async def recover_stale_jobs(self, stale_threshold_minutes: int = 60) -> List[ModelTrainingLog]:
    """
    Find and mark stale running/pending jobs as failed.

    Used during service startup to clean up jobs that were in flight when a
    pod crashed. With multiple replicas, only jobs whose ``updated_at`` is
    older than the threshold are considered stale and marked failed.

    Args:
        stale_threshold_minutes: Jobs not updated for this long are stale.

    Returns:
        List of jobs that were marked as failed (empty on error).
    """
    try:
        # Single timestamp for the whole recovery pass so the cutoff,
        # end_time and updated_at are mutually consistent.
        # NOTE(review): naive datetime.now() — assumes updated_at is stored
        # naive in the same clock; confirm against the schema/server tz.
        now = datetime.now()
        stale_cutoff = now - timedelta(minutes=stale_threshold_minutes)

        # Find running/pending jobs that haven't been updated recently.
        query = text("""
            SELECT id, job_id, tenant_id, status, updated_at
            FROM model_training_logs
            WHERE status IN ('running', 'pending')
            AND updated_at < :stale_cutoff
        """)
        result = await self.session.execute(query, {"stale_cutoff": stale_cutoff})
        stale_jobs = result.fetchall()

        # Loop-invariant statement: build once, execute per row. The status
        # guard in the WHERE clause makes the update a no-op if another pod
        # already transitioned the job.
        update_query = text("""
            UPDATE model_training_logs
            SET status = 'failed',
                error_message = :error_msg,
                end_time = :end_time,
                updated_at = :updated_at
            WHERE id = :id AND status IN ('running', 'pending')
        """)

        recovered_jobs = []
        updates_applied = 0
        for row in stale_jobs:
            try:
                await self.session.execute(update_query, {
                    "id": row.id,
                    "error_msg": f"Job recovered as failed - not updated since {row.updated_at.isoformat()}. Pod may have crashed.",
                    "end_time": now,
                    "updated_at": now
                })
                updates_applied += 1
                logger.warning("Recovered stale training job",
                               job_id=row.job_id,
                               tenant_id=str(row.tenant_id),
                               last_updated=row.updated_at.isoformat() if row.updated_at else "unknown")
                # Fetch the updated job to return.
                job = await self.get_by_job_id(row.job_id)
                if job:
                    recovered_jobs.append(job)
            except Exception as job_error:
                # Best-effort per job: one failure must not block the rest.
                logger.error("Failed to recover individual stale job",
                             job_id=row.job_id,
                             error=str(job_error))

        # Commit whenever any UPDATE succeeded — gating on recovered_jobs
        # alone would silently drop updates whose re-fetch returned None.
        if updates_applied:
            await self.session.commit()
            logger.info("Stale job recovery completed",
                        recovered_count=len(recovered_jobs),
                        stale_threshold_minutes=stale_threshold_minutes)
        return recovered_jobs
    except Exception as e:
        logger.error("Failed to recover stale jobs",
                     error=str(e))
        await self.session.rollback()
        return []