Add CI/CD and fix multi-pod issues

This commit is contained in:
Urtzi Alfaro
2026-01-18 09:02:27 +01:00
parent 3c4b5c2a06
commit 21d35ea92b
27 changed files with 3779 additions and 73 deletions

View File

@@ -0,0 +1,60 @@
"""Add horizontal scaling constraints for multi-pod deployment

Revision ID: add_horizontal_scaling
Revises: 26a665cd5348
Create Date: 2025-01-18

This migration adds database-level constraints and indexes to prevent race
conditions when running multiple training service pods:

1. Partial unique index on model_training_logs to prevent duplicate active
   jobs per tenant
2. Index on (tenant_id, status) to speed up active job lookups
3. Index on (status, updated_at) to speed up job recovery queries
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the parent
# revision this migration is applied on top of. `branch_labels` and
# `depends_on` are left unset for this revision.
revision: str = 'add_horizontal_scaling'
down_revision: Union[str, None] = '26a665cd5348'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the indexes that guard and speed up training-job bookkeeping.

    Three indexes are added on ``model_training_logs``:

    * ``idx_unique_active_training_per_tenant`` -- partial UNIQUE index on
      ``tenant_id``, limited to rows whose status is 'pending' or 'running'.
      The database itself then rejects a second active job for a tenant,
      closing the race between pods that check and create jobs simultaneously.
    * ``idx_training_logs_tenant_status`` -- non-unique index on
      ``(tenant_id, status)`` for active-job lookups (deduplication check).
    * ``idx_training_logs_status_updated`` -- non-unique index on
      ``(status, updated_at)`` for job-recovery queries (stale running jobs).

    Every statement carries an "if not exists" guard, so an index that is
    already present is skipped instead of raising.
    """
    table_name = 'model_training_logs'

    # The partial unique index is issued as raw SQL.
    # NOTE(review): creating it presumably fails if a tenant already has more
    # than one pending/running row -- confirm existing data before running.
    active_job_guard_sql = (
        "\n"
        "CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_active_training_per_tenant\n"
        "ON model_training_logs (tenant_id)\n"
        "WHERE status IN ('pending', 'running')\n"
    )
    op.execute(active_job_guard_sql)

    # Plain (non-unique) lookup indexes, created in declaration order.
    lookup_indexes = (
        ('idx_training_logs_tenant_status', ['tenant_id', 'status']),
        ('idx_training_logs_status_updated', ['status', 'updated_at']),
    )
    for index_name, columns in lookup_indexes:
        op.create_index(
            index_name,
            table_name,
            columns,
            unique=False,
            if_not_exists=True,
        )
def downgrade() -> None:
    """Drop the three indexes this migration adds, most recently created first.

    Each statement uses ``DROP INDEX IF EXISTS``, so an index that is already
    missing is skipped rather than aborting the downgrade.
    """
    # Listed in drop order, i.e. the reverse of the order they are created in.
    drop_order = (
        'idx_training_logs_status_updated',
        'idx_training_logs_tenant_status',
        'idx_unique_active_training_per_tenant',
    )
    for index_name in drop_order:
        op.execute(f"DROP INDEX IF EXISTS {index_name}")