Add ci/cd and fix multiple pods issues
This commit is contained in:
@@ -0,0 +1,60 @@
|
||||
"""Add horizontal scaling constraints for multi-pod deployment
|
||||
|
||||
Revision ID: add_horizontal_scaling
|
||||
Revises: 26a665cd5348
|
||||
Create Date: 2025-01-18
|
||||
|
||||
This migration adds database-level constraints to prevent race conditions
|
||||
when running multiple training service pods:
|
||||
|
||||
1. Partial unique index on model_training_logs to prevent duplicate active jobs per tenant
|
||||
2. Index to speed up active job lookups
3. Index to speed up job recovery queries (finding stale running jobs)
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = 'add_horizontal_scaling'
|
||||
down_revision: Union[str, None] = '26a665cd5348'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the indexes that keep training jobs consistent across pods.

    Three indexes are added to ``model_training_logs``:

    * a partial unique index on ``tenant_id`` covering only rows whose
      status is 'pending' or 'running', so the database itself rejects a
      second active job for the same tenant;
    * a non-unique index on ``(tenant_id, status)`` for active-job lookups
      (the deduplication check);
    * a non-unique index on ``(status, updated_at)`` for job recovery
      queries that look for stale running jobs.

    Every statement is guarded by "if not exists".
    """
    table_name = 'model_training_logs'

    # Enforcing uniqueness in the database closes the window in which two
    # pods could both check for an active job and then both create one.
    # Raw SQL is used here for the partial (WHERE-filtered) unique index.
    partial_unique_sql = """
        CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_active_training_per_tenant
        ON model_training_logs (tenant_id)
        WHERE status IN ('pending', 'running')
    """
    op.execute(partial_unique_sql)

    # (index name, indexed columns) for the plain lookup indexes, listed in
    # the order they are created.
    lookup_indexes = (
        # Active job lookups (used by the deduplication check).
        ('idx_training_logs_tenant_status', ['tenant_id', 'status']),
        # Job recovery queries (find stale running jobs).
        ('idx_training_logs_status_updated', ['status', 'updated_at']),
    )
    for index_name, columns in lookup_indexes:
        op.create_index(
            index_name,
            table_name,
            columns,
            unique=False,
            if_not_exists=True,
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the indexes created by ``upgrade`` in reverse creation order.

    Each statement is ``DROP INDEX IF EXISTS``, so an index that is already
    absent does not cause the downgrade to fail.
    """
    # Most recently created index first.
    indexes_in_drop_order = (
        'idx_training_logs_status_updated',
        'idx_training_logs_tenant_status',
        'idx_unique_active_training_per_tenant',
    )
    for index_name in indexes_in_drop_order:
        op.execute(f"DROP INDEX IF EXISTS {index_name}")
|
||||
Reference in New Issue
Block a user