Add ci/cd and fix multiple pods issues

2026-01-18 09:02:27 +01:00
parent 3c4b5c2a06
commit 21d35ea92b
27 changed files with 3779 additions and 73 deletions
--- a/services/training/app/utils/distributed_lock.py
+++ b/services/training/app/utils/distributed_lock.py
@@ -1,10 +1,16 @@
 """
 Distributed Locking Mechanisms
 Prevents concurrent training jobs for the same product
+
+HORIZONTAL SCALING FIX:
+- Uses SHA256 for a stable hash across all Python processes/pods
+- Python's built-in hash() of a str varies between processes due to hash randomization (Python 3.3+)
+- Using SHA256 ensures all pods compute the same lock ID for the same lock name
 """

 import asyncio
 import time
+import hashlib
 from typing import Optional
 import logging
 from contextlib import asynccontextmanager
@@ -39,9 +45,20 @@ class DatabaseLock:
        self.lock_id = self._hash_lock_name(lock_name)

    def _hash_lock_name(self, name: str) -> int:
-        """Convert lock name to integer ID for PostgreSQL advisory lock"""
-        # Use hash and modulo to get a positive 32-bit integer
-        return abs(hash(name)) % (2**31)
+        """
+        Convert lock name to integer ID for PostgreSQL advisory lock.
+
+        CRITICAL: Uses SHA256 for a stable hash across all Python processes/pods.
+        Python's built-in hash() of a str varies between processes due to hash randomization
+        (PYTHONHASHSEED, enabled by default since Python 3.3), which would cause
+        different pods to compute different lock IDs for the same lock name,
+        defeating the purpose of distributed locking.
+        """
+        # Use SHA256 for stable, cross-process hash
+        hash_bytes = hashlib.sha256(name.encode('utf-8')).digest()
+        # Take the first 4 bytes (32 bits) and reduce modulo 2**31 to a non-negative 31-bit integer
+        # (PostgreSQL advisory locks accept a bigint key, but we stay within 31 bits for safety)
+        return int.from_bytes(hash_bytes[:4], 'big') % (2**31)

    @asynccontextmanager
    async def acquire(self, session: AsyncSession):