From e585e9fac03faa44d277af3e356ca5bd5546c767 Mon Sep 17 00:00:00 2001 From: Urtzi Alfaro Date: Wed, 5 Nov 2025 16:30:15 +0100 Subject: [PATCH] Fix critical nested session deadlock in training_service.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root Cause (Actual): The actual nested session issue was in training_service.py, not just in the trainer methods. The flow was: 1. training_service.py creates outer session (line 173) 2. Updates training_log at lines 235-237 (uncommitted) 3. Calls trainer.train_tenant_models() at line 239 4. Trainer creates its own session at line 93 5. DEADLOCK: Outer session holds an uncommitted UPDATE, so the inner session can't proceed Fix: Added explicit session.commit() after the ml_training progress update (line 241) to ensure the UPDATE is committed before the trainer creates its own session. This prevents the deadlock condition. Related to previous commit caff497, which fixed nested sessions in prophet_manager and hybrid_trainer but missed the actual root cause in training_service.py. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- services/training/app/services/training_service.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/training/app/services/training_service.py b/services/training/app/services/training_service.py index e5d0b2a0..156aa682 100644 --- a/services/training/app/services/training_service.py +++ b/services/training/app/services/training_service.py @@ -236,6 +236,11 @@ class EnhancedTrainingService: job_id, PROGRESS_ML_TRAINING_START, "ml_training", "running" ) + # ✅ FIX: Commit the session to prevent deadlock with trainer's nested session + # The trainer creates its own session, so we need to ensure this update is committed + await session.commit() + logger.debug("Committed session after ml_training progress update") + training_results = await self.trainer.train_tenant_models( tenant_id=tenant_id, training_dataset=training_dataset,