Add base kubernetes support final fix 4

2025-09-29 07:54:25 +02:00
parent 57f77638cc
commit 4777e59e7a
14 changed files with 1041 additions and 167 deletions
--- a/services/training/app/core/database.py
+++ b/services/training/app/core/database.py
@@ -43,11 +43,71 @@ async def get_db_health() -> bool:
            await conn.execute(text("SELECT 1"))
        logger.debug("Database health check passed")
        return True
-        
+
    except Exception as e:
        logger.error("Database health check failed", error=str(e))
        return False

+async def get_comprehensive_db_health() -> dict:
+    """
+    Comprehensive health check that verifies both connectivity and table existence.
+    Returns a status dict; exceptions are caught and reported under "errors".
+    """
+    required_tables = (
+        'model_training_logs', 'trained_models', 'model_performance_metrics',
+        'training_job_queue', 'model_artifacts'
+    )
+    health_status = {
+        "status": "healthy",
+        "connectivity": False,
+        "tables_exist": False,
+        "tables_verified": [],
+        "missing_tables": [],
+        "errors": []
+    }
+
+    try:
+        # Test basic connectivity
+        health_status["connectivity"] = await get_db_health()
+        if not health_status["connectivity"]:
+            health_status["status"] = "unhealthy"
+            health_status["errors"].append("Database connectivity failed")
+            return health_status
+
+        # Test table existence
+        tables_verified = await _verify_tables_exist()
+        health_status["tables_exist"] = tables_verified
+        if tables_verified:
+            health_status["tables_verified"] = list(required_tables)
+        else:
+            health_status["status"] = "unhealthy"
+            health_status["errors"].append("Required tables missing or inaccessible")
+            # Try to identify which specific tables are missing
+            try:
+                async with database_manager.get_session() as session:
+                    for table_name in required_tables:
+                        try:
+                            await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
+                            health_status["tables_verified"].append(table_name)
+                        except Exception:
+                            health_status["missing_tables"].append(table_name)
+                            # A failed statement can leave the transaction aborted (e.g.
+                            # PostgreSQL); roll back so each table is judged on its own.
+                            await session.rollback()
+            except Exception as e:
+                health_status["errors"].append(f"Error checking individual tables: {str(e)}")
+
+        logger.debug("Comprehensive database health check completed",
+                    status=health_status["status"],
+                    connectivity=health_status["connectivity"],
+                    tables_exist=health_status["tables_exist"])
+    except Exception as e:
+        health_status["status"] = "unhealthy"
+        health_status["errors"].append(f"Health check failed: {str(e)}")
+        logger.error("Comprehensive database health check failed", error=str(e))
+
+    return health_status
+
 # Training service specific database utilities
 class TrainingDatabaseUtils:
    """Training service specific database utilities"""
@@ -223,27 +283,118 @@ async def get_db_session() -> AsyncGenerator[AsyncSession, None]:

 # Database initialization for training service
 async def initialize_training_database():
-    """Initialize database tables for training service"""
+    """
+    Initialize database tables for training service with retry logic and verification.
+
+    Up to 5 attempts, waiting 2s/4s/8s/16s between them; raises Exception after the last.
+    """
+    import asyncio
+
+    max_retries = 5
+    retry_delay = 2.0
+
+    for attempt in range(1, max_retries + 1):
+        try:
+            logger.info("Initializing training service database",
+                       attempt=attempt, max_retries=max_retries)
+
+            # Step 1: Test database connectivity first
+            logger.info("Testing database connectivity...")
+            connection_ok = await database_manager.test_connection()
+            if not connection_ok:
+                raise Exception("Database connection test failed")
+            logger.info("Database connectivity verified")
+
+            # Step 2: Import models to ensure they're registered
+            logger.info("Importing and registering database models...")
+            from app.models.training import (
+                ModelTrainingLog,
+                TrainedModel,
+                ModelPerformanceMetric,
+                TrainingJobQueue,
+                ModelArtifact
+            )
+
+            # Verify models are registered in metadata
+            expected_tables = {
+                'model_training_logs', 'trained_models', 'model_performance_metrics',
+                'training_job_queue', 'model_artifacts'
+            }
+            registered_tables = set(Base.metadata.tables.keys())
+            missing_tables = expected_tables - registered_tables
+            if missing_tables:
+                raise Exception(f"Models not properly registered: {missing_tables}")
+
+            logger.info("Models registered successfully",
+                       tables=list(registered_tables))
+
+            # Step 3: Create tables using shared infrastructure with verification
+            logger.info("Creating database tables...")
+            await database_manager.create_tables()
+
+            # Step 4: Verify tables were actually created
+            logger.info("Verifying table creation...")
+            verification_successful = await _verify_tables_exist()
+            if not verification_successful:
+                raise Exception("Table verification failed - tables were not created properly")
+
+            logger.info("Training service database initialized and verified successfully",
+                       attempt=attempt)
+            return
+
+        except Exception as e:
+            logger.error("Database initialization failed",
+                        attempt=attempt, max_retries=max_retries, error=str(e))
+
+            if attempt == max_retries:
+                logger.error("All database initialization attempts failed - giving up")
+                raise Exception(f"Failed to initialize training database after {max_retries} attempts: {str(e)}") from e
+
+            # Wait before retry with exponential backoff
+            wait_time = retry_delay * (2 ** (attempt - 1))
+            logger.info("Retrying database initialization",
+                       retry_in_seconds=wait_time,
+                       next_attempt=attempt + 1)
+            await asyncio.sleep(wait_time)
+
+async def _verify_tables_exist() -> bool:
+    """Return True only if every required table can be queried; errors yield False."""
     try:
-        logger.info("Initializing training service database")
-        
-        # Import models to ensure they're registered
-        from app.models.training import (
-            ModelTrainingLog, 
-            TrainedModel, 
-            ModelPerformanceMetric, 
-            TrainingJobQueue, 
-            ModelArtifact
-        )
-        
-        # Create tables using shared infrastructure
-        await database_manager.create_tables()
-        
-        logger.info("Training service database initialized successfully")
-        
+        async with database_manager.get_session() as session:
+            # Check each required table exists and is accessible
+            required_tables = [
+                'model_training_logs',
+                'trained_models',
+                'model_performance_metrics',
+                'training_job_queue',
+                'model_artifacts'
+            ]
+
+            for table_name in required_tables:
+                try:
+                    await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
+                    logger.debug(f"Table {table_name} exists and is accessible")
+                except Exception as table_error:
+                    error_text = str(table_error).lower()
+                    # If it's a "relation does not exist" error, table creation failed
+                    if "does not exist" in error_text:
+                        logger.error(f"Table {table_name} does not exist", error=str(table_error))
+                        return False
+                    # A "no data" error is treated as an existing-but-empty table
+                    elif "no data" in error_text:
+                        logger.debug(f"Table {table_name} exists but is empty (normal)")
+                    else:
+                        # Table is not usable; do not fall through to the success path
+                        logger.error(f"Unexpected error querying {table_name}", error=str(table_error))
+                        return False
+
+            logger.info("All required tables verified successfully",
+                       tables=required_tables)
+            return True
+
     except Exception as e:
-        logger.error("Failed to initialize training service database", error=str(e))
-        raise
+        logger.error("Table verification failed", error=str(e))
+        return False

 # Database cleanup for training service
 async def cleanup_training_database():