Add base kubernetes support final fix 4
This commit is contained in:
@@ -43,11 +43,71 @@ async def get_db_health() -> bool:
|
||||
await conn.execute(text("SELECT 1"))
|
||||
logger.debug("Database health check passed")
|
||||
return True
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Database health check failed", error=str(e))
|
||||
return False
|
||||
|
||||
async def get_comprehensive_db_health() -> dict:
    """
    Comprehensive health check that verifies both connectivity and table existence.

    Returns:
        A dict with the keys:
            status: "healthy" or "unhealthy".
            connectivity: result of ``get_db_health()``.
            tables_exist: result of ``_verify_tables_exist()``.
            tables_verified: names of tables whose probe query succeeded.
            missing_tables: names of tables whose probe query failed.
            errors: human-readable descriptions of every failure seen.

    Failures are reported through the returned dict rather than raised.
    """
    # Single source of truth for the tables this service depends on.
    required_tables = (
        'model_training_logs', 'trained_models', 'model_performance_metrics',
        'training_job_queue', 'model_artifacts',
    )

    health_status = {
        "status": "healthy",
        "connectivity": False,
        "tables_exist": False,
        "tables_verified": [],
        "missing_tables": [],
        "errors": []
    }

    try:
        # Test basic connectivity
        health_status["connectivity"] = await get_db_health()

        if not health_status["connectivity"]:
            # Without a connection the table checks cannot run; report early.
            health_status["status"] = "unhealthy"
            health_status["errors"].append("Database connectivity failed")
            return health_status

        # Test table existence
        tables_verified = await _verify_tables_exist()
        health_status["tables_exist"] = tables_verified

        if tables_verified:
            health_status["tables_verified"] = list(required_tables)
        else:
            health_status["status"] = "unhealthy"
            health_status["errors"].append("Required tables missing or inaccessible")

            # Try to identify which specific tables are missing
            try:
                async with database_manager.get_session() as session:
                    for table_name in required_tables:
                        try:
                            await session.execute(text(f"SELECT 1 FROM {table_name} LIMIT 1"))
                        except Exception:
                            health_status["missing_tables"].append(table_name)
                            # A failed statement leaves the transaction aborted on
                            # PostgreSQL; without a rollback every later probe on
                            # this session would fail and be misreported as missing.
                            # Assumes the session is a SQLAlchemy AsyncSession --
                            # TODO confirm against database_manager.get_session().
                            await session.rollback()
                        else:
                            health_status["tables_verified"].append(table_name)
            except Exception as e:
                health_status["errors"].append(f"Error checking individual tables: {str(e)}")

        logger.debug("Comprehensive database health check completed",
                     status=health_status["status"],
                     connectivity=health_status["connectivity"],
                     tables_exist=health_status["tables_exist"])

    except Exception as e:
        health_status["status"] = "unhealthy"
        health_status["errors"].append(f"Health check failed: {str(e)}")
        logger.error("Comprehensive database health check failed", error=str(e))

    return health_status
|
||||
|
||||
# Training service specific database utilities
|
||||
class TrainingDatabaseUtils:
|
||||
"""Training service specific database utilities"""
|
||||
@@ -223,27 +283,118 @@ async def get_db_session() -> AsyncGenerator[AsyncSession, None]:
|
||||
|
||||
# Database initialization for training service
|
||||
async def initialize_training_database():
    """Initialize database tables for training service with retry logic and verification.

    Each attempt performs four steps:
      1. test connectivity via ``database_manager.test_connection()``;
      2. import the training models and check that the expected tables are
         registered on ``Base.metadata``;
      3. create the tables via ``database_manager.create_tables()``;
      4. confirm the tables are queryable via ``_verify_tables_exist()``.

    A failed attempt is retried after an exponentially growing delay
    (2s, 4s, 8s, 16s) for up to five attempts in total.

    Raises:
        Exception: if the final attempt fails; the underlying error is
            attached as ``__cause__``.
    """
    import asyncio

    max_retries = 5
    retry_delay = 2.0

    for attempt in range(1, max_retries + 1):
        try:
            logger.info("Initializing training service database",
                        attempt=attempt, max_retries=max_retries)

            # Step 1: Test database connectivity first
            logger.info("Testing database connectivity...")
            connection_ok = await database_manager.test_connection()
            if not connection_ok:
                raise Exception("Database connection test failed")
            logger.info("Database connectivity verified")

            # Step 2: Import models to ensure they're registered.
            # The names are unused on purpose: the import is what matters.
            logger.info("Importing and registering database models...")
            from app.models.training import (  # noqa: F401
                ModelTrainingLog,
                TrainedModel,
                ModelPerformanceMetric,
                TrainingJobQueue,
                ModelArtifact
            )

            # Verify models are registered in metadata
            expected_tables = {
                'model_training_logs', 'trained_models', 'model_performance_metrics',
                'training_job_queue', 'model_artifacts'
            }
            registered_tables = set(Base.metadata.tables.keys())
            missing_tables = expected_tables - registered_tables
            if missing_tables:
                raise Exception(f"Models not properly registered: {missing_tables}")

            logger.info("Models registered successfully",
                        tables=list(registered_tables))

            # Step 3: Create tables using shared infrastructure with verification
            logger.info("Creating database tables...")
            await database_manager.create_tables()

            # Step 4: Verify tables were actually created
            logger.info("Verifying table creation...")
            verification_successful = await _verify_tables_exist()

            if not verification_successful:
                raise Exception("Table verification failed - tables were not created properly")

            logger.info("Training service database initialized and verified successfully",
                        attempt=attempt)
            return

        except Exception as e:
            logger.error("Database initialization failed",
                         attempt=attempt,
                         max_retries=max_retries,
                         error=str(e))

            if attempt == max_retries:
                logger.error("All database initialization attempts failed - giving up")
                # Chain the cause so the original traceback is not lost.
                raise Exception(
                    f"Failed to initialize training database after {max_retries} attempts: {str(e)}"
                ) from e

            # Wait before retry with exponential backoff
            wait_time = retry_delay * (2 ** (attempt - 1))
            logger.info("Retrying database initialization",
                        retry_in_seconds=wait_time,
                        next_attempt=attempt + 1)
            await asyncio.sleep(wait_time)
|
||||
|
||||
async def _verify_tables_exist() -> bool:
    """Verify that all required tables exist in the database.

    Runs ``SELECT 1 FROM <table> LIMIT 1`` against each required table on a
    single session. This function only checks; it does not create tables.

    Returns:
        True when no probe reported a "does not exist" error.
        False when a table is reported as not existing, or when the check
        itself fails (for example the session cannot be opened).
    """
    required_tables = [
        'model_training_logs',
        'trained_models',
        'model_performance_metrics',
        'training_job_queue',
        'model_artifacts'
    ]

    try:
        async with database_manager.get_session() as session:
            # Check each required table exists and is accessible
            for table_name in required_tables:
                try:
                    # Try to query the table structure
                    await session.execute(
                        text(f"SELECT 1 FROM {table_name} LIMIT 1")
                    )
                    logger.debug(f"Table {table_name} exists and is accessible")
                except Exception as table_error:
                    message = str(table_error).lower()

                    # If it's a "relation does not exist" error, table creation failed
                    if "does not exist" in message:
                        logger.error(f"Table {table_name} does not exist", error=str(table_error))
                        return False

                    # If it's an empty table, that's fine - table exists
                    if "no data" in message:
                        logger.debug(f"Table {table_name} exists but is empty (normal)")
                    else:
                        # NOTE(review): unexpected errors are only logged and do
                        # not fail verification -- confirm this is intended.
                        logger.warning(f"Unexpected error querying {table_name}", error=str(table_error))

                    # Reset the failed transaction so the remaining probes run
                    # against a usable session; otherwise a later missing table
                    # could surface as an "aborted transaction" error and slip
                    # past the "does not exist" check above.
                    # Assumes the session is a SQLAlchemy AsyncSession --
                    # TODO confirm against database_manager.get_session().
                    await session.rollback()

        logger.info("All required tables verified successfully",
                    tables=required_tables)
        return True

    except Exception as e:
        # Report failure through the return value, as the bool contract promises.
        logger.error("Table verification failed", error=str(e))
        return False
|
||||
|
||||
# Database cleanup for training service
|
||||
async def cleanup_training_database():
|
||||
|
||||
Reference in New Issue
Block a user