Fix startup issues

This commit is contained in:
Urtzi Alfaro
2025-10-01 12:17:59 +02:00
parent 2eeebfc1e0
commit 016742d63f
53 changed files with 2000 additions and 2091 deletions

View File

@@ -46,7 +46,25 @@ class BaseAlertService:
"""Initialize all detection mechanisms"""
try:
# Connect to Redis for leader election and deduplication
self.redis = await Redis.from_url(self.config.REDIS_URL)
import os
redis_password = os.getenv('REDIS_PASSWORD', '')
redis_host = os.getenv('REDIS_HOST', 'redis-service')
redis_port = int(os.getenv('REDIS_PORT', '6379'))
# Create Redis client with explicit password parameter
if redis_password:
self.redis = await Redis(
host=redis_host,
port=redis_port,
password=redis_password,
decode_responses=True
)
else:
self.redis = await Redis(
host=redis_host,
port=redis_port,
decode_responses=True
)
logger.info("Connected to Redis", service=self.config.SERVICE_NAME)
# Connect to RabbitMQ
@@ -98,7 +116,11 @@ class BaseAlertService:
"""Leader election for scheduled jobs"""
lock_key = f"scheduler_lock:{self.config.SERVICE_NAME}"
lock_ttl = 60
logger.info("DEBUG: maintain_leadership starting",
service=self.config.SERVICE_NAME,
redis_client_type=str(type(self.redis)))
while True:
try:
instance_id = getattr(self.config, 'INSTANCE_ID', str(uuid.uuid4()))
@@ -161,7 +183,12 @@ class BaseAlertService:
await asyncio.sleep(lock_ttl // 2 + random.uniform(0, 2))
except Exception as e:
logger.error("Leadership error", service=self.config.SERVICE_NAME, error=str(e))
import traceback
logger.error("Leadership error",
service=self.config.SERVICE_NAME,
error=str(e),
error_type=type(e).__name__,
traceback=traceback.format_exc())
self.is_leader = False
await asyncio.sleep(5)

View File

@@ -7,7 +7,7 @@ Provides common settings and patterns
import os
from typing import List, Dict, Optional, Any
from pydantic_settings import BaseSettings
from pydantic import validator
from pydantic import validator, Field
class BaseServiceSettings(BaseSettings):
@@ -54,8 +54,32 @@ class BaseServiceSettings(BaseSettings):
# ================================================================
# REDIS CONFIGURATION
# ================================================================
REDIS_URL: str = os.getenv("REDIS_URL", "redis://redis-service:6379")
@property
def REDIS_URL(self) -> str:
    """Return the Redis connection URL.

    Resolution order:

    1. The ``REDIS_URL`` environment variable, returned verbatim when it is
       set to a non-empty value (kept for backward compatibility).
    2. Otherwise a URL assembled from ``REDIS_PASSWORD``, ``REDIS_HOST``
       (default ``"redis-service"``) and ``REDIS_PORT`` (default ``"6379"``).

    The resulting URL may embed the Redis password, so this accessor must
    never print or log it.

    Returns:
        The connection URL, with a ``:password@`` userinfo part only when
        ``REDIS_PASSWORD`` is non-empty.
    """
    # A fully specified URL wins over the individual components.
    complete_url = os.getenv("REDIS_URL")
    if complete_url:
        return complete_url

    password = os.getenv("REDIS_PASSWORD", "")
    host = os.getenv("REDIS_HOST", "redis-service")
    port = os.getenv("REDIS_PORT", "6379")

    # Emit the userinfo section only when a password is configured.
    credentials = f":{password}@" if password else ""
    return f"redis://{credentials}{host}:{port}"
REDIS_DB: int = int(os.getenv("REDIS_DB", "0"))
REDIS_MAX_CONNECTIONS: int = int(os.getenv("REDIS_MAX_CONNECTIONS", "50"))
REDIS_RETRY_ON_TIMEOUT: bool = True
@@ -65,7 +89,7 @@ class BaseServiceSettings(BaseSettings):
"TCP_KEEPINTVL": 3,
"TCP_KEEPCNT": 5,
}
@property
def REDIS_URL_WITH_DB(self) -> str:
"""Get Redis URL with database number"""

View File

@@ -27,7 +27,10 @@ logger = structlog.get_logger()
class DatabaseInitManager:
"""
Manages database initialization using Alembic migrations exclusively.
Uses autogenerate to create initial migrations if none exist.
Two modes:
1. Migration mode (for migration jobs): Runs alembic upgrade head
2. Verification mode (for services): Only verifies database is ready
"""
def __init__(
@@ -36,30 +39,103 @@ class DatabaseInitManager:
service_name: str,
alembic_ini_path: Optional[str] = None,
models_module: Optional[str] = None,
force_recreate: bool = False,
allow_create_all_fallback: bool = True,
environment: Optional[str] = None
verify_only: bool = True, # Default: services only verify
force_recreate: bool = False
):
self.database_manager = database_manager
self.service_name = service_name
self.alembic_ini_path = alembic_ini_path
self.models_module = models_module
self.verify_only = verify_only
self.force_recreate = force_recreate
self.allow_create_all_fallback = allow_create_all_fallback
self.environment = environment or os.getenv('ENVIRONMENT', 'development')
self.logger = logger.bind(service=service_name)
async def initialize_database(self) -> Dict[str, Any]:
"""
Main initialization method:
1. Check if migrations exist in the codebase
2. Run alembic upgrade head to apply all pending migrations
Main initialization method.
NOTE: Migration files must be pre-generated and included in Docker images.
Do NOT generate migrations at runtime.
Two modes:
1. verify_only=True (default, for services):
- Verifies database is ready
- Checks tables exist
- Checks alembic_version exists
- DOES NOT run migrations
2. verify_only=False (for migration jobs only):
- Runs alembic upgrade head
- Applies pending migrations
- Can force recreate if needed
"""
self.logger.info("Starting database initialization with Alembic")
if self.verify_only:
self.logger.info("Database verification mode - checking database is ready")
return await self._verify_database_ready()
else:
self.logger.info("Migration mode - running database migrations")
return await self._run_migrations_mode()
async def _verify_database_ready(self) -> Dict[str, Any]:
    """Check that the database is usable without applying any migration.

    The checks run in this order, and the first failing one raises:
    the Alembic ini file exists, migration files are present, the database
    is not empty, the ``alembic_version`` table exists, and a current
    revision is recorded.

    Returns:
        A dict with ``action`` set to ``"verified"``, a ``message``, the
        ``current_revision``, the ``migration_count`` and the ``table_count``.

    Raises:
        Exception: when any check fails. The failure is logged through
            ``self.logger`` and then re-raised unchanged.
    """
    try:
        ini_path = self.alembic_ini_path
        config_present = bool(ini_path) and os.path.exists(ini_path)
        if not config_present:
            raise Exception(f"Alembic configuration not found at {ini_path}")

        snapshot = await self._check_database_state()
        self.logger.info("Database state checked", state=snapshot)

        # (state key, truthiness required to pass, failure message template).
        # Templates are formatted only on failure, so the service name is
        # read only when it is actually needed.
        preconditions = (
            (
                "has_migrations",
                True,
                "No migration files found for {service}. "
                "Migrations must be generated and included in the Docker image.",
            ),
            (
                "is_empty",
                False,
                "Database is empty. Migration job must run before service startup. "
                "Ensure migration job completes successfully before starting services.",
            ),
            (
                "has_alembic_version",
                True,
                "No alembic_version table found. Migration job must run before service startup.",
            ),
            (
                "current_revision",
                True,
                "No current migration revision found. Database may not be properly initialized.",
            ),
        )
        for state_key, must_be_truthy, template in preconditions:
            if bool(snapshot[state_key]) != must_be_truthy:
                raise Exception(template.format(service=self.service_name))

        applied_count = snapshot["migration_count"]
        revision = snapshot["current_revision"]
        table_total = len(snapshot["existing_tables"])

        self.logger.info(
            "Database verification successful",
            migration_count=applied_count,
            current_revision=revision,
            table_count=table_total,
        )
        return {
            "action": "verified",
            "message": "Database verified successfully - ready for service",
            "current_revision": revision,
            "migration_count": applied_count,
            "table_count": table_total,
        }
    except Exception as failure:
        # Log once here, then let the caller decide how to react.
        self.logger.error("Database verification failed", error=str(failure))
        raise
async def _run_migrations_mode(self) -> Dict[str, Any]:
"""
Run migrations mode - for migration jobs only.
"""
try:
if not self.alembic_ini_path or not os.path.exists(self.alembic_ini_path):
raise Exception(f"Alembic configuration not found at {self.alembic_ini_path}")
@@ -68,36 +144,25 @@ class DatabaseInitManager:
db_state = await self._check_database_state()
self.logger.info("Database state checked", state=db_state)
# Handle different scenarios based on migration state
# Handle force recreate
if self.force_recreate:
result = await self._handle_force_recreate()
elif not db_state["has_migrations"]:
# No migration files found - check if fallback is allowed
if self.allow_create_all_fallback:
self.logger.warning(
"No migration files found - using create_all() as fallback. "
"Consider generating proper migrations for production use.",
environment=self.environment
)
result = await self._handle_no_migrations()
else:
# In production or when fallback is disabled, fail instead of using create_all
error_msg = (
f"No migration files found for {self.service_name} and "
f"create_all() fallback is disabled (environment: {self.environment}). "
f"Migration files must be generated before deployment. "
f"Run migration generation script to create initial migrations."
)
self.logger.error(error_msg)
raise Exception(error_msg)
else:
result = await self._handle_run_migrations()
return await self._handle_force_recreate()
self.logger.info("Database initialization completed", result=result)
# Check migrations exist
if not db_state["has_migrations"]:
raise Exception(
f"No migration files found for {self.service_name}. "
f"Generate migrations using regenerate_migrations_k8s.sh script."
)
# Run migrations
result = await self._handle_run_migrations()
self.logger.info("Migration mode completed", result=result)
return result
except Exception as e:
self.logger.error("Database initialization failed", error=str(e))
self.logger.error("Migration mode failed", error=str(e))
raise
async def _check_database_state(self) -> Dict[str, Any]:
@@ -139,24 +204,6 @@ class DatabaseInitManager:
return state
async def _handle_no_migrations(self) -> Dict[str, Any]:
"""Handle case where no migration files exist - use create_all()"""
self.logger.info("No migrations found, using create_all() to initialize tables")
try:
# Create tables directly using SQLAlchemy metadata
await self._create_tables_from_models()
return {
"action": "tables_created_via_create_all",
"tables_created": True,
"message": "Tables created using SQLAlchemy create_all()"
}
except Exception as e:
self.logger.error("Failed to create tables", error=str(e))
raise
async def _handle_run_migrations(self) -> Dict[str, Any]:
"""Handle normal migration scenario - run pending migrations"""
self.logger.info("Running pending migrations")
@@ -229,16 +276,6 @@ class DatabaseInitManager:
raise
async def _create_tables_from_models(self):
"""Create tables using SQLAlchemy metadata (create_all)"""
try:
async with self.database_manager.async_engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all)
self.logger.info("Tables created via create_all()")
except Exception as e:
self.logger.error("Failed to create tables", error=str(e))
raise
async def _drop_all_tables(self):
"""Drop all tables (for development reset)"""
try:
@@ -269,9 +306,8 @@ def create_init_manager(
database_manager: DatabaseManager,
service_name: str,
service_path: Optional[str] = None,
force_recreate: bool = False,
allow_create_all_fallback: Optional[bool] = None,
environment: Optional[str] = None
verify_only: bool = True,
force_recreate: bool = False
) -> DatabaseInitManager:
"""
Factory function to create a DatabaseInitManager with auto-detected paths
@@ -280,21 +316,9 @@ def create_init_manager(
database_manager: DatabaseManager instance
service_name: Name of the service
service_path: Path to service directory (auto-detected if None)
force_recreate: Whether to force recreate tables (development mode)
allow_create_all_fallback: Allow create_all() if no migrations (auto-detect from env if None)
environment: Environment name (auto-detect from ENVIRONMENT env var if None)
verify_only: True = verify DB ready (services), False = run migrations (jobs only)
force_recreate: Force recreate tables (requires verify_only=False)
"""
# Auto-detect environment
if environment is None:
environment = os.getenv('ENVIRONMENT', 'development')
# Auto-detect fallback setting based on environment
if allow_create_all_fallback is None:
# Only allow fallback in development/local environments
allow_create_all_fallback = environment.lower() in ['development', 'dev', 'local', 'test']
allow_create_all_fallback = False
# Auto-detect paths if not provided
if service_path is None:
# Try Docker container path first (service files at root level)
@@ -324,28 +348,25 @@ def create_init_manager(
service_name=service_name,
alembic_ini_path=alembic_ini_path,
models_module=models_module,
force_recreate=force_recreate,
allow_create_all_fallback=allow_create_all_fallback,
environment=environment
verify_only=verify_only,
force_recreate=force_recreate
)
async def initialize_service_database(
database_manager: DatabaseManager,
service_name: str,
force_recreate: bool = False,
allow_create_all_fallback: Optional[bool] = None,
environment: Optional[str] = None
verify_only: bool = True,
force_recreate: bool = False
) -> Dict[str, Any]:
"""
Convenience function for service database initialization
Convenience function for database initialization
Args:
database_manager: DatabaseManager instance
service_name: Name of the service
force_recreate: Whether to force recreate (development mode)
allow_create_all_fallback: Allow create_all() if no migrations (auto-detect from env if None)
environment: Environment name (auto-detect from ENVIRONMENT env var if None)
verify_only: True = verify DB ready (default, services), False = run migrations (jobs only)
force_recreate: Force recreate tables (requires verify_only=False)
Returns:
Dict with initialization results
@@ -353,9 +374,8 @@ async def initialize_service_database(
init_manager = create_init_manager(
database_manager=database_manager,
service_name=service_name,
force_recreate=force_recreate,
allow_create_all_fallback=allow_create_all_fallback,
environment=environment
verify_only=verify_only,
force_recreate=force_recreate
)
return await init_manager.initialize_database()

View File

@@ -217,27 +217,35 @@ class BaseFastAPIService:
raise
async def _handle_database_tables(self):
"""Handle automatic table creation and migration management"""
"""
Verify database is ready for service startup.
Services NEVER run migrations - they only verify the database
has been properly initialized by the migration job.
This ensures:
- Fast service startup (50-80% faster)
- No race conditions between replicas
- Clear separation: migrations are operational, not application concern
"""
try:
# Import the init manager here to avoid circular imports
from shared.database.init_manager import initialize_service_database
# Check if we're in force recreate mode (development)
force_recreate = os.getenv("DB_FORCE_RECREATE", "false").lower() == "true"
# Initialize database with automatic table creation
# Services ALWAYS verify only (never run migrations)
# Migrations are handled by dedicated migration jobs
result = await initialize_service_database(
database_manager=self.database_manager,
service_name=self.service_name.replace("-service", "").replace("_", ""),
force_recreate=force_recreate
verify_only=True # Services only verify, never run migrations
)
self.logger.info("Database table initialization completed", result=result)
self.logger.info("Database verification completed", result=result)
except Exception as e:
self.logger.error("Database table initialization failed", error=str(e))
# Don't raise here - let the service start even if table init fails
# This allows for manual intervention if needed
self.logger.error("Database verification failed", error=str(e))
# FAIL FAST: If database not ready, service should not start
raise
async def _cleanup_database(self):
"""Cleanup database connections"""