Fix production deadlock

This commit is contained in:
Urtzi Alfaro
2025-09-25 21:00:40 +02:00
parent 7cd0476812
commit cf4405b771
3 changed files with 111 additions and 23 deletions

View File

@@ -378,11 +378,20 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
} }
}, item_type='alert') }, item_type='alert')
# Mark as acknowledged to avoid duplicates # Mark as acknowledged to avoid duplicates - using proper session management
await self.db_manager.execute( try:
"UPDATE quality_checks SET acknowledged = true WHERE id = $1", from sqlalchemy import text
issue['id'] async with self.db_manager.get_session() as session:
) await session.execute(
text("UPDATE quality_checks SET acknowledged = true WHERE id = :id"),
{"id": issue['id']}
)
await session.commit()
except Exception as e:
logger.error("Failed to update quality check acknowledged status",
quality_check_id=str(issue.get('id')),
error=str(e))
# Don't raise here to avoid breaking the main flow
except Exception as e: except Exception as e:
logger.error("Error processing quality issue", logger.error("Error processing quality issue",
@@ -421,17 +430,20 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
for tenant_id in tenants: for tenant_id in tenants:
try: try:
from sqlalchemy import text from sqlalchemy import text
# Use a separate session for each tenant to avoid connection blocking
async with self.db_manager.get_session() as session: async with self.db_manager.get_session() as session:
result = await session.execute(text(query), {"tenant_id": tenant_id}) result = await session.execute(text(query), {"tenant_id": tenant_id})
equipment_list = result.fetchall() equipment_list = result.fetchall()
for equipment in equipment_list: for equipment in equipment_list:
# Process each equipment item in a non-blocking manner
await self._process_equipment_issue(equipment) await self._process_equipment_issue(equipment)
except Exception as e: except Exception as e:
logger.error("Error checking equipment status", logger.error("Error checking equipment status",
tenant_id=str(tenant_id), tenant_id=str(tenant_id),
error=str(e)) error=str(e))
# Continue processing other tenants despite this error
except Exception as e: except Exception as e:
logger.error("Equipment status check failed", error=str(e)) logger.error("Equipment status check failed", error=str(e))
@@ -558,17 +570,20 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
for tenant_id in tenants: for tenant_id in tenants:
try: try:
from sqlalchemy import text from sqlalchemy import text
# Use a separate session per tenant to avoid connection blocking
async with self.db_manager.get_session() as session: async with self.db_manager.get_session() as session:
result = await session.execute(text(query), {"tenant_id": tenant_id}) result = await session.execute(text(query), {"tenant_id": tenant_id})
recommendations = result.fetchall() recommendations = result.fetchall()
for rec in recommendations: for rec in recommendations:
# Process each recommendation individually
await self._generate_efficiency_recommendation(tenant_id, rec) await self._generate_efficiency_recommendation(tenant_id, rec)
except Exception as e: except Exception as e:
logger.error("Error generating efficiency recommendations", logger.error("Error generating efficiency recommendations",
tenant_id=str(tenant_id), tenant_id=str(tenant_id),
error=str(e)) error=str(e))
# Continue with other tenants despite this error
except Exception as e: except Exception as e:
logger.error("Efficiency recommendations failed", error=str(e)) logger.error("Efficiency recommendations failed", error=str(e))
@@ -665,6 +680,7 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
for tenant_id in tenants: for tenant_id in tenants:
try: try:
from sqlalchemy import text from sqlalchemy import text
# Use a separate session per tenant to avoid connection blocking
async with self.db_manager.get_session() as session: async with self.db_manager.get_session() as session:
result = await session.execute(text(query), {"tenant_id": tenant_id}) result = await session.execute(text(query), {"tenant_id": tenant_id})
energy_data = result.fetchall() energy_data = result.fetchall()
@@ -676,6 +692,7 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
logger.error("Error generating energy recommendations", logger.error("Error generating energy recommendations",
tenant_id=str(tenant_id), tenant_id=str(tenant_id),
error=str(e)) error=str(e))
# Continue with other tenants despite this error
except Exception as e: except Exception as e:
logger.error("Energy recommendations failed", error=str(e)) logger.error("Energy recommendations failed", error=str(e))

View File

@@ -6,6 +6,7 @@ Supports both alerts and recommendations through unified detection patterns
import asyncio import asyncio
import json import json
import random
import uuid import uuid
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from uuid import UUID from uuid import UUID
@@ -100,38 +101,64 @@ class BaseAlertService:
while True: while True:
try: try:
instance_id = getattr(self.config, 'INSTANCE_ID', 'default') instance_id = getattr(self.config, 'INSTANCE_ID', str(uuid.uuid4()))
was_leader = self.is_leader was_leader = self.is_leader
# Add jitter to avoid thundering herd when multiple instances start
if not was_leader:
await asyncio.sleep(random.uniform(0.1, 0.5)) # Small random delay before attempting to acquire
# Try to acquire new leadership if not currently leader # Try to acquire new leadership if not currently leader
if not self.is_leader: if not self.is_leader:
# Use atomic Redis operation to acquire lock
result = await self.redis.set( result = await self.redis.set(
lock_key, lock_key,
instance_id, instance_id,
ex=lock_ttl, ex=lock_ttl,
nx=True nx=True # Only set if key doesn't exist
) )
self.is_leader = result is not None acquired = result is not None
self.is_leader = acquired
else: else:
# Already leader - try to extend the lock # Already leader - try to extend the lock
current_value = await self.redis.get(lock_key) current_value = await self.redis.get(lock_key)
if current_value and current_value.decode() == instance_id: if current_value and current_value.decode() == instance_id:
# Still our lock, extend it # Still our lock, extend it using a Lua script for atomicity
await self.redis.expire(lock_key, lock_ttl) lua_script = """
self.is_leader = True if redis.call("GET", KEYS[1]) == ARGV[1] then
return redis.call("EXPIRE", KEYS[1], ARGV[2])
else
return 0
end
"""
try:
extend_result = await self.redis.eval(
lua_script,
keys=[lock_key],
args=[instance_id, lock_ttl]
)
self.is_leader = extend_result == 1
except:
# If Lua script fails (Redis cluster), fall back to simple get/set
self.is_leader = True # Keep current state if we can't verify
else: else:
# Lock expired or taken by someone else # Lock expired or taken by someone else
self.is_leader = False self.is_leader = False
# Handle leadership changes # Handle leadership changes
if self.is_leader and not was_leader: if self.is_leader and not was_leader:
self.scheduler.start() # Add a small delay to allow other instances to detect leadership change
logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME) await asyncio.sleep(0.1)
if self.is_leader: # Double-check we're still the leader
self.scheduler.start()
logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME)
elif not self.is_leader and was_leader: elif not self.is_leader and was_leader:
self.scheduler.shutdown() if self.scheduler.running:
self.scheduler.shutdown()
logger.info("Lost scheduler leadership", service=self.config.SERVICE_NAME) logger.info("Lost scheduler leadership", service=self.config.SERVICE_NAME)
await asyncio.sleep(lock_ttl // 2) # Add jitter to reduce contention between instances
await asyncio.sleep(lock_ttl // 2 + random.uniform(0, 2))
except Exception as e: except Exception as e:
logger.error("Leadership error", service=self.config.SERVICE_NAME, error=str(e)) logger.error("Leadership error", service=self.config.SERVICE_NAME, error=str(e))
@@ -189,9 +216,16 @@ class BaseAlertService:
"""Maintain database connection for listeners""" """Maintain database connection for listeners"""
try: try:
while not conn.is_closed(): while not conn.is_closed():
await asyncio.sleep(30) # Check every 30 seconds # Use a timeout to avoid hanging indefinitely
try: try:
await conn.fetchval("SELECT 1") await asyncio.wait_for(
conn.fetchval("SELECT 1"),
timeout=5.0
)
await asyncio.sleep(30) # Check every 30 seconds
except asyncio.TimeoutError:
logger.warning("DB ping timed out, connection may be dead", service=self.config.SERVICE_NAME)
break
except Exception as e: except Exception as e:
logger.error("DB listener connection lost", service=self.config.SERVICE_NAME, error=str(e)) logger.error("DB listener connection lost", service=self.config.SERVICE_NAME, error=str(e))
break break
@@ -226,12 +260,22 @@ class BaseAlertService:
# Determine routing key based on severity and type # Determine routing key based on severity and type
routing_key = get_routing_key(item_type, item['severity'], self.config.SERVICE_NAME) routing_key = get_routing_key(item_type, item['severity'], self.config.SERVICE_NAME)
# Publish to RabbitMQ # Publish to RabbitMQ with timeout to prevent blocking
success = await self.rabbitmq_client.publish_event( try:
exchange_name=self.exchange, success = await asyncio.wait_for(
routing_key=routing_key, self.rabbitmq_client.publish_event(
event_data=item exchange_name=self.exchange,
) routing_key=routing_key,
event_data=item
),
timeout=10.0 # 10 second timeout
)
except asyncio.TimeoutError:
logger.error("RabbitMQ publish timed out",
service=self.config.SERVICE_NAME,
item_type=item_type,
alert_type=item['type'])
return False
if success: if success:
self._items_published += 1 self._items_published += 1

View File

@@ -246,6 +246,33 @@ class DatabaseManager:
def __repr__(self) -> str: def __repr__(self) -> str:
return f"DatabaseManager(service='{self.service_name}', type='{self._get_database_type()}')" return f"DatabaseManager(service='{self.service_name}', type='{self._get_database_type()}')"
async def execute(self, query: str, *args, **kwargs):
"""
Execute a raw SQL query with proper session management
Note: Use this method carefully to avoid transaction conflicts
"""
from sqlalchemy import text
# Use a new session context to avoid conflicts with existing sessions
async with self.get_session() as session:
try:
# Convert query to SQLAlchemy text object if it's a string
if isinstance(query, str):
query = text(query)
result = await session.execute(query, *args, **kwargs)
# For UPDATE/DELETE operations that need to be committed
if query.text.strip().upper().startswith(('UPDATE', 'DELETE', 'INSERT')):
await session.commit()
return result
except Exception as e:
# Only rollback if it was a modifying operation
if isinstance(query, str) and query.strip().upper().startswith(('UPDATE', 'DELETE', 'INSERT')):
await session.rollback()
logger.error("Database execute failed", error=str(e))
raise
# ===== CONVENIENCE FUNCTIONS ===== # ===== CONVENIENCE FUNCTIONS =====