Fix production deadlock
@@ -378,11 +378,20 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
                     }
                 }, item_type='alert')
 
-                # Mark as acknowledged to avoid duplicates
-                await self.db_manager.execute(
-                    "UPDATE quality_checks SET acknowledged = true WHERE id = $1",
-                    issue['id']
-                )
+                # Mark as acknowledged to avoid duplicates - using proper session management
+                try:
+                    from sqlalchemy import text
+                    async with self.db_manager.get_session() as session:
+                        await session.execute(
+                            text("UPDATE quality_checks SET acknowledged = true WHERE id = :id"),
+                            {"id": issue['id']}
+                        )
+                        await session.commit()
+                except Exception as e:
+                    logger.error("Failed to update quality check acknowledged status",
+                                 quality_check_id=str(issue.get('id')),
+                                 error=str(e))
+                    # Don't raise here to avoid breaking the main flow
 
             except Exception as e:
                 logger.error("Error processing quality issue",
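Note on the hunk above: the old code handed an asyncpg-style positional placeholder ($1) to db_manager.execute, while the new code builds a SQLAlchemy text() statement with a named bind (:id) and commits inside a short-lived session of its own. A minimal sketch of that pattern, assuming a standard SQLAlchemy async session factory; acknowledge_quality_check and session_factory are illustrative names, not identifiers from this repository:

    from sqlalchemy import text

    async def acknowledge_quality_check(session_factory, check_id) -> None:
        # text() recognises named :id binds, not asyncpg's positional $1 placeholders.
        async with session_factory() as session:
            await session.execute(
                text("UPDATE quality_checks SET acknowledged = true WHERE id = :id"),
                {"id": check_id},
            )
            # An explicit commit in a session opened just for this statement keeps
            # the row lock from being held across unrelated work.
            await session.commit()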
@@ -421,17 +430,20 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
             for tenant_id in tenants:
                 try:
                     from sqlalchemy import text
+                    # Use a separate session for each tenant to avoid connection blocking
                     async with self.db_manager.get_session() as session:
                         result = await session.execute(text(query), {"tenant_id": tenant_id})
                         equipment_list = result.fetchall()
 
                         for equipment in equipment_list:
+                            # Process each equipment item in a non-blocking manner
                             await self._process_equipment_issue(equipment)
 
                 except Exception as e:
                     logger.error("Error checking equipment status",
                                  tenant_id=str(tenant_id),
                                  error=str(e))
+                    # Continue processing other tenants despite this error
 
         except Exception as e:
             logger.error("Equipment status check failed", error=str(e))
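The same per-tenant shape recurs in the efficiency and energy hunks below: a fresh session per tenant, with failures logged so the loop continues to the next tenant. Distilled into a standalone sketch under those assumptions; for_each_tenant, db_manager, query, and process are stand-ins rather than names from the repository:

    import logging

    from sqlalchemy import text

    logger = logging.getLogger(__name__)

    async def for_each_tenant(db_manager, tenants, query: str, process) -> None:
        for tenant_id in tenants:
            try:
                # One session per tenant, so a slow or failing tenant cannot pin
                # a shared connection for the whole sweep.
                async with db_manager.get_session() as session:
                    result = await session.execute(text(query), {"tenant_id": tenant_id})
                    for row in result.fetchall():
                        await process(tenant_id, row)
            except Exception:
                # Log and keep going; the remaining tenants are still processed.
                logger.exception("tenant %s failed", tenant_id)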
@@ -558,17 +570,20 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
             for tenant_id in tenants:
                 try:
                     from sqlalchemy import text
+                    # Use a separate session per tenant to avoid connection blocking
                     async with self.db_manager.get_session() as session:
                         result = await session.execute(text(query), {"tenant_id": tenant_id})
                         recommendations = result.fetchall()
 
                         for rec in recommendations:
+                            # Process each recommendation individually
                             await self._generate_efficiency_recommendation(tenant_id, rec)
 
                 except Exception as e:
                     logger.error("Error generating efficiency recommendations",
                                  tenant_id=str(tenant_id),
                                  error=str(e))
+                    # Continue with other tenants despite this error
 
         except Exception as e:
             logger.error("Efficiency recommendations failed", error=str(e))
@@ -665,6 +680,7 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
             for tenant_id in tenants:
                 try:
                     from sqlalchemy import text
+                    # Use a separate session per tenant to avoid connection blocking
                     async with self.db_manager.get_session() as session:
                         result = await session.execute(text(query), {"tenant_id": tenant_id})
                         energy_data = result.fetchall()
@@ -676,6 +692,7 @@ class ProductionAlertService(BaseAlertService, AlertServiceMixin):
                     logger.error("Error generating energy recommendations",
                                  tenant_id=str(tenant_id),
                                  error=str(e))
+                    # Continue with other tenants despite this error
 
         except Exception as e:
             logger.error("Energy recommendations failed", error=str(e))
@@ -6,6 +6,7 @@ Supports both alerts and recommendations through unified detection patterns
 
 import asyncio
 import json
+import random
 import uuid
 from typing import List, Dict, Any, Optional
 from uuid import UUID
@@ -100,38 +101,64 @@ class BaseAlertService:
 
         while True:
             try:
-                instance_id = getattr(self.config, 'INSTANCE_ID', 'default')
+                instance_id = getattr(self.config, 'INSTANCE_ID', str(uuid.uuid4()))
                 was_leader = self.is_leader
 
+                # Add jitter to avoid thundering herd when multiple instances start
+                if not was_leader:
+                    await asyncio.sleep(random.uniform(0.1, 0.5))  # Small random delay before attempting to acquire
+
                 # Try to acquire new leadership if not currently leader
                 if not self.is_leader:
+                    # Use atomic Redis operation to acquire lock
                     result = await self.redis.set(
                         lock_key,
                         instance_id,
                         ex=lock_ttl,
-                        nx=True
+                        nx=True  # Only set if key doesn't exist
                     )
-                    self.is_leader = result is not None
+                    acquired = result is not None
+                    self.is_leader = acquired
                 else:
                     # Already leader - try to extend the lock
                     current_value = await self.redis.get(lock_key)
                     if current_value and current_value.decode() == instance_id:
-                        # Still our lock, extend it
-                        await self.redis.expire(lock_key, lock_ttl)
-                        self.is_leader = True
+                        # Still our lock, extend it using a Lua script for atomicity
+                        lua_script = """
+                        if redis.call("GET", KEYS[1]) == ARGV[1] then
+                            return redis.call("EXPIRE", KEYS[1], ARGV[2])
+                        else
+                            return 0
+                        end
+                        """
+                        try:
+                            extend_result = await self.redis.eval(
+                                lua_script,
+                                keys=[lock_key],
+                                args=[instance_id, lock_ttl]
+                            )
+                            self.is_leader = extend_result == 1
+                        except:
+                            # If Lua script fails (Redis cluster), fall back to simple get/set
+                            self.is_leader = True  # Keep current state if we can't verify
                     else:
                         # Lock expired or taken by someone else
                         self.is_leader = False
 
                 # Handle leadership changes
                 if self.is_leader and not was_leader:
-                    self.scheduler.start()
-                    logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME)
+                    # Add a small delay to allow other instances to detect leadership change
+                    await asyncio.sleep(0.1)
+                    if self.is_leader:  # Double-check we're still the leader
+                        self.scheduler.start()
+                        logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME)
                 elif not self.is_leader and was_leader:
-                    self.scheduler.shutdown()
+                    if self.scheduler.running:
+                        self.scheduler.shutdown()
                     logger.info("Lost scheduler leadership", service=self.config.SERVICE_NAME)
 
-                await asyncio.sleep(lock_ttl // 2)
+                # Add jitter to reduce contention between instances
+                await asyncio.sleep(lock_ttl // 2 + random.uniform(0, 2))
 
             except Exception as e:
                 logger.error("Leadership error", service=self.config.SERVICE_NAME, error=str(e))
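The leadership loop above combines three ideas: SET with NX and EX to acquire the lock atomically, a GET-then-EXPIRE Lua script so the lock is only extended by its current owner, and jitter so instances do not renew in lockstep. A compact sketch of that pattern, assuming redis-py's asyncio client (redis.asyncio); hold_leadership and the key/TTL values are illustrative:

    import asyncio
    import random
    import uuid

    import redis.asyncio as redis

    EXTEND_IF_OWNER = """
    if redis.call("GET", KEYS[1]) == ARGV[1] then
        return redis.call("EXPIRE", KEYS[1], ARGV[2])
    else
        return 0
    end
    """

    async def hold_leadership(client: redis.Redis, lock_key: str, ttl: int = 30) -> None:
        instance_id = str(uuid.uuid4())
        extend = client.register_script(EXTEND_IF_OWNER)
        is_leader = False
        while True:
            if not is_leader:
                # SET NX EX is atomic: only one instance can create the key.
                is_leader = await client.set(lock_key, instance_id, nx=True, ex=ttl) is not None
            else:
                # Extend only if we still own the key, in a single server-side step.
                is_leader = await extend(keys=[lock_key], args=[instance_id, ttl]) == 1
            # Renew at half the TTL, with jitter to de-synchronise the instances.
            await asyncio.sleep(ttl / 2 + random.uniform(0, 2))

The script matters because a separate GET followed by EXPIRE could extend a lock that another instance acquired between the two calls.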
@@ -189,9 +216,16 @@ class BaseAlertService:
         """Maintain database connection for listeners"""
         try:
             while not conn.is_closed():
-                await asyncio.sleep(30)  # Check every 30 seconds
+                # Use a timeout to avoid hanging indefinitely
                 try:
-                    await conn.fetchval("SELECT 1")
+                    await asyncio.wait_for(
+                        conn.fetchval("SELECT 1"),
+                        timeout=5.0
+                    )
+                    await asyncio.sleep(30)  # Check every 30 seconds
+                except asyncio.TimeoutError:
+                    logger.warning("DB ping timed out, connection may be dead", service=self.config.SERVICE_NAME)
+                    break
                 except Exception as e:
                     logger.error("DB listener connection lost", service=self.config.SERVICE_NAME, error=str(e))
                     break
@@ -226,12 +260,22 @@ class BaseAlertService:
             # Determine routing key based on severity and type
             routing_key = get_routing_key(item_type, item['severity'], self.config.SERVICE_NAME)
 
-            # Publish to RabbitMQ
-            success = await self.rabbitmq_client.publish_event(
-                exchange_name=self.exchange,
-                routing_key=routing_key,
-                event_data=item
-            )
+            # Publish to RabbitMQ with timeout to prevent blocking
+            try:
+                success = await asyncio.wait_for(
+                    self.rabbitmq_client.publish_event(
+                        exchange_name=self.exchange,
+                        routing_key=routing_key,
+                        event_data=item
+                    ),
+                    timeout=10.0  # 10 second timeout
+                )
+            except asyncio.TimeoutError:
+                logger.error("RabbitMQ publish timed out",
+                             service=self.config.SERVICE_NAME,
+                             item_type=item_type,
+                             alert_type=item['type'])
+                return False
 
             if success:
                 self._items_published += 1
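Both this hunk and the listener keepalive above lean on the same guard: wrap an await that might hang in asyncio.wait_for and treat a TimeoutError as a failed attempt instead of letting the coroutine block indefinitely. A minimal sketch, where publish stands in for any awaitable-returning callable such as the RabbitMQ publish:

    import asyncio

    async def publish_with_timeout(publish, timeout: float = 10.0) -> bool:
        try:
            # wait_for cancels the wrapped coroutine once the deadline passes,
            # so a stuck broker or database call cannot stall the caller forever.
            return await asyncio.wait_for(publish(), timeout=timeout)
        except asyncio.TimeoutError:
            return False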
@@ -246,6 +246,33 @@ class DatabaseManager:
     def __repr__(self) -> str:
         return f"DatabaseManager(service='{self.service_name}', type='{self._get_database_type()}')"
 
+    async def execute(self, query: str, *args, **kwargs):
+        """
+        Execute a raw SQL query with proper session management
+        Note: Use this method carefully to avoid transaction conflicts
+        """
+        from sqlalchemy import text
+
+        # Use a new session context to avoid conflicts with existing sessions
+        async with self.get_session() as session:
+            try:
+                # Convert query to SQLAlchemy text object if it's a string
+                if isinstance(query, str):
+                    query = text(query)
+
+                result = await session.execute(query, *args, **kwargs)
+                # For UPDATE/DELETE operations that need to be committed
+                if query.text.strip().upper().startswith(('UPDATE', 'DELETE', 'INSERT')):
+                    await session.commit()
+
+                return result
+            except Exception as e:
+                # Only rollback if it was a modifying operation
+                if isinstance(query, str) and query.strip().upper().startswith(('UPDATE', 'DELETE', 'INSERT')):
+                    await session.rollback()
+                logger.error("Database execute failed", error=str(e))
+                raise
+
 
 # ===== CONVENIENCE FUNCTIONS =====
 
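A hedged usage sketch for the new DatabaseManager.execute helper; the db_manager instance, the acknowledge function name, and the alert_id value are illustrative:

    async def acknowledge(db_manager, alert_id) -> None:
        # A plain string is wrapped in text() inside execute(); named binds (:id)
        # are supplied as a parameter dict, and statements starting with UPDATE,
        # DELETE or INSERT are committed automatically.
        await db_manager.execute(
            "UPDATE quality_checks SET acknowledged = true WHERE id = :id",
            {"id": alert_id},
        )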