# bakery-ia/services/alert_processor/app/jobs/priority_recalculation.py
"""
Priority Recalculation Job
Scheduled job that recalculates priority scores for active alerts,
applying time-based escalation boosts.
Runs hourly to ensure stale actions get escalated appropriately.
"""
import structlog
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
from uuid import UUID

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.events import Alert, AlertStatus
from app.services.enrichment.priority_scoring import PriorityScoringService
from shared.schemas.alert_types import UrgencyContext

logger = structlog.get_logger()
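
# Flow overview: resolve the tenants that currently have active
# action-needed alerts, then walk each tenant's alerts in fixed-size
# batches (one DB session and one commit per batch), applying a
# time-based escalation boost to each alert's priority score.
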
class PriorityRecalculationJob:
    """Recalculates alert priorities with time-based escalation."""

    def __init__(self, config, db_manager, redis_client):
        self.config = config
        self.db_manager = db_manager
        self.redis = redis_client
        self.priority_service = PriorityScoringService(config)
    async def run(self, tenant_id: Optional[UUID] = None) -> Dict[str, int]:
        """
        Recalculate priorities for all active action-needed alerts.

        Args:
            tenant_id: Optional tenant filter. If None, runs for all tenants.

        Returns:
            Dict with counts: {'processed': int, 'escalated': int, 'errors': int}
        """
        logger.info(
            "Starting priority recalculation job",
            tenant_id=str(tenant_id) if tenant_id else "all"
        )
        counts = {'processed': 0, 'escalated': 0, 'errors': 0}
        try:
            # Process alerts in fixed-size batches to bound memory use and
            # keep each transaction short enough to avoid timeouts.
            batch_size = 50

            # Resolve the tenants to process.
            tenant_ids = [tenant_id] if tenant_id else await self._get_tenant_ids()

            for current_tenant_id in tenant_ids:
                offset = 0
                while True:
                    async with self.db_manager.get_session() as session:
                        # Fetch the next batch of active alerts. Offset paging
                        # is stable here because neither the filter columns nor
                        # the action_created_at ordering are changed by the
                        # score updates this job makes.
                        alerts_batch = await self._get_active_alerts_batch(
                            session, current_tenant_id, offset, batch_size
                        )
                        if not alerts_batch:
                            break  # No more alerts to process

                        logger.info(
                            "Processing alert batch",
                            tenant_id=str(current_tenant_id),
                            batch_size=len(alerts_batch),
                            offset=offset
                        )
                        for alert in alerts_batch:
                            try:
                                result = await self._recalculate_alert_priority(session, alert)
                                counts['processed'] += 1
                                if result['escalated']:
                                    counts['escalated'] += 1
                            except Exception as e:
                                logger.error(
                                    "Error recalculating alert priority",
                                    alert_id=str(alert.id),
                                    error=str(e)
                                )
                                counts['errors'] += 1

                        # Commit this batch before moving on.
                        await session.commit()

                    offset += batch_size

                    # Log progress every 10 batches.
                    if offset % (batch_size * 10) == 0:
                        logger.info(
                            "Priority recalculation progress update",
                            tenant_id=str(current_tenant_id),
                            processed=counts['processed'],
                            escalated=counts['escalated'],
                            errors=counts['errors']
                        )

                logger.info(
                    "Tenant priority recalculation completed",
                    tenant_id=str(current_tenant_id),
                    processed=counts['processed'],
                    escalated=counts['escalated'],
                    errors=counts['errors']
                )

            logger.info("Priority recalculation completed for all tenants", **counts)
        except Exception as e:
            logger.error("Priority recalculation job failed", error=str(e))
            counts['errors'] += 1
        return counts
    async def _get_active_alerts(
        self,
        session: AsyncSession,
        tenant_id: Optional[UUID] = None
    ) -> List[Alert]:
        """
        Get all active alerts that need priority recalculation
        (unbatched variant of _get_active_alerts_batch).

        Filters:
        - Status: active
        - Type class: action_needed (only these can escalate)
        - Has action_created_at set
        """
        stmt = select(Alert).where(
            Alert.status == AlertStatus.ACTIVE,
            Alert.type_class == 'action_needed',
            Alert.action_created_at.isnot(None),
            Alert.hidden_from_ui.is_(False)
        )
        if tenant_id:
            stmt = stmt.where(Alert.tenant_id == tenant_id)
        # Order by oldest first (most likely to need escalation)
        stmt = stmt.order_by(Alert.action_created_at.asc())
        result = await session.execute(stmt)
        return list(result.scalars().all())
    async def _get_tenant_ids(self) -> List[UUID]:
        """Get all unique tenant IDs that have active alerts needing recalculation."""
        async with self.db_manager.get_session() as session:
            stmt = select(Alert.tenant_id).distinct().where(
                Alert.status == AlertStatus.ACTIVE,
                Alert.type_class == 'action_needed',
                Alert.action_created_at.isnot(None),
                Alert.hidden_from_ui.is_(False)
            )
            result = await session.execute(stmt)
            return list(result.scalars().all())
    async def _get_active_alerts_batch(
        self,
        session: AsyncSession,
        tenant_id: UUID,
        offset: int,
        limit: int
    ) -> List[Alert]:
        """
        Get one batch of active alerts that need priority recalculation.

        Filters:
        - Status: active
        - Type class: action_needed (only these can escalate)
        - Has action_created_at set
        """
        stmt = select(Alert).where(
            Alert.status == AlertStatus.ACTIVE,
            Alert.type_class == 'action_needed',
            Alert.action_created_at.isnot(None),
            Alert.hidden_from_ui.is_(False)
        )
        if tenant_id:
            stmt = stmt.where(Alert.tenant_id == tenant_id)
        # Order by oldest first (most likely to need escalation),
        # then page with offset/limit.
        stmt = stmt.order_by(Alert.action_created_at.asc()).offset(offset).limit(limit)
        result = await session.execute(stmt)
        return list(result.scalars().all())
    async def _recalculate_alert_priority(
        self,
        session: AsyncSession,
        alert: Alert
    ) -> Dict[str, Any]:
        """
        Recalculate priority for a single alert with escalation boost.

        Returns:
            Dict with 'old_score', 'new_score', 'escalated' (bool)
        """
        old_score = alert.priority_score

        # Build urgency context from alert metadata
        urgency_context = None
        if alert.urgency_context:
            urgency_context = UrgencyContext(**alert.urgency_context)

        # Calculate the time-based escalation boost
        boost = self.priority_service.calculate_escalation_boost(
            action_created_at=alert.action_created_at,
            urgency_context=urgency_context,
            current_priority=old_score
        )

        # Apply boost, capped at 100 (e.g. old_score=62, boost=15 -> 77)
        new_score = min(100, old_score + boost)

        # Update only if the score actually changed
        if new_score != old_score:
            old_level = self.priority_service.get_priority_level(old_score)
            new_level = self.priority_service.get_priority_level(new_score)
            alert.priority_score = new_score
            alert.priority_level = new_level
            alert.updated_at = datetime.now(timezone.utc)

            # Record escalation metadata. Reassign the whole dict rather than
            # mutating it in place so SQLAlchemy detects the change even if
            # the column is a plain (non-mutable-tracked) JSON type.
            alert.alert_metadata = {
                **(alert.alert_metadata or {}),
                'escalation': {
                    'original_score': old_score,
                    'boost_applied': boost,
                    'escalated_at': datetime.now(timezone.utc).isoformat(),
                    'reason': 'time_based_escalation'
                }
            }

            # Invalidate the cached copy of this alert
            cache_key = f"alert:{alert.tenant_id}:{alert.id}"
            await self.redis.delete(cache_key)

            logger.info(
                "Alert priority escalated",
                alert_id=str(alert.id),
                old_score=old_score,
                new_score=new_score,
                boost=boost,
                old_level=old_level,
                new_level=new_level
            )
            return {
                'old_score': old_score,
                'new_score': new_score,
                'escalated': True
            }

        return {
            'old_score': old_score,
            'new_score': new_score,
            'escalated': False
        }
    async def run_for_all_tenants(self) -> Dict[str, Dict[str, int]]:
        """
        Run recalculation for all tenants.

        Returns:
            Dict mapping tenant_id to counts
        """
        logger.info("Running priority recalculation for all tenants")
        all_results = {}
        try:
            # Resolve the unique tenant IDs that have active alerts
            tenant_ids = await self._get_tenant_ids()
            logger.info("Found tenants with active alerts", tenant_count=len(tenant_ids))

            for tenant_id in tenant_ids:
                try:
                    counts = await self.run(tenant_id)
                    all_results[str(tenant_id)] = counts
                except Exception as e:
                    logger.error(
                        "Error processing tenant",
                        tenant_id=str(tenant_id),
                        error=str(e)
                    )

            total_processed = sum(r['processed'] for r in all_results.values())
            total_escalated = sum(r['escalated'] for r in all_results.values())
            total_errors = sum(r['errors'] for r in all_results.values())
            logger.info(
                "All tenants processed",
                tenants=len(all_results),
                total_processed=total_processed,
                total_escalated=total_escalated,
                total_errors=total_errors
            )
        except Exception as e:
            logger.error("Failed to run for all tenants", error=str(e))
        return all_results

async def run_priority_recalculation_job(config, db_manager, redis_client):
    """
    Main entry point for the scheduled job.

    This is called by the scheduler (cron/celery/etc).
    """
    job = PriorityRecalculationJob(config, db_manager, redis_client)
    return await job.run_for_all_tenants()
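

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). How `config`, `db_manager`, and
# `redis_client` are constructed is application-specific; the
# `build_app_dependencies` import below is a hypothetical stand-in for this
# service's real bootstrap wiring, as is running the module directly. In
# production, the scheduler (cron/celery/etc.) calls
# run_priority_recalculation_job instead.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio

    async def _main():
        # Hypothetical factory -- replace with the service's actual bootstrap.
        from app.core.bootstrap import build_app_dependencies  # hypothetical
        config, db_manager, redis_client = await build_app_dependencies()
        results = await run_priority_recalculation_job(config, db_manager, redis_client)
        logger.info("One-shot priority recalculation finished", results=results)

    asyncio.run(_main())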