338 lines
11 KiB
Python
338 lines
11 KiB
Python
"""
|
|
Priority Recalculation Job
|
|
|
|
Scheduled job that recalculates priority scores for active alerts,
|
|
applying time-based escalation boosts.
|
|
|
|
Runs hourly to ensure stale actions get escalated appropriately.
|
|
"""
|
|
|
|
import structlog
|
|
from datetime import datetime, timedelta, timezone
|
|
from typing import Dict, List
|
|
from uuid import UUID
|
|
|
|
from sqlalchemy import select, update
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.models.events import Alert, AlertStatus
|
|
from app.services.enrichment.priority_scoring import PriorityScoringService
|
|
from shared.schemas.alert_types import UrgencyContext
|
|
|
|
logger = structlog.get_logger()
|
|
|
|
|
|
class PriorityRecalculationJob:
|
|
"""Recalculates alert priorities with time-based escalation"""
|
|
|
|
def __init__(self, config, db_manager, redis_client):
|
|
self.config = config
|
|
self.db_manager = db_manager
|
|
self.redis = redis_client
|
|
self.priority_service = PriorityScoringService(config)
|
|
|
|
async def run(self, tenant_id: UUID = None) -> Dict[str, int]:
|
|
"""
|
|
Recalculate priorities for all active action-needed alerts.
|
|
|
|
Args:
|
|
tenant_id: Optional tenant filter. If None, runs for all tenants.
|
|
|
|
Returns:
|
|
Dict with counts: {
|
|
'processed': int,
|
|
'escalated': int,
|
|
'errors': int
|
|
}
|
|
"""
|
|
logger.info("Starting priority recalculation job", tenant_id=str(tenant_id) if tenant_id else "all")
|
|
|
|
counts = {
|
|
'processed': 0,
|
|
'escalated': 0,
|
|
'errors': 0
|
|
}
|
|
|
|
try:
|
|
# Process alerts in batches to avoid memory issues and timeouts
|
|
batch_size = 50 # Process 50 alerts at a time to prevent timeouts
|
|
|
|
# Get tenant IDs to process
|
|
tenant_ids = [tenant_id] if tenant_id else await self._get_tenant_ids()
|
|
|
|
for current_tenant_id in tenant_ids:
|
|
offset = 0
|
|
while True:
|
|
async with self.db_manager.get_session() as session:
|
|
# Get a batch of active alerts
|
|
alerts_batch = await self._get_active_alerts_batch(session, current_tenant_id, offset, batch_size)
|
|
|
|
if not alerts_batch:
|
|
break # No more alerts to process
|
|
|
|
logger.info(f"Processing batch of {len(alerts_batch)} alerts for tenant {current_tenant_id}, offset {offset}")
|
|
|
|
for alert in alerts_batch:
|
|
try:
|
|
result = await self._recalculate_alert_priority(session, alert)
|
|
counts['processed'] += 1
|
|
if result['escalated']:
|
|
counts['escalated'] += 1
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
"Error recalculating alert priority",
|
|
alert_id=str(alert.id),
|
|
error=str(e)
|
|
)
|
|
counts['errors'] += 1
|
|
|
|
# Commit this batch
|
|
await session.commit()
|
|
|
|
# Update offset for next batch
|
|
offset += batch_size
|
|
|
|
# Log progress periodically
|
|
if offset % (batch_size * 10) == 0: # Every 10 batches
|
|
logger.info(
|
|
"Priority recalculation progress update",
|
|
tenant_id=str(current_tenant_id),
|
|
processed=counts['processed'],
|
|
escalated=counts['escalated'],
|
|
errors=counts['errors']
|
|
)
|
|
|
|
logger.info(
|
|
"Tenant priority recalculation completed",
|
|
tenant_id=str(current_tenant_id),
|
|
processed=counts['processed'],
|
|
escalated=counts['escalated'],
|
|
errors=counts['errors']
|
|
)
|
|
|
|
logger.info(
|
|
"Priority recalculation completed for all tenants",
|
|
**counts
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
"Priority recalculation job failed",
|
|
error=str(e)
|
|
)
|
|
counts['errors'] += 1
|
|
|
|
return counts
|
|
|
|
async def _get_active_alerts(
|
|
self,
|
|
session: AsyncSession,
|
|
tenant_id: UUID = None
|
|
) -> List[Alert]:
|
|
"""
|
|
Get all active alerts that need priority recalculation.
|
|
|
|
Filters:
|
|
- Status: active
|
|
- Type class: action_needed (only these can escalate)
|
|
- Has action_created_at set
|
|
"""
|
|
stmt = select(Alert).where(
|
|
Alert.status == AlertStatus.ACTIVE,
|
|
Alert.type_class == 'action_needed',
|
|
Alert.action_created_at.isnot(None),
|
|
Alert.hidden_from_ui == False
|
|
)
|
|
|
|
if tenant_id:
|
|
stmt = stmt.where(Alert.tenant_id == tenant_id)
|
|
|
|
# Order by oldest first (most likely to need escalation)
|
|
stmt = stmt.order_by(Alert.action_created_at.asc())
|
|
|
|
result = await session.execute(stmt)
|
|
return result.scalars().all()
|
|
|
|
async def _get_tenant_ids(self) -> List[UUID]:
|
|
"""
|
|
Get all unique tenant IDs that have active alerts that need recalculation.
|
|
"""
|
|
async with self.db_manager.get_session() as session:
|
|
# Get unique tenant IDs with active alerts
|
|
stmt = select(Alert.tenant_id).distinct().where(
|
|
Alert.status == AlertStatus.ACTIVE,
|
|
Alert.type_class == 'action_needed',
|
|
Alert.action_created_at.isnot(None),
|
|
Alert.hidden_from_ui == False
|
|
)
|
|
|
|
result = await session.execute(stmt)
|
|
tenant_ids = result.scalars().all()
|
|
return tenant_ids
|
|
|
|
async def _get_active_alerts_batch(
|
|
self,
|
|
session: AsyncSession,
|
|
tenant_id: UUID,
|
|
offset: int,
|
|
limit: int
|
|
) -> List[Alert]:
|
|
"""
|
|
Get a batch of active alerts that need priority recalculation.
|
|
|
|
Filters:
|
|
- Status: active
|
|
- Type class: action_needed (only these can escalate)
|
|
- Has action_created_at set
|
|
"""
|
|
stmt = select(Alert).where(
|
|
Alert.status == AlertStatus.ACTIVE,
|
|
Alert.type_class == 'action_needed',
|
|
Alert.action_created_at.isnot(None),
|
|
Alert.hidden_from_ui == False
|
|
)
|
|
|
|
if tenant_id:
|
|
stmt = stmt.where(Alert.tenant_id == tenant_id)
|
|
|
|
# Order by oldest first (most likely to need escalation)
|
|
stmt = stmt.order_by(Alert.action_created_at.asc())
|
|
|
|
# Apply offset and limit for batching
|
|
stmt = stmt.offset(offset).limit(limit)
|
|
|
|
result = await session.execute(stmt)
|
|
return result.scalars().all()
|
|
|
|
async def _recalculate_alert_priority(
|
|
self,
|
|
session: AsyncSession,
|
|
alert: Alert
|
|
) -> Dict[str, any]:
|
|
"""
|
|
Recalculate priority for a single alert with escalation boost.
|
|
|
|
Returns:
|
|
Dict with 'old_score', 'new_score', 'escalated' (bool)
|
|
"""
|
|
old_score = alert.priority_score
|
|
|
|
# Build urgency context from alert metadata
|
|
urgency_context = None
|
|
if alert.urgency_context:
|
|
urgency_context = UrgencyContext(**alert.urgency_context)
|
|
|
|
# Calculate escalation boost
|
|
boost = self.priority_service.calculate_escalation_boost(
|
|
action_created_at=alert.action_created_at,
|
|
urgency_context=urgency_context,
|
|
current_priority=old_score
|
|
)
|
|
|
|
# Apply boost
|
|
new_score = min(100, old_score + boost)
|
|
|
|
# Update if score changed
|
|
if new_score != old_score:
|
|
# Update priority score and level
|
|
new_level = self.priority_service.get_priority_level(new_score)
|
|
|
|
alert.priority_score = new_score
|
|
alert.priority_level = new_level
|
|
alert.updated_at = datetime.now(timezone.utc)
|
|
|
|
# Add escalation metadata
|
|
if not alert.alert_metadata:
|
|
alert.alert_metadata = {}
|
|
|
|
alert.alert_metadata['escalation'] = {
|
|
'original_score': old_score,
|
|
'boost_applied': boost,
|
|
'escalated_at': datetime.now(timezone.utc).isoformat(),
|
|
'reason': 'time_based_escalation'
|
|
}
|
|
|
|
# Invalidate cache
|
|
cache_key = f"alert:{alert.tenant_id}:{alert.id}"
|
|
await self.redis.delete(cache_key)
|
|
|
|
logger.info(
|
|
"Alert priority escalated",
|
|
alert_id=str(alert.id),
|
|
old_score=old_score,
|
|
new_score=new_score,
|
|
boost=boost,
|
|
old_level=alert.priority_level if old_score == new_score else self.priority_service.get_priority_level(old_score),
|
|
new_level=new_level
|
|
)
|
|
|
|
return {
|
|
'old_score': old_score,
|
|
'new_score': new_score,
|
|
'escalated': True
|
|
}
|
|
|
|
return {
|
|
'old_score': old_score,
|
|
'new_score': new_score,
|
|
'escalated': False
|
|
}
|
|
|
|
async def run_for_all_tenants(self) -> Dict[str, Dict[str, int]]:
|
|
"""
|
|
Run recalculation for all tenants.
|
|
|
|
Returns:
|
|
Dict mapping tenant_id to counts
|
|
"""
|
|
logger.info("Running priority recalculation for all tenants")
|
|
|
|
all_results = {}
|
|
|
|
try:
|
|
# Get unique tenant IDs with active alerts using the new efficient method
|
|
tenant_ids = await self._get_tenant_ids()
|
|
logger.info(f"Found {len(tenant_ids)} tenants with active alerts")
|
|
|
|
for tenant_id in tenant_ids:
|
|
try:
|
|
counts = await self.run(tenant_id)
|
|
all_results[str(tenant_id)] = counts
|
|
except Exception as e:
|
|
logger.error(
|
|
"Error processing tenant",
|
|
tenant_id=str(tenant_id),
|
|
error=str(e)
|
|
)
|
|
|
|
total_processed = sum(r['processed'] for r in all_results.values())
|
|
total_escalated = sum(r['escalated'] for r in all_results.values())
|
|
total_errors = sum(r['errors'] for r in all_results.values())
|
|
|
|
logger.info(
|
|
"All tenants processed",
|
|
tenants=len(all_results),
|
|
total_processed=total_processed,
|
|
total_escalated=total_escalated,
|
|
total_errors=total_errors
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(
|
|
"Failed to run for all tenants",
|
|
error=str(e)
|
|
)
|
|
|
|
return all_results
|
|
|
|
|
|
async def run_priority_recalculation_job(config, db_manager, redis_client):
|
|
"""
|
|
Main entry point for scheduled job.
|
|
|
|
This is called by the scheduler (cron/celery/etc).
|
|
"""
|
|
job = PriorityRecalculationJob(config, db_manager, redis_client)
|
|
return await job.run_for_all_tenants()
|