New alert system and panel de control page
This commit is contained in:
12
services/alert_processor/app/jobs/__init__.py
Normal file
12
services/alert_processor/app/jobs/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""
Scheduled Jobs Package

Contains background jobs for the alert processor service.
"""

from .priority_recalculation import PriorityRecalculationJob, run_priority_recalculation_job

# Explicit public API of the jobs package.
__all__ = [
    "PriorityRecalculationJob",
    "run_priority_recalculation_job",
]
44
services/alert_processor/app/jobs/__main__.py
Normal file
44
services/alert_processor/app/jobs/__main__.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""
Main entry point for alert processor jobs when run as modules.

This file makes the jobs package executable as a module:
`python -m app.jobs.priority_recalculation`
"""

import asyncio
import sys
import os  # NOTE(review): appears unused in this module — confirm before removing
from pathlib import Path

# Add the app directory to Python path so the `app.*` and `shared.*` imports
# below resolve when this file is executed directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "shared"))

# These imports intentionally come AFTER the sys.path manipulation above.
from app.jobs.priority_recalculation import run_priority_recalculation_job
from app.config import AlertProcessorConfig
from shared.database.base import create_database_manager
from app.core.cache import get_redis_client
||||
async def main():
    """Main entry point for the priority recalculation job."""
    # Wire up the service dependencies the job needs.
    cfg = AlertProcessorConfig()
    database = create_database_manager(cfg.DATABASE_URL, "alert-processor")
    cache = await get_redis_client()

    try:
        # Run the priority recalculation job across all tenants.
        outcome = await run_priority_recalculation_job(
            config=cfg,
            db_manager=database,
            redis_client=cache,
        )
    except Exception as exc:
        print(f"Error running priority recalculation job: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"Priority recalculation completed: {outcome}")
||||
|
||||
|
||||
# Run the job when this file is executed as a script or module.
if __name__ == "__main__":
    asyncio.run(main())
337
services/alert_processor/app/jobs/priority_recalculation.py
Normal file
337
services/alert_processor/app/jobs/priority_recalculation.py
Normal file
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Priority Recalculation Job
|
||||
|
||||
Scheduled job that recalculates priority scores for active alerts,
|
||||
applying time-based escalation boosts.
|
||||
|
||||
Runs hourly to ensure stale actions get escalated appropriately.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
from uuid import UUID

import structlog
from sqlalchemy import select, update
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.events import Alert, AlertStatus
from app.services.enrichment.priority_scoring import PriorityScoringService
from shared.schemas.alert_types import UrgencyContext
|
||||
# Module-level structlog logger shared by the job class below.
logger = structlog.get_logger()
||||
class PriorityRecalculationJob:
    """Recalculates alert priorities with time-based escalation.

    Scans active ``action_needed`` alerts per tenant in fixed-size batches,
    asks :class:`PriorityScoringService` for a time-based escalation boost,
    and persists any score/level change, invalidating the Redis cache entry
    for each escalated alert.
    """

    def __init__(self, config, db_manager, redis_client):
        """
        Args:
            config: Service configuration, forwarded to PriorityScoringService.
            db_manager: Database manager exposing an async ``get_session()``
                context manager.
            redis_client: Async Redis client used to invalidate alert cache
                entries after escalation.
        """
        self.config = config
        self.db_manager = db_manager
        self.redis = redis_client
        self.priority_service = PriorityScoringService(config)

    async def run(self, tenant_id: Optional[UUID] = None) -> Dict[str, int]:
        """
        Recalculate priorities for all active action-needed alerts.

        Args:
            tenant_id: Optional tenant filter. If None, runs for all tenants.

        Returns:
            Dict with counts: {'processed': int, 'escalated': int, 'errors': int}
        """
        logger.info(
            "Starting priority recalculation job",
            tenant_id=str(tenant_id) if tenant_id else "all",
        )

        counts = {'processed': 0, 'escalated': 0, 'errors': 0}

        try:
            # Process alerts in batches to avoid memory issues and timeouts.
            batch_size = 50

            # Resolve the set of tenants to process.
            tenant_ids = [tenant_id] if tenant_id else await self._get_tenant_ids()

            for current_tenant_id in tenant_ids:
                offset = 0
                while True:
                    async with self.db_manager.get_session() as session:
                        # Offset pagination is stable here: the escalation
                        # update changes neither the filter columns nor the
                        # sort key (action_created_at).
                        alerts_batch = await self._get_active_alerts_batch(
                            session, current_tenant_id, offset, batch_size
                        )

                        if not alerts_batch:
                            break  # No more alerts to process

                        logger.info(f"Processing batch of {len(alerts_batch)} alerts for tenant {current_tenant_id}, offset {offset}")

                        for alert in alerts_batch:
                            try:
                                result = await self._recalculate_alert_priority(session, alert)
                                counts['processed'] += 1
                                if result['escalated']:
                                    counts['escalated'] += 1
                            except Exception as e:
                                # One bad alert must not abort the whole batch.
                                logger.error(
                                    "Error recalculating alert priority",
                                    alert_id=str(alert.id),
                                    error=str(e)
                                )
                                counts['errors'] += 1

                        # Commit this batch before moving on to the next one.
                        await session.commit()

                    offset += batch_size

                    # Log progress every 10 batches.
                    if offset % (batch_size * 10) == 0:
                        logger.info(
                            "Priority recalculation progress update",
                            tenant_id=str(current_tenant_id),
                            processed=counts['processed'],
                            escalated=counts['escalated'],
                            errors=counts['errors']
                        )

                # NOTE: counts are cumulative across all tenants processed so
                # far in this run, not per-tenant.
                logger.info(
                    "Tenant priority recalculation completed",
                    tenant_id=str(current_tenant_id),
                    processed=counts['processed'],
                    escalated=counts['escalated'],
                    errors=counts['errors']
                )

            logger.info(
                "Priority recalculation completed for all tenants",
                **counts
            )

        except Exception as e:
            logger.error(
                "Priority recalculation job failed",
                error=str(e)
            )
            counts['errors'] += 1

        return counts

    def _active_alerts_stmt(self, tenant_id: Optional[UUID]):
        """Base SELECT for escalatable alerts, oldest action first.

        Shared by ``_get_active_alerts`` and ``_get_active_alerts_batch`` so
        the filter is defined in exactly one place.

        Filters:
        - Status: active
        - Type class: action_needed (only these can escalate)
        - Has action_created_at set
        - Not hidden from the UI
        """
        stmt = select(Alert).where(
            Alert.status == AlertStatus.ACTIVE,
            Alert.type_class == 'action_needed',
            Alert.action_created_at.isnot(None),
            Alert.hidden_from_ui == False  # noqa: E712 — SQLAlchemy column comparison
        )

        if tenant_id:
            stmt = stmt.where(Alert.tenant_id == tenant_id)

        # Order by oldest first (most likely to need escalation)
        return stmt.order_by(Alert.action_created_at.asc())

    async def _get_active_alerts(
        self,
        session: AsyncSession,
        tenant_id: Optional[UUID] = None
    ) -> List[Alert]:
        """
        Get all active alerts that need priority recalculation.

        NOTE(review): not called by ``run()``, which paginates via
        ``_get_active_alerts_batch`` instead; kept for external callers —
        confirm before removing.
        """
        result = await session.execute(self._active_alerts_stmt(tenant_id))
        return result.scalars().all()

    async def _get_tenant_ids(self) -> List[UUID]:
        """
        Get all unique tenant IDs that have active alerts needing recalculation.
        """
        async with self.db_manager.get_session() as session:
            # Same filter as _active_alerts_stmt, but projecting only the
            # distinct tenant_id column.
            stmt = select(Alert.tenant_id).distinct().where(
                Alert.status == AlertStatus.ACTIVE,
                Alert.type_class == 'action_needed',
                Alert.action_created_at.isnot(None),
                Alert.hidden_from_ui == False  # noqa: E712
            )

            result = await session.execute(stmt)
            return result.scalars().all()

    async def _get_active_alerts_batch(
        self,
        session: AsyncSession,
        tenant_id: UUID,
        offset: int,
        limit: int
    ) -> List[Alert]:
        """
        Get one page of active alerts that need priority recalculation.

        Args:
            session: Open async DB session.
            tenant_id: Tenant to scan.
            offset: Row offset of the page.
            limit: Maximum rows in the page.
        """
        stmt = self._active_alerts_stmt(tenant_id).offset(offset).limit(limit)

        result = await session.execute(stmt)
        return result.scalars().all()

    async def _recalculate_alert_priority(
        self,
        session: AsyncSession,
        alert: Alert
    ) -> Dict[str, Any]:
        """
        Recalculate priority for a single alert with escalation boost.

        Returns:
            Dict with 'old_score', 'new_score', 'escalated' (bool)
        """
        # Defensive default: a NULL priority_score would otherwise crash the
        # arithmetic below; treat it as 0 so the alert still gets scored.
        old_score = alert.priority_score if alert.priority_score is not None else 0

        # Build urgency context from alert metadata
        urgency_context = None
        if alert.urgency_context:
            urgency_context = UrgencyContext(**alert.urgency_context)

        # Calculate escalation boost
        boost = self.priority_service.calculate_escalation_boost(
            action_created_at=alert.action_created_at,
            urgency_context=urgency_context,
            current_priority=old_score
        )

        # Apply boost, capping the score at 100.
        new_score = min(100, old_score + boost)

        # Update only if the score actually changed.
        if new_score != old_score:
            new_level = self.priority_service.get_priority_level(new_score)

            alert.priority_score = new_score
            alert.priority_level = new_level
            alert.updated_at = datetime.now(timezone.utc)

            # Reassign the JSON column (rather than mutating the dict in
            # place) so SQLAlchemy's change tracking flushes the update even
            # without a MutableDict-wrapped column type.
            escalation = {
                'original_score': old_score,
                'boost_applied': boost,
                'escalated_at': datetime.now(timezone.utc).isoformat(),
                'reason': 'time_based_escalation'
            }
            alert.alert_metadata = {**(alert.alert_metadata or {}), 'escalation': escalation}

            # Invalidate cache so readers pick up the new priority.
            cache_key = f"alert:{alert.tenant_id}:{alert.id}"
            await self.redis.delete(cache_key)

            logger.info(
                "Alert priority escalated",
                alert_id=str(alert.id),
                old_score=old_score,
                new_score=new_score,
                boost=boost,
                # alert.priority_level was already overwritten above, so the
                # previous level must be derived from the old score.
                old_level=self.priority_service.get_priority_level(old_score),
                new_level=new_level
            )

            return {
                'old_score': old_score,
                'new_score': new_score,
                'escalated': True
            }

        return {
            'old_score': old_score,
            'new_score': new_score,
            'escalated': False
        }

    async def run_for_all_tenants(self) -> Dict[str, Dict[str, int]]:
        """
        Run recalculation for all tenants.

        Returns:
            Dict mapping tenant_id (as str) to that tenant's counts. Tenants
            whose run raised are logged and omitted from the result.
        """
        logger.info("Running priority recalculation for all tenants")

        all_results: Dict[str, Dict[str, int]] = {}

        try:
            # Get unique tenant IDs with active alerts.
            tenant_ids = await self._get_tenant_ids()
            logger.info(f"Found {len(tenant_ids)} tenants with active alerts")

            for tenant_id in tenant_ids:
                try:
                    all_results[str(tenant_id)] = await self.run(tenant_id)
                except Exception as e:
                    logger.error(
                        "Error processing tenant",
                        tenant_id=str(tenant_id),
                        error=str(e)
                    )

            logger.info(
                "All tenants processed",
                tenants=len(all_results),
                total_processed=sum(r['processed'] for r in all_results.values()),
                total_escalated=sum(r['escalated'] for r in all_results.values()),
                total_errors=sum(r['errors'] for r in all_results.values())
            )

        except Exception as e:
            logger.error(
                "Failed to run for all tenants",
                error=str(e)
            )

        return all_results
|
||||
|
||||
async def run_priority_recalculation_job(config, db_manager, redis_client):
    """
    Main entry point for scheduled job.

    This is called by the scheduler (cron/celery/etc).
    """
    # Build a job instance and sweep every tenant in one call.
    return await PriorityRecalculationJob(config, db_manager, redis_client).run_for_all_tenants()
Reference in New Issue
Block a user