New alert system and panel de control page
This commit is contained in:
12
services/alert_processor/app/jobs/__init__.py
Normal file
12
services/alert_processor/app/jobs/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""
Scheduled Jobs Package

Contains background jobs for the alert processor service.
"""

from .priority_recalculation import PriorityRecalculationJob, run_priority_recalculation_job

# Explicit public API of the jobs package.
__all__ = [
    "PriorityRecalculationJob",
    "run_priority_recalculation_job",
]
44
services/alert_processor/app/jobs/__main__.py
Normal file
44
services/alert_processor/app/jobs/__main__.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""
Main entry point for alert processor jobs when run as modules.

This file makes the jobs package executable as a module:
`python -m app.jobs.priority_recalculation`
"""

import asyncio
import sys
import os  # NOTE(review): appears unused in this module — confirm before removing
from pathlib import Path

# Add the app directory to Python path so the `app.*` and `shared.*` imports
# below resolve when this file is executed directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "shared"))

# These imports intentionally come AFTER the sys.path manipulation above.
from app.jobs.priority_recalculation import run_priority_recalculation_job
from app.config import AlertProcessorConfig
from shared.database.base import create_database_manager
from app.core.cache import get_redis_client
||||
async def main():
    """Main entry point for the priority recalculation job."""
    # Wire up the service dependencies the job needs.
    cfg = AlertProcessorConfig()
    database = create_database_manager(cfg.DATABASE_URL, "alert-processor")
    cache = await get_redis_client()

    try:
        # Run the priority recalculation job across all tenants.
        outcome = await run_priority_recalculation_job(
            config=cfg,
            db_manager=database,
            redis_client=cache,
        )
    except Exception as exc:
        print(f"Error running priority recalculation job: {exc}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"Priority recalculation completed: {outcome}")
||||
|
||||
|
||||
# Run the job when this file is executed as a script or module.
if __name__ == "__main__":
    asyncio.run(main())
337
services/alert_processor/app/jobs/priority_recalculation.py
Normal file
337
services/alert_processor/app/jobs/priority_recalculation.py
Normal file
@@ -0,0 +1,337 @@
|
||||
"""
|
||||
Priority Recalculation Job
|
||||
|
||||
Scheduled job that recalculates priority scores for active alerts,
|
||||
applying time-based escalation boosts.
|
||||
|
||||
Runs hourly to ensure stale actions get escalated appropriately.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional
from uuid import UUID

import structlog
from sqlalchemy import select, update
from sqlalchemy.ext.asyncio import AsyncSession

from app.models.events import Alert, AlertStatus
from app.services.enrichment.priority_scoring import PriorityScoringService
from shared.schemas.alert_types import UrgencyContext
|
||||
# Module-level structlog logger shared by the job class below.
logger = structlog.get_logger()
||||
class PriorityRecalculationJob:
    """Recalculates alert priorities with time-based escalation.

    Scans active ``action_needed`` alerts per tenant in fixed-size batches,
    asks :class:`PriorityScoringService` for a time-based escalation boost,
    and persists any score/level change, invalidating the Redis cache entry
    for each escalated alert.
    """

    def __init__(self, config, db_manager, redis_client):
        """
        Args:
            config: Service configuration, forwarded to PriorityScoringService.
            db_manager: Database manager exposing an async ``get_session()``
                context manager.
            redis_client: Async Redis client used to invalidate alert cache
                entries after escalation.
        """
        self.config = config
        self.db_manager = db_manager
        self.redis = redis_client
        self.priority_service = PriorityScoringService(config)

    async def run(self, tenant_id: Optional[UUID] = None) -> Dict[str, int]:
        """
        Recalculate priorities for all active action-needed alerts.

        Args:
            tenant_id: Optional tenant filter. If None, runs for all tenants.

        Returns:
            Dict with counts: {'processed': int, 'escalated': int, 'errors': int}
        """
        logger.info(
            "Starting priority recalculation job",
            tenant_id=str(tenant_id) if tenant_id else "all",
        )

        counts = {'processed': 0, 'escalated': 0, 'errors': 0}

        try:
            # Process alerts in batches to avoid memory issues and timeouts.
            batch_size = 50

            # Resolve the set of tenants to process.
            tenant_ids = [tenant_id] if tenant_id else await self._get_tenant_ids()

            for current_tenant_id in tenant_ids:
                offset = 0
                while True:
                    async with self.db_manager.get_session() as session:
                        # Offset pagination is stable here: the escalation
                        # update changes neither the filter columns nor the
                        # sort key (action_created_at).
                        alerts_batch = await self._get_active_alerts_batch(
                            session, current_tenant_id, offset, batch_size
                        )

                        if not alerts_batch:
                            break  # No more alerts to process

                        logger.info(f"Processing batch of {len(alerts_batch)} alerts for tenant {current_tenant_id}, offset {offset}")

                        for alert in alerts_batch:
                            try:
                                result = await self._recalculate_alert_priority(session, alert)
                                counts['processed'] += 1
                                if result['escalated']:
                                    counts['escalated'] += 1
                            except Exception as e:
                                # One bad alert must not abort the whole batch.
                                logger.error(
                                    "Error recalculating alert priority",
                                    alert_id=str(alert.id),
                                    error=str(e)
                                )
                                counts['errors'] += 1

                        # Commit this batch before moving on to the next one.
                        await session.commit()

                    offset += batch_size

                    # Log progress every 10 batches.
                    if offset % (batch_size * 10) == 0:
                        logger.info(
                            "Priority recalculation progress update",
                            tenant_id=str(current_tenant_id),
                            processed=counts['processed'],
                            escalated=counts['escalated'],
                            errors=counts['errors']
                        )

                # NOTE: counts are cumulative across all tenants processed so
                # far in this run, not per-tenant.
                logger.info(
                    "Tenant priority recalculation completed",
                    tenant_id=str(current_tenant_id),
                    processed=counts['processed'],
                    escalated=counts['escalated'],
                    errors=counts['errors']
                )

            logger.info(
                "Priority recalculation completed for all tenants",
                **counts
            )

        except Exception as e:
            logger.error(
                "Priority recalculation job failed",
                error=str(e)
            )
            counts['errors'] += 1

        return counts

    def _active_alerts_stmt(self, tenant_id: Optional[UUID]):
        """Base SELECT for escalatable alerts, oldest action first.

        Shared by ``_get_active_alerts`` and ``_get_active_alerts_batch`` so
        the filter is defined in exactly one place.

        Filters:
        - Status: active
        - Type class: action_needed (only these can escalate)
        - Has action_created_at set
        - Not hidden from the UI
        """
        stmt = select(Alert).where(
            Alert.status == AlertStatus.ACTIVE,
            Alert.type_class == 'action_needed',
            Alert.action_created_at.isnot(None),
            Alert.hidden_from_ui == False  # noqa: E712 — SQLAlchemy column comparison
        )

        if tenant_id:
            stmt = stmt.where(Alert.tenant_id == tenant_id)

        # Order by oldest first (most likely to need escalation)
        return stmt.order_by(Alert.action_created_at.asc())

    async def _get_active_alerts(
        self,
        session: AsyncSession,
        tenant_id: Optional[UUID] = None
    ) -> List[Alert]:
        """
        Get all active alerts that need priority recalculation.

        NOTE(review): not called by ``run()``, which paginates via
        ``_get_active_alerts_batch`` instead; kept for external callers —
        confirm before removing.
        """
        result = await session.execute(self._active_alerts_stmt(tenant_id))
        return result.scalars().all()

    async def _get_tenant_ids(self) -> List[UUID]:
        """
        Get all unique tenant IDs that have active alerts needing recalculation.
        """
        async with self.db_manager.get_session() as session:
            # Same filter as _active_alerts_stmt, but projecting only the
            # distinct tenant_id column.
            stmt = select(Alert.tenant_id).distinct().where(
                Alert.status == AlertStatus.ACTIVE,
                Alert.type_class == 'action_needed',
                Alert.action_created_at.isnot(None),
                Alert.hidden_from_ui == False  # noqa: E712
            )

            result = await session.execute(stmt)
            return result.scalars().all()

    async def _get_active_alerts_batch(
        self,
        session: AsyncSession,
        tenant_id: UUID,
        offset: int,
        limit: int
    ) -> List[Alert]:
        """
        Get one page of active alerts that need priority recalculation.

        Args:
            session: Open async DB session.
            tenant_id: Tenant to scan.
            offset: Row offset of the page.
            limit: Maximum rows in the page.
        """
        stmt = self._active_alerts_stmt(tenant_id).offset(offset).limit(limit)

        result = await session.execute(stmt)
        return result.scalars().all()

    async def _recalculate_alert_priority(
        self,
        session: AsyncSession,
        alert: Alert
    ) -> Dict[str, Any]:
        """
        Recalculate priority for a single alert with escalation boost.

        Returns:
            Dict with 'old_score', 'new_score', 'escalated' (bool)
        """
        # Defensive default: a NULL priority_score would otherwise crash the
        # arithmetic below; treat it as 0 so the alert still gets scored.
        old_score = alert.priority_score if alert.priority_score is not None else 0

        # Build urgency context from alert metadata
        urgency_context = None
        if alert.urgency_context:
            urgency_context = UrgencyContext(**alert.urgency_context)

        # Calculate escalation boost
        boost = self.priority_service.calculate_escalation_boost(
            action_created_at=alert.action_created_at,
            urgency_context=urgency_context,
            current_priority=old_score
        )

        # Apply boost, capping the score at 100.
        new_score = min(100, old_score + boost)

        # Update only if the score actually changed.
        if new_score != old_score:
            new_level = self.priority_service.get_priority_level(new_score)

            alert.priority_score = new_score
            alert.priority_level = new_level
            alert.updated_at = datetime.now(timezone.utc)

            # Reassign the JSON column (rather than mutating the dict in
            # place) so SQLAlchemy's change tracking flushes the update even
            # without a MutableDict-wrapped column type.
            escalation = {
                'original_score': old_score,
                'boost_applied': boost,
                'escalated_at': datetime.now(timezone.utc).isoformat(),
                'reason': 'time_based_escalation'
            }
            alert.alert_metadata = {**(alert.alert_metadata or {}), 'escalation': escalation}

            # Invalidate cache so readers pick up the new priority.
            cache_key = f"alert:{alert.tenant_id}:{alert.id}"
            await self.redis.delete(cache_key)

            logger.info(
                "Alert priority escalated",
                alert_id=str(alert.id),
                old_score=old_score,
                new_score=new_score,
                boost=boost,
                # alert.priority_level was already overwritten above, so the
                # previous level must be derived from the old score.
                old_level=self.priority_service.get_priority_level(old_score),
                new_level=new_level
            )

            return {
                'old_score': old_score,
                'new_score': new_score,
                'escalated': True
            }

        return {
            'old_score': old_score,
            'new_score': new_score,
            'escalated': False
        }

    async def run_for_all_tenants(self) -> Dict[str, Dict[str, int]]:
        """
        Run recalculation for all tenants.

        Returns:
            Dict mapping tenant_id (as str) to that tenant's counts. Tenants
            whose run raised are logged and omitted from the result.
        """
        logger.info("Running priority recalculation for all tenants")

        all_results: Dict[str, Dict[str, int]] = {}

        try:
            # Get unique tenant IDs with active alerts.
            tenant_ids = await self._get_tenant_ids()
            logger.info(f"Found {len(tenant_ids)} tenants with active alerts")

            for tenant_id in tenant_ids:
                try:
                    all_results[str(tenant_id)] = await self.run(tenant_id)
                except Exception as e:
                    logger.error(
                        "Error processing tenant",
                        tenant_id=str(tenant_id),
                        error=str(e)
                    )

            logger.info(
                "All tenants processed",
                tenants=len(all_results),
                total_processed=sum(r['processed'] for r in all_results.values()),
                total_escalated=sum(r['escalated'] for r in all_results.values()),
                total_errors=sum(r['errors'] for r in all_results.values())
            )

        except Exception as e:
            logger.error(
                "Failed to run for all tenants",
                error=str(e)
            )

        return all_results
|
||||
|
||||
async def run_priority_recalculation_job(config, db_manager, redis_client):
    """
    Main entry point for scheduled job.

    This is called by the scheduler (cron/celery/etc).
    """
    # Build a job instance and sweep every tenant in one call.
    return await PriorityRecalculationJob(config, db_manager, redis_client).run_for_all_tenants()
Reference in New Issue
Block a user