New alert system and control panel page

This commit is contained in:
Urtzi Alfaro
2025-11-27 15:52:40 +01:00
parent 1a2f4602f3
commit e902419b6e
178 changed files with 20982 additions and 6944 deletions

View File

@@ -3,7 +3,7 @@
Alerts API endpoints for dashboard and alert management
"""
from fastapi import APIRouter, HTTPException, Query, Path
from fastapi import APIRouter, HTTPException, Query, Path, Depends
from typing import List, Optional
from pydantic import BaseModel, Field
from uuid import UUID
@@ -11,7 +11,8 @@ from datetime import datetime
import structlog
from app.repositories.alerts_repository import AlertsRepository
from app.models.alerts import AlertSeverity, AlertStatus
from app.models.events import AlertStatus
from app.dependencies import get_current_user
logger = structlog.get_logger()
@@ -28,12 +29,14 @@ class AlertResponse(BaseModel):
tenant_id: str
item_type: str
alert_type: str
severity: str
priority_level: str
priority_score: int
status: str
service: str
title: str
message: str
actions: Optional[dict] = None
type_class: str
actions: Optional[List[dict]] = None # smart_actions is a list of action objects
alert_metadata: Optional[dict] = None
created_at: datetime
updated_at: datetime
@@ -47,10 +50,10 @@ class AlertsSummaryResponse(BaseModel):
"""Alerts summary for dashboard"""
total_count: int = Field(..., description="Total number of alerts")
active_count: int = Field(..., description="Number of active (unresolved) alerts")
critical_count: int = Field(..., description="Number of critical/urgent alerts")
high_count: int = Field(..., description="Number of high severity alerts")
medium_count: int = Field(..., description="Number of medium severity alerts")
low_count: int = Field(..., description="Number of low severity alerts")
critical_count: int = Field(..., description="Number of critical priority alerts")
high_count: int = Field(..., description="Number of high priority alerts")
medium_count: int = Field(..., description="Number of medium priority alerts")
low_count: int = Field(..., description="Number of low priority alerts")
resolved_count: int = Field(..., description="Number of resolved alerts")
acknowledged_count: int = Field(..., description="Number of acknowledged alerts")
@@ -71,7 +74,7 @@ class AlertsListResponse(BaseModel):
"/api/v1/tenants/{tenant_id}/alerts/summary",
response_model=AlertsSummaryResponse,
summary="Get alerts summary",
description="Get summary of alerts by severity and status for dashboard health indicator"
description="Get summary of alerts by priority level and status for dashboard health indicator"
)
async def get_alerts_summary(
tenant_id: UUID = Path(..., description="Tenant ID")
@@ -79,8 +82,8 @@ async def get_alerts_summary(
"""
Get alerts summary for dashboard
Returns counts of alerts grouped by severity and status.
Critical count maps to URGENT severity for dashboard compatibility.
Returns counts of alerts grouped by priority level and status.
Critical count maps to URGENT priority level for dashboard compatibility.
"""
from app.config import AlertProcessorConfig
from shared.database.base import create_database_manager
@@ -107,7 +110,7 @@ async def get_alerts_summary(
)
async def get_alerts(
tenant_id: UUID = Path(..., description="Tenant ID"),
severity: Optional[str] = Query(None, description="Filter by severity: low, medium, high, urgent"),
priority_level: Optional[str] = Query(None, description="Filter by priority level: critical, important, standard, info"),
status: Optional[str] = Query(None, description="Filter by status: active, resolved, acknowledged, ignored"),
resolved: Optional[bool] = Query(None, description="Filter by resolved status: true=resolved only, false=unresolved only"),
limit: int = Query(100, ge=1, le=1000, description="Maximum number of results"),
@@ -117,7 +120,7 @@ async def get_alerts(
Get filtered list of alerts
Supports filtering by:
- severity: low, medium, high, urgent (maps to "critical" in dashboard)
- priority_level: critical, important, standard, info
- status: active, resolved, acknowledged, ignored
- resolved: boolean filter for resolved status
- pagination: limit and offset
@@ -126,18 +129,20 @@ async def get_alerts(
from shared.database.base import create_database_manager
try:
# Validate severity enum
if severity and severity not in [s.value for s in AlertSeverity]:
# Validate priority_level enum
valid_priority_levels = ['critical', 'important', 'standard', 'info']
if priority_level and priority_level not in valid_priority_levels:
raise HTTPException(
status_code=400,
detail=f"Invalid severity. Must be one of: {[s.value for s in AlertSeverity]}"
detail=f"Invalid priority level. Must be one of: {valid_priority_levels}"
)
# Validate status enum
if status and status not in [s.value for s in AlertStatus]:
valid_status_values = ['active', 'resolved', 'acknowledged', 'ignored']
if status and status not in valid_status_values:
raise HTTPException(
status_code=400,
detail=f"Invalid status. Must be one of: {[s.value for s in AlertStatus]}"
detail=f"Invalid status. Must be one of: {valid_status_values}"
)
config = AlertProcessorConfig()
@@ -147,7 +152,7 @@ async def get_alerts(
repo = AlertsRepository(session)
alerts = await repo.get_alerts(
tenant_id=tenant_id,
severity=severity,
priority_level=priority_level,
status=status,
resolved=resolved,
limit=limit,
@@ -155,25 +160,42 @@ async def get_alerts(
)
# Convert to response models
alert_responses = [
AlertResponse(
alert_responses = []
for alert in alerts:
# Handle old format actions (strings) by converting to proper dict format
actions = alert.smart_actions
if actions and isinstance(actions, list) and len(actions) > 0:
# Check if actions are strings (old format)
if isinstance(actions[0], str):
# Convert old format to new format
actions = [
{
'action_type': action,
'label': action.replace('_', ' ').title(),
'variant': 'default',
'disabled': False
}
for action in actions
]
alert_responses.append(AlertResponse(
id=str(alert.id),
tenant_id=str(alert.tenant_id),
item_type=alert.item_type,
alert_type=alert.alert_type,
severity=alert.severity,
status=alert.status,
priority_level=alert.priority_level.value if hasattr(alert.priority_level, 'value') else alert.priority_level,
priority_score=alert.priority_score,
status=alert.status.value if hasattr(alert.status, 'value') else alert.status,
service=alert.service,
title=alert.title,
message=alert.message,
actions=alert.actions,
type_class=alert.type_class.value if hasattr(alert.type_class, 'value') else alert.type_class,
actions=actions, # Use converted actions
alert_metadata=alert.alert_metadata,
created_at=alert.created_at,
updated_at=alert.updated_at,
resolved_at=alert.resolved_at
)
for alert in alerts
]
))
return AlertsListResponse(
alerts=alert_responses,
@@ -214,17 +236,35 @@ async def get_alert(
if not alert:
raise HTTPException(status_code=404, detail="Alert not found")
# Handle old format actions (strings) by converting to proper dict format
actions = alert.smart_actions
if actions and isinstance(actions, list) and len(actions) > 0:
# Check if actions are strings (old format)
if isinstance(actions[0], str):
# Convert old format to new format
actions = [
{
'action_type': action,
'label': action.replace('_', ' ').title(),
'variant': 'default',
'disabled': False
}
for action in actions
]
return AlertResponse(
id=str(alert.id),
tenant_id=str(alert.tenant_id),
item_type=alert.item_type,
alert_type=alert.alert_type,
severity=alert.severity,
status=alert.status,
priority_level=alert.priority_level.value if hasattr(alert.priority_level, 'value') else alert.priority_level,
priority_score=alert.priority_score,
status=alert.status.value if hasattr(alert.status, 'value') else alert.status,
service=alert.service,
title=alert.title,
message=alert.message,
actions=alert.actions,
type_class=alert.type_class.value if hasattr(alert.type_class, 'value') else alert.type_class,
actions=actions, # Use converted actions
alert_metadata=alert.alert_metadata,
created_at=alert.created_at,
updated_at=alert.updated_at,
@@ -236,3 +276,242 @@ async def get_alert(
except Exception as e:
logger.error("Error getting alert", error=str(e), alert_id=str(alert_id))
raise HTTPException(status_code=500, detail=str(e))
@router.post(
    "/api/v1/tenants/{tenant_id}/alerts/{alert_id}/cancel-auto-action",
    summary="Cancel auto-action for escalation alert",
    description="Cancel the pending auto-action for an escalation-type alert"
)
async def cancel_auto_action(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    alert_id: UUID = Path(..., description="Alert ID")
) -> dict:
    """
    Cancel the auto-action scheduled for an escalation alert.

    Marks the alert's metadata (and urgency context, when present) as
    cancelled, clears the countdown and reclassifies the alert from
    ``escalation`` to ``action_needed`` so the system no longer executes
    the action automatically.

    Args:
        tenant_id: Tenant that owns the alert.
        alert_id: Alert whose auto-action should be cancelled.

    Returns:
        A dict with ``success``, ``alert_id``, ``message`` and the
        ``updated_type_class`` read back after the commit.

    Raises:
        HTTPException: 404 if the alert does not exist, 400 if it is not an
            escalation alert, 500 on any unexpected error.
    """
    from app.config import AlertProcessorConfig
    from shared.database.base import create_database_manager

    try:
        config = AlertProcessorConfig()
        db_manager = create_database_manager(config.DATABASE_URL, "alert-processor")

        async with db_manager.get_session() as session:
            repo = AlertsRepository(session)
            alert = await repo.get_alert_by_id(tenant_id, alert_id)

            if not alert:
                raise HTTPException(status_code=404, detail="Alert not found")

            # type_class may surface as an enum member or as a plain string
            # (the read endpoints in this module handle both), so normalise
            # before comparing instead of comparing the raw attribute.
            current_type_class = getattr(alert.type_class, 'value', alert.type_class)
            if current_type_class != 'escalation':
                raise HTTPException(
                    status_code=400,
                    detail="Alert is not an escalation type, no auto-action to cancel"
                )

            # Assign fresh dicts rather than mutating in place: SQLAlchemy
            # does not track in-place changes to plain JSON columns, so an
            # item assignment on the existing dict may never be flushed.
            alert.alert_metadata = {
                **(alert.alert_metadata or {}),
                'auto_action_cancelled': True,
                'auto_action_cancelled_at': datetime.utcnow().isoformat(),
            }

            # Remove the countdown from the urgency context, if there is one.
            if alert.urgency_context:
                alert.urgency_context = {
                    **alert.urgency_context,
                    'auto_action_countdown_seconds': None,
                    'auto_action_cancelled': True,
                }

            # Change type class from escalation to action_needed.
            # NOTE(review): assigned as a raw string, as in the original;
            # confirm the column accepts it if type_class is enum-backed.
            alert.type_class = 'action_needed'

            await session.commit()
            await session.refresh(alert)

            logger.info("Auto-action cancelled", alert_id=str(alert_id), tenant_id=str(tenant_id))

            # After the refresh the attribute may be an enum or a string;
            # return its string form in either case.
            updated_type_class = getattr(alert.type_class, 'value', alert.type_class)

            return {
                "success": True,
                "alert_id": str(alert_id),
                "message": "Auto-action cancelled successfully",
                "updated_type_class": updated_type_class
            }

    except HTTPException:
        raise
    except Exception as e:
        logger.error("Error cancelling auto-action", error=str(e), alert_id=str(alert_id))
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post(
    "/api/v1/tenants/{tenant_id}/alerts/{alert_id}/acknowledge",
    summary="Acknowledge alert",
    description="Mark alert as acknowledged"
)
async def acknowledge_alert(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    alert_id: UUID = Path(..., description="Alert ID")
) -> dict:
    """
    Mark an alert as acknowledged.

    Args:
        tenant_id: Tenant that owns the alert.
        alert_id: Alert to acknowledge.

    Returns:
        A dict with ``success``, ``alert_id`` and the new ``status`` value.

    Raises:
        HTTPException: 404 if the alert does not exist, 500 on any
            unexpected error.
    """
    from app.config import AlertProcessorConfig
    from shared.database.base import create_database_manager
    from app.models.events import AlertStatus

    try:
        config = AlertProcessorConfig()
        db_manager = create_database_manager(config.DATABASE_URL, "alert-processor")

        async with db_manager.get_session() as session:
            repo = AlertsRepository(session)
            alert = await repo.get_alert_by_id(tenant_id, alert_id)

            if not alert:
                raise HTTPException(status_code=404, detail="Alert not found")

            new_status = AlertStatus.ACKNOWLEDGED
            alert.status = new_status
            await session.commit()

            logger.info("Alert acknowledged", alert_id=str(alert_id), tenant_id=str(tenant_id))

            # Report the value we just wrote instead of reading alert.status
            # back: depending on the session's expire_on_commit setting the
            # instance may be expired after commit, and an attribute read
            # would then need a database round trip that is not awaited here.
            return {
                "success": True,
                "alert_id": str(alert_id),
                "status": new_status.value
            }

    except HTTPException:
        raise
    except Exception as e:
        logger.error("Error acknowledging alert", error=str(e), alert_id=str(alert_id))
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post(
    "/api/v1/tenants/{tenant_id}/alerts/{alert_id}/resolve",
    summary="Resolve alert",
    description="Mark alert as resolved"
)
async def resolve_alert(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    alert_id: UUID = Path(..., description="Alert ID")
) -> dict:
    """
    Mark an alert as resolved and stamp its resolution time.

    Args:
        tenant_id: Tenant that owns the alert.
        alert_id: Alert to resolve.

    Returns:
        A dict with ``success``, ``alert_id``, the new ``status`` value and
        ``resolved_at`` as an ISO-8601 string.

    Raises:
        HTTPException: 404 if the alert does not exist, 500 on any
            unexpected error.
    """
    from app.config import AlertProcessorConfig
    from shared.database.base import create_database_manager
    from app.models.events import AlertStatus

    try:
        config = AlertProcessorConfig()
        db_manager = create_database_manager(config.DATABASE_URL, "alert-processor")

        async with db_manager.get_session() as session:
            repo = AlertsRepository(session)
            alert = await repo.get_alert_by_id(tenant_id, alert_id)

            if not alert:
                raise HTTPException(status_code=404, detail="Alert not found")

            new_status = AlertStatus.RESOLVED
            resolved_at = datetime.utcnow()

            alert.status = new_status
            alert.resolved_at = resolved_at
            await session.commit()

            logger.info("Alert resolved", alert_id=str(alert_id), tenant_id=str(tenant_id))

            # Build the response from the local values rather than the ORM
            # instance: depending on the session's expire_on_commit setting
            # the instance may be expired after commit, and reading its
            # attributes would need a database round trip not awaited here.
            return {
                "success": True,
                "alert_id": str(alert_id),
                "status": new_status.value,
                "resolved_at": resolved_at.isoformat()
            }

    except HTTPException:
        raise
    except Exception as e:
        logger.error("Error resolving alert", error=str(e), alert_id=str(alert_id))
        raise HTTPException(status_code=500, detail=str(e)) from e
@router.post(
    "/api/v1/tenants/{tenant_id}/alerts/digest/send",
    summary="Send email digest for alerts"
)
async def send_alert_digest(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    days: int = Query(1, ge=1, le=7, description="Number of days to include in digest"),
    digest_type: str = Query("daily", description="Type of digest: daily or weekly"),
    user_email: str = Query(..., description="Email address to send digest to"),
    user_name: Optional[str] = Query(None, description="User name for personalization"),
    current_user: dict = Depends(get_current_user)
):
    """
    Send email digest of alerts.

    Loads the tenant's alerts created within the last ``days`` days (newest
    first) and hands them to the email digest service as either a daily or a
    weekly digest.

    Digest includes:
    - AI Impact Summary (prevented issues, savings)
    - Prevented Issues List with AI reasoning
    - Action Needed Alerts
    - Trend Warnings

    Args:
        tenant_id: Tenant whose alerts are summarised.
        days: Look-back window in days (1-7).
        digest_type: ``daily`` or ``weekly``.
        user_email: Recipient address.
        user_name: Optional recipient name used for personalization.
        current_user: Authenticated user resolved by ``get_current_user``.

    Returns:
        A dict with ``success``, ``message`` and ``alert_count``; when alerts
        were found it also carries ``digest_type`` and ``recipient``.

    Raises:
        HTTPException: 400 for an unknown ``digest_type``, 500 on any
            unexpected error.
    """
    from app.config import AlertProcessorConfig
    from shared.database.base import create_database_manager
    from app.models.events import Alert
    from app.services.enrichment.email_digest import EmailDigestService
    from sqlalchemy import select, and_
    from datetime import datetime, timedelta

    # NOTE(review): current_user is required but never compared against
    # tenant_id or user_email, so any authenticated caller appears able to
    # mail any tenant's digest to an arbitrary address - confirm whether
    # tenant access is enforced upstream (e.g. at a gateway).

    # Reject unknown digest types up front; previously any value other than
    # "weekly" silently sent a daily digest while echoing the bogus type back.
    valid_digest_types = ['daily', 'weekly']
    if digest_type not in valid_digest_types:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid digest type. Must be one of: {valid_digest_types}"
        )

    try:
        config = AlertProcessorConfig()
        db_manager = create_database_manager(config.DATABASE_URL, "alert-processor")

        async with db_manager.get_session() as session:
            cutoff_date = datetime.utcnow() - timedelta(days=days)

            # Fetch alerts from the specified period
            query = select(Alert).where(
                and_(
                    Alert.tenant_id == tenant_id,
                    Alert.created_at >= cutoff_date
                )
            ).order_by(Alert.created_at.desc())

            result = await session.execute(query)
            alerts = result.scalars().all()

            if not alerts:
                return {
                    "success": False,
                    "message": "No alerts found for the specified period",
                    "alert_count": 0
                }

            # Send digest
            digest_service = EmailDigestService(config)
            send_digest = (
                digest_service.send_weekly_digest
                if digest_type == "weekly"
                else digest_service.send_daily_digest
            )
            success = await send_digest(
                tenant_id=tenant_id,
                alerts=alerts,
                user_email=user_email,
                user_name=user_name
            )

            return {
                "success": success,
                "message": f"{'Successfully sent' if success else 'Failed to send'} {digest_type} digest",
                "alert_count": len(alerts),
                "digest_type": digest_type,
                "recipient": user_email
            }

    except HTTPException:
        # Keep deliberate HTTP errors intact, matching the sibling endpoints.
        raise
    except Exception as e:
        logger.error("Error sending email digest", error=str(e), tenant_id=str(tenant_id))
        raise HTTPException(status_code=500, detail=f"Failed to send email digest: {str(e)}") from e