Files
bakery-ia/services/tenant/app/api/network_alerts.py
2025-12-17 20:50:22 +01:00

445 lines
19 KiB
Python

"""
Network Alerts API
Endpoints for aggregating and managing alerts across enterprise networks
"""
from fastapi import APIRouter, Depends, HTTPException, Query
from typing import List, Dict, Any, Optional
from datetime import datetime
from pydantic import BaseModel, Field
import structlog
from app.services.network_alerts_service import NetworkAlertsService
from shared.auth.tenant_access import verify_tenant_permission_dep
from shared.clients import get_tenant_client, get_alerts_client
from app.core.config import settings
logger = structlog.get_logger()
router = APIRouter()
# Pydantic models for request/response
class NetworkAlert(BaseModel):
    # One alert as exposed by the network endpoints, tagged with the tenant it
    # was collected from. Documented with comments (not a docstring) so the
    # generated schema description stays exactly as before.

    # --- identity and origin ---
    alert_id: str = Field(default=..., description="Unique alert ID")
    tenant_id: str = Field(
        default=...,
        description="Tenant ID where alert originated",
    )
    tenant_name: str = Field(default=..., description="Tenant name")

    # --- classification ---
    alert_type: str = Field(
        default=...,
        description="Type of alert: inventory, production, delivery, etc.",
    )
    severity: str = Field(
        default=...,
        description="Severity: critical, high, medium, low",
    )

    # --- human-readable content ---
    title: str = Field(default=..., description="Alert title")
    message: str = Field(default=..., description="Alert message")

    # --- lifecycle ---
    timestamp: str = Field(default=..., description="Alert timestamp")
    status: str = Field(
        default=...,
        description="Alert status: active, acknowledged, resolved",
    )
    source_system: str = Field(
        default=...,
        description="System that generated the alert",
    )

    # --- optional link to the entity the alert is about ---
    related_entity_id: Optional[str] = Field(
        default=None,
        description="ID of related entity (product, route, etc.)",
    )
    related_entity_type: Optional[str] = Field(
        default=None,
        description="Type of related entity",
    )
class AlertSeveritySummary(BaseModel):
    # Per-severity alert counts plus the overall total. All fields are required.
    critical_count: int = Field(
        default=...,
        description="Number of critical alerts",
    )
    high_count: int = Field(
        default=...,
        description="Number of high severity alerts",
    )
    medium_count: int = Field(
        default=...,
        description="Number of medium severity alerts",
    )
    low_count: int = Field(
        default=...,
        description="Number of low severity alerts",
    )
    total_alerts: int = Field(
        default=...,
        description="Total number of alerts",
    )
class AlertTypeSummary(BaseModel):
    # Per-type alert counts; anything outside the five named types is counted
    # under ``other_alerts``. All fields are required.
    inventory_alerts: int = Field(
        default=...,
        description="Inventory-related alerts",
    )
    production_alerts: int = Field(
        default=...,
        description="Production-related alerts",
    )
    delivery_alerts: int = Field(
        default=...,
        description="Delivery-related alerts",
    )
    equipment_alerts: int = Field(
        default=...,
        description="Equipment-related alerts",
    )
    quality_alerts: int = Field(
        default=...,
        description="Quality-related alerts",
    )
    other_alerts: int = Field(
        default=...,
        description="Other types of alerts",
    )
class NetworkAlertsSummary(BaseModel):
    # Response body of the network alerts summary endpoint: totals by status,
    # nested breakdowns by severity and type, and the latest alert if any.

    # --- totals by lifecycle status ---
    total_alerts: int = Field(
        default=...,
        description="Total alerts across network",
    )
    active_alerts: int = Field(
        default=...,
        description="Currently active alerts",
    )
    acknowledged_alerts: int = Field(
        default=...,
        description="Acknowledged alerts",
    )
    resolved_alerts: int = Field(default=..., description="Resolved alerts")

    # --- nested breakdowns ---
    severity_summary: AlertSeveritySummary = Field(
        default=...,
        description="Alerts by severity",
    )
    type_summary: AlertTypeSummary = Field(
        default=...,
        description="Alerts by type",
    )

    # --- optional: absent when there are no alerts ---
    most_recent_alert: Optional[NetworkAlert] = Field(
        default=None,
        description="Most recent alert",
    )
class AlertCorrelation(BaseModel):
    # A group of alerts judged to be related: one primary alert, the alerts
    # correlated with it, and descriptive metadata. All fields are required.
    correlation_id: str = Field(
        default=...,
        description="Correlation group ID",
    )
    primary_alert: NetworkAlert = Field(
        default=...,
        description="Primary alert in the group",
    )
    related_alerts: List[NetworkAlert] = Field(
        default=...,
        description="Alerts correlated with primary alert",
    )
    correlation_type: str = Field(
        default=...,
        description="Type of correlation: causal, temporal, spatial",
    )
    # NOTE(review): the 0-1 range is only stated in the description; it is not
    # enforced by a validator here.
    correlation_strength: float = Field(
        default=...,
        description="Correlation strength (0-1)",
    )
    impact_analysis: str = Field(
        default=...,
        description="Analysis of combined impact",
    )
async def get_network_alerts_service() -> NetworkAlertsService:
    """Build the ``NetworkAlertsService`` injected into endpoints via ``Depends``.

    Returns:
        A service wired with a tenant client and an alerts client, both
        created from the application ``settings``.
    """
    # Second argument passed to both client factories; presumably the name of
    # the calling service -- confirm against shared.clients.
    caller = "tenant-service"
    return NetworkAlertsService(
        get_tenant_client(settings, caller),
        get_alerts_client(settings, caller),
    )
def _alert_timestamp_sort_key(value: Any) -> float:
    """Convert an alert timestamp into epoch seconds for ordering.

    Accepts ``datetime`` objects, numbers / numeric strings (taken as epoch
    seconds) and ISO-8601 strings. Missing or unparseable values map to
    ``0.0`` so they sort as the oldest instead of raising.
    """
    if not value:
        return 0.0
    if isinstance(value, datetime):
        return value.timestamp()
    try:
        return float(value)
    except (TypeError, ValueError):
        pass
    try:
        # ``fromisoformat`` only accepts a trailing "Z" from Python 3.11 on,
        # so normalise it to an explicit UTC offset first.
        parsed = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
        return parsed.timestamp()
    except (ValueError, OverflowError, OSError):
        return 0.0


@router.get(
    "/tenants/{parent_id}/network/alerts",
    response_model=List[NetworkAlert],
    summary="Get aggregated alerts across network",
)
async def get_network_alerts(
    parent_id: str,
    severity: Optional[str] = Query(None, description="Filter by severity: critical, high, medium, low"),
    alert_type: Optional[str] = Query(None, description="Filter by alert type"),
    status: Optional[str] = Query(None, description="Filter by status: active, acknowledged, resolved"),
    limit: int = Query(100, description="Maximum number of alerts to return"),
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Get aggregated alerts across all child tenants in a parent network

    This endpoint provides a unified view of alerts across the entire enterprise network,
    enabling network managers to identify and prioritize issues that require attention.

    Args:
        parent_id: ID of the tenant whose network is queried; must be of type 'parent'.
        severity: If given, keep only alerts with exactly this severity.
        alert_type: If given, keep only alerts with exactly this type.
        status: If given, keep only alerts with exactly this status.
        limit: Maximum number of alerts returned; negative values are treated as 0.
        network_alerts_service: Injected service used to reach tenants and alerts.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        Alert dicts ordered by severity (critical first), then newest first,
        truncated to ``limit``. Empty list when the parent has no children.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can access network alerts"
            )

        # Get all child tenants
        child_tenants = await network_alerts_service.get_child_tenants(parent_id)
        if not child_tenants:
            return []

        # Aggregate alerts from all child tenants
        all_alerts = []
        for child in child_tenants:
            child_id = child['id']
            child_name = child['name']

            # Get alerts for this child tenant
            child_alerts = await network_alerts_service.get_alerts_for_tenant(child_id)

            # Enrich with tenant information and apply filters
            for alert in child_alerts:
                enriched_alert = {
                    'alert_id': alert.get('alert_id', str(uuid.uuid4())),
                    'tenant_id': child_id,
                    'tenant_name': child_name,
                    'alert_type': alert.get('alert_type', 'unknown'),
                    'severity': alert.get('severity', 'medium'),
                    'title': alert.get('title', 'No title'),
                    'message': alert.get('message', 'No message'),
                    'timestamp': alert.get('timestamp', datetime.now().isoformat()),
                    'status': alert.get('status', 'active'),
                    'source_system': alert.get('source_system', 'unknown'),
                    'related_entity_id': alert.get('related_entity_id'),
                    'related_entity_type': alert.get('related_entity_type')
                }

                # Filters compare against the enriched values, so defaults
                # such as 'medium' / 'active' take part in the match.
                if severity and enriched_alert['severity'] != severity:
                    continue
                if alert_type and enriched_alert['alert_type'] != alert_type:
                    continue
                if status and enriched_alert['status'] != status:
                    continue
                all_alerts.append(enriched_alert)

        # Sort by severity (critical first) and timestamp (newest first).
        # Timestamps are normalised to epoch seconds first: calling int() on
        # an ISO-8601 string raises ValueError.
        severity_order = {'critical': 1, 'high': 2, 'medium': 3, 'low': 4}
        all_alerts.sort(
            key=lambda x: (
                severity_order.get(x['severity'], 5),
                -_alert_timestamp_sort_key(x['timestamp']),
            )
        )

        # Clamp so a negative limit cannot slice from the end of the list.
        return all_alerts[:max(limit, 0)]
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to get network alerts", parent_id=parent_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to get network alerts: {str(e)}")
@router.get(
    "/tenants/{parent_id}/network/alerts/summary",
    response_model=NetworkAlertsSummary,
    summary="Get network alerts summary",
)
async def get_network_alerts_summary(
    parent_id: str,
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Get summary of alerts across the network

    Provides aggregated metrics and statistics about alerts across all child tenants.

    Args:
        parent_id: ID of the tenant whose network is summarised; must be of type 'parent'.
        network_alerts_service: Injected service used to fetch the network's alerts.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        A ``NetworkAlertsSummary``; every count is 0 and ``most_recent_alert``
        is ``None`` when the network has no alerts.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from collections import Counter

    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can access network alerts summary"
            )

        # Get all network alerts; a falsy result (None / empty) is handled as
        # "no alerts", so the zero summary falls out of the same code path.
        all_alerts = await network_alerts_service.get_network_alerts(parent_id) or []
        total = len(all_alerts)

        # One pass per dimension instead of one pass per individual counter.
        status_counts = Counter(a['status'] for a in all_alerts)
        severity_counts = Counter(a['severity'] for a in all_alerts)
        type_counts = Counter(a['alert_type'] for a in all_alerts)

        severity_summary = AlertSeveritySummary(
            critical_count=severity_counts['critical'],
            high_count=severity_counts['high'],
            medium_count=severity_counts['medium'],
            low_count=severity_counts['low'],
            total_alerts=total
        )

        known_types = ('inventory', 'production', 'delivery', 'equipment', 'quality')
        type_summary = AlertTypeSummary(
            inventory_alerts=type_counts['inventory'],
            production_alerts=type_counts['production'],
            delivery_alerts=type_counts['delivery'],
            equipment_alerts=type_counts['equipment'],
            quality_alerts=type_counts['quality'],
            # Everything that is not one of the five named types.
            other_alerts=total - sum(type_counts[t] for t in known_types)
        )

        # Most recent alert by raw 'timestamp' value.
        # NOTE(review): this compares timestamps as-is; it is chronological
        # only if they share one sortable format -- confirm with the service.
        most_recent_alert = None
        if all_alerts:
            most_recent_alert = max(all_alerts, key=lambda x: x['timestamp'])

        return NetworkAlertsSummary(
            total_alerts=total,
            active_alerts=status_counts['active'],
            acknowledged_alerts=status_counts['acknowledged'],
            resolved_alerts=status_counts['resolved'],
            severity_summary=severity_summary,
            type_summary=type_summary,
            most_recent_alert=most_recent_alert
        )
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to get network alerts summary", parent_id=parent_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to get alerts summary: {str(e)}")
@router.get(
    "/tenants/{parent_id}/network/alerts/correlations",
    response_model=List[AlertCorrelation],
    summary="Get correlated alert groups",
)
async def get_correlated_alerts(
    parent_id: str,
    min_correlation_strength: float = Query(0.7, ge=0.5, le=1.0, description="Minimum correlation strength"),
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Get groups of correlated alerts

    Identifies alerts that are related or have cascading effects across the network.

    Args:
        parent_id: ID of the tenant whose network is analysed; must be of type 'parent'.
        min_correlation_strength: Threshold forwarded to the correlation
            detector; constrained to 0.5-1.0 by the query validator.
        network_alerts_service: Injected service that fetches alerts and detects correlations.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        Whatever the service's correlation detection returns, or an empty
        list when the network has no alerts.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can access alert correlations"
            )

        # Get all network alerts
        all_alerts = await network_alerts_service.get_network_alerts(parent_id)
        if not all_alerts:
            return []

        # Detect correlations (simplified for demo)
        return await network_alerts_service.detect_alert_correlations(
            all_alerts, min_correlation_strength
        )
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to get correlated alerts", parent_id=parent_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to get alert correlations: {str(e)}")
@router.post(
    "/tenants/{parent_id}/network/alerts/{alert_id}/acknowledge",
    summary="Acknowledge network alert",
)
async def acknowledge_network_alert(
    parent_id: str,
    alert_id: str,
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Acknowledge a network alert

    Marks an alert as acknowledged to indicate it's being addressed.

    Args:
        parent_id: ID of the tenant acting on the alert; must be of type 'parent'.
        alert_id: ID of the alert to acknowledge.
        network_alerts_service: Injected service that performs the acknowledgement.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        A dict with ``success``, ``alert_id``, ``status`` and ``message``.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can acknowledge network alerts"
            )

        # Acknowledge the alert. The service's return value is not part of the
        # response, so it is not bound to a name.
        await network_alerts_service.acknowledge_alert(parent_id, alert_id)

        return {
            'success': True,
            'alert_id': alert_id,
            'status': 'acknowledged',
            'message': 'Alert acknowledged successfully'
        }
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to acknowledge alert", parent_id=parent_id, alert_id=alert_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to acknowledge alert: {str(e)}")
@router.post(
    "/tenants/{parent_id}/network/alerts/{alert_id}/resolve",
    summary="Resolve network alert",
)
async def resolve_network_alert(
    parent_id: str,
    alert_id: str,
    resolution_notes: Optional[str] = Query(None, description="Notes about resolution"),
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Resolve a network alert

    Marks an alert as resolved after the issue has been addressed.

    Args:
        parent_id: ID of the tenant acting on the alert; must be of type 'parent'.
        alert_id: ID of the alert to resolve.
        resolution_notes: Optional free-text notes, forwarded to the service
            and echoed back in the response.
        network_alerts_service: Injected service that performs the resolution.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        A dict with ``success``, ``alert_id``, ``status``,
        ``resolution_notes`` and ``message``.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can resolve network alerts"
            )

        # Resolve the alert. The service's return value is not part of the
        # response, so it is not bound to a name.
        await network_alerts_service.resolve_alert(parent_id, alert_id, resolution_notes)

        return {
            'success': True,
            'alert_id': alert_id,
            'status': 'resolved',
            'resolution_notes': resolution_notes,
            'message': 'Alert resolved successfully'
        }
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to resolve alert", parent_id=parent_id, alert_id=alert_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to resolve alert: {str(e)}")
@router.get(
    "/tenants/{parent_id}/network/alerts/trends",
    summary="Get alert trends over time",
)
async def get_alert_trends(
    parent_id: str,
    days: int = Query(30, ge=7, le=365, description="Number of days to analyze"),
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Get alert trends over time

    Analyzes how alert patterns change over time to identify systemic issues.

    Args:
        parent_id: ID of the tenant whose network is analysed; must be of type 'parent'.
        days: Size of the analysis window; constrained to 7-365 by the query validator.
        network_alerts_service: Injected service that computes the trends.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        A dict with ``success``, the service's ``trends`` payload and a
        human-readable ``period`` label.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can access alert trends"
            )

        # Get alert trends
        trends = await network_alerts_service.get_alert_trends(parent_id, days)

        return {
            'success': True,
            'trends': trends,
            'period': f'Last {days} days'
        }
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to get alert trends", parent_id=parent_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to get alert trends: {str(e)}")
@router.get(
    "/tenants/{parent_id}/network/alerts/prioritization",
    summary="Get prioritized alerts",
)
async def get_prioritized_alerts(
    parent_id: str,
    limit: int = Query(10, description="Maximum number of alerts to return"),
    network_alerts_service: NetworkAlertsService = Depends(get_network_alerts_service),
    verified_tenant: str = Depends(verify_tenant_permission_dep)
):
    """
    Get prioritized alerts based on impact and urgency

    Uses AI to prioritize alerts based on potential business impact and urgency.

    Args:
        parent_id: ID of the tenant whose network is queried; must be of type 'parent'.
        limit: Maximum number of alerts, forwarded unchanged to the service.
        network_alerts_service: Injected service that performs the
            prioritisation; the ranking logic is not visible in this module.
        verified_tenant: Result of the permission dependency; not used in the
            body, declared so the check runs before the handler.

    Returns:
        A dict with ``success``, the service's ``prioritized_alerts`` and a
        ``message`` stating how many were returned.

    Raises:
        HTTPException: 403 when ``parent_id`` is not a parent tenant;
            500 on any other failure.
    """
    try:
        # Verify this is a parent tenant
        tenant_info = await network_alerts_service.tenant_client.get_tenant(parent_id)
        if tenant_info.get('tenant_type') != 'parent':
            raise HTTPException(
                status_code=403,
                detail="Only parent tenants can access prioritized alerts"
            )

        # Get prioritized alerts
        prioritized_alerts = await network_alerts_service.get_prioritized_alerts(parent_id, limit)

        return {
            'success': True,
            'prioritized_alerts': prioritized_alerts,
            'message': f'Top {len(prioritized_alerts)} prioritized alerts'
        }
    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 403 above) intact rather than
        # rewrapping them as 500.
        raise
    except Exception as e:
        logger.error("Failed to get prioritized alerts", parent_id=parent_id, error=str(e))
        raise HTTPException(status_code=500, detail=f"Failed to get prioritized alerts: {str(e)}")
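# Trailing standard-library imports. `uuid` is referenced by get_network_alerts
# above (resolved at call time, after this module has finished loading) and
# `datetime` is already imported at the top of the file.
# TODO: move these into the top-of-file import block.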
from datetime import datetime, timedelta
import uuid