New alert system and control panel page

Urtzi Alfaro
2025-11-27 15:52:40 +01:00
parent 1a2f4602f3
commit e902419b6e
178 changed files with 20982 additions and 6944 deletions

View File

@@ -55,23 +55,26 @@ class HeadlineData(BaseModel):
class HealthChecklistItem(BaseModel):
"""Individual item in health checklist"""
icon: str = Field(..., description="Icon name: check, warning, alert")
"""Individual item in tri-state health checklist"""
icon: str = Field(..., description="Icon name: check, warning, alert, ai_handled")
text: Optional[str] = Field(None, description="Deprecated: Use textKey instead")
textKey: Optional[str] = Field(None, description="i18n translation key")
textParams: Optional[Dict[str, Any]] = Field(None, description="Parameters for i18n translation")
actionRequired: bool = Field(..., description="Whether action is required")
status: str = Field(..., description="Tri-state status: good, ai_handled, needs_you")
actionPath: Optional[str] = Field(None, description="Path to navigate for action")
class BakeryHealthStatusResponse(BaseModel):
"""Overall bakery health status"""
"""Overall bakery health status with tri-state checklist"""
status: str = Field(..., description="Health status: green, yellow, red")
headline: HeadlineData = Field(..., description="i18n-ready status headline")
lastOrchestrationRun: Optional[str] = Field(None, description="ISO timestamp of last orchestration")
nextScheduledRun: str = Field(..., description="ISO timestamp of next scheduled run")
checklistItems: List[HealthChecklistItem] = Field(..., description="Status checklist")
checklistItems: List[HealthChecklistItem] = Field(..., description="Tri-state status checklist")
criticalIssues: int = Field(..., description="Count of critical issues")
pendingActions: int = Field(..., description="Count of pending actions")
aiPreventedIssues: int = Field(0, description="Count of issues AI prevented")
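
A minimal sketch of constructing the new tri-state checklist items. Field names mirror the models above; the textKey values, params, and action path are invented for illustration:

```python
from typing import Any, Dict, Optional
from pydantic import BaseModel, Field

class HealthChecklistItem(BaseModel):
    icon: str = Field(..., description="Icon name: check, warning, alert, ai_handled")
    text: Optional[str] = None                    # deprecated: use textKey
    textKey: Optional[str] = None                 # i18n translation key
    textParams: Optional[Dict[str, Any]] = None   # i18n parameters
    actionRequired: bool
    status: str                                   # good | ai_handled | needs_you
    actionPath: Optional[str] = None

items = [
    # All good: no action needed, nothing for the AI to do.
    HealthChecklistItem(icon="check", textKey="health.stock_ok",
                        actionRequired=False, status="good"),
    # AI already handled it: e.g. a PO was auto-created.
    HealthChecklistItem(icon="ai_handled", textKey="health.po_auto_created",
                        textParams={"count": 2},
                        actionRequired=False, status="ai_handled"),
    # Needs the user: approval pending, with a navigation target.
    HealthChecklistItem(icon="alert", textKey="health.approve_po",
                        actionRequired=True, status="needs_you",
                        actionPath="/procurement/pending"),
]
```
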
class ReasoningInputs(BaseModel):
@@ -207,10 +210,10 @@ async def get_bakery_health_status(
db: AsyncSession = Depends(get_db)
) -> BakeryHealthStatusResponse:
"""
Get overall bakery health status
Get overall bakery health status with tri-state checklist
This is the top-level indicator showing if the bakery is running smoothly
or if there are issues requiring attention.
or if there are issues requiring attention. Includes AI-prevented issues.
"""
try:
# Try to get from cache
@@ -227,11 +230,19 @@ async def get_bakery_health_status(
async def fetch_alerts():
try:
alerts_data = await alerts_client.get_alerts_summary(tenant_id) or {}
return alerts_data.get("critical_count", 0)
alerts_data = await alerts_client.get_alerts(tenant_id, limit=100) or {}
alerts_list = alerts_data.get("alerts", [])
# Count critical alerts
critical_count = sum(1 for a in alerts_list if a.get('priority_level') == 'CRITICAL')
# Count AI prevented issues
prevented_count = sum(1 for a in alerts_list if a.get('type_class') == 'prevented_issue')
return critical_count, prevented_count, alerts_list
except Exception as e:
logger.warning(f"Failed to fetch alerts: {e}")
return 0
return 0, 0, []
async def fetch_pending_pos():
try:
@@ -260,24 +271,28 @@ async def get_bakery_health_status(
return 0
# Execute all fetches in parallel
critical_alerts, pending_approvals, production_delays, out_of_stock_count = await asyncio.gather(
alerts_result, pending_approvals, production_delays, out_of_stock_count = await asyncio.gather(
fetch_alerts(),
fetch_pending_pos(),
fetch_production_delays(),
fetch_inventory()
)
critical_alerts, ai_prevented_count, all_alerts = alerts_result
# System errors (would come from monitoring system)
system_errors = 0
# Calculate health status
# Calculate health status with tri-state checklist
health_status = await dashboard_service.get_bakery_health_status(
tenant_id=tenant_id,
critical_alerts=critical_alerts,
pending_approvals=pending_approvals,
production_delays=production_delays,
out_of_stock_count=out_of_stock_count,
system_errors=system_errors
system_errors=system_errors,
ai_prevented_count=ai_prevented_count,
action_needed_alerts=all_alerts
)
# Cache the result
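
The reworked fetch_alerts derives both counts from one alert list instead of a pre-aggregated summary. A standalone sketch of that classification, assuming alerts are plain dicts with priority_level and type_class keys as in the diff (the sample type_class values other than prevented_issue are invented):

```python
from typing import Any, Dict, List, Tuple

def classify_alerts(alerts: List[Dict[str, Any]]) -> Tuple[int, int]:
    """Return (critical_count, ai_prevented_count), as in fetch_alerts."""
    critical = sum(1 for a in alerts if a.get("priority_level") == "CRITICAL")
    prevented = sum(1 for a in alerts if a.get("type_class") == "prevented_issue")
    return critical, prevented

# Two critical alerts, one issue the AI already prevented.
sample = [
    {"priority_level": "CRITICAL", "type_class": "stockout"},
    {"priority_level": "CRITICAL", "type_class": "delivery_delay"},
    {"priority_level": "LOW", "type_class": "prevented_issue"},
]
assert classify_alerts(sample) == (2, 1)
```
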
@@ -501,6 +516,116 @@ async def get_production_timeline(
raise HTTPException(status_code=500, detail=str(e))
@router.get("/unified-action-queue")
async def get_unified_action_queue(
tenant_id: str,
db: AsyncSession = Depends(get_db)
) -> Dict[str, Any]:
"""
Get unified action queue with time-based grouping
Combines all alerts (PO approvals, delivery tracking, production, etc.)
into URGENT (<6h), TODAY (<24h), and THIS WEEK (<7d) sections.
"""
try:
dashboard_service = DashboardService(db)
# Fetch all alerts from alert processor
alerts_data = await alerts_client.get_alerts(tenant_id, limit=100) or {}
alerts = alerts_data.get("alerts", [])
# Build unified queue
action_queue = await dashboard_service.get_unified_action_queue(
tenant_id=tenant_id,
alerts=alerts
)
return action_queue
except Exception as e:
logger.error(f"Error getting unified action queue: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
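
The actual grouping is delegated to DashboardService.get_unified_action_queue, which is not shown in this hunk. A hedged sketch of the time-based bucketing the docstring describes, assuming each alert carries an ISO-8601 deadline field (an assumption; the real field name may differ):

```python
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List

def group_by_deadline(alerts: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Bucket alerts into URGENT (<6h), TODAY (<24h), THIS WEEK (<7d).

    Alerts with no deadline, or a deadline beyond 7 days, are dropped.
    """
    now = datetime.now(timezone.utc)
    buckets: Dict[str, List[Dict[str, Any]]] = {"urgent": [], "today": [], "this_week": []}
    for alert in alerts:
        raw = alert.get("deadline")
        if not raw:
            continue
        deadline = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        remaining = deadline - now
        if remaining < timedelta(hours=6):
            buckets["urgent"].append(alert)
        elif remaining < timedelta(hours=24):
            buckets["today"].append(alert)
        elif remaining < timedelta(days=7):
            buckets["this_week"].append(alert)
    return buckets
```
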
@router.get("/execution-progress")
async def get_execution_progress(
tenant_id: str,
db: AsyncSession = Depends(get_db)
) -> Dict[str, Any]:
"""
Get execution progress for today's plan
Shows plan vs actual for production batches, deliveries, and approvals
"""
try:
dashboard_service = DashboardService(db)
# Fetch today's data in parallel
async def fetch_todays_batches():
try:
batch_data = await production_client.get_todays_batches(tenant_id)
if batch_data:
return batch_data.get("batches", [])
return []
except Exception as e:
logger.warning(f"Failed to fetch today's batches: {e}")
return []
async def fetch_expected_deliveries():
try:
# Get POs with expected deliveries today
from datetime import datetime, timedelta, timezone
pos_result = await procurement_client.get_pending_purchase_orders(tenant_id, limit=100)
if pos_result and isinstance(pos_result, list):
today_start = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
today_end = today_start.replace(hour=23, minute=59, second=59)
deliveries_today = []
for po in pos_result:
expected_date = po.get("expected_delivery_date")
if expected_date:
if isinstance(expected_date, str):
expected_date = datetime.fromisoformat(expected_date.replace('Z', '+00:00'))
if today_start <= expected_date <= today_end:
deliveries_today.append(po)
return deliveries_today
return []
except Exception as e:
logger.warning(f"Failed to fetch expected deliveries: {e}")
return []
async def fetch_pending_approvals():
try:
po_data = await procurement_client.get_pending_purchase_orders(tenant_id, limit=100) or []
return len(po_data) if isinstance(po_data, list) else 0
except Exception as e:
logger.warning(f"Failed to fetch pending approvals: {e}")
return 0
# Execute in parallel
todays_batches, expected_deliveries, pending_approvals = await asyncio.gather(
fetch_todays_batches(),
fetch_expected_deliveries(),
fetch_pending_approvals()
)
# Calculate progress
progress = await dashboard_service.get_execution_progress(
tenant_id=tenant_id,
todays_batches=todays_batches,
expected_deliveries=expected_deliveries,
pending_approvals=pending_approvals
)
return progress
except Exception as e:
logger.error(f"Error getting execution progress: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
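
DashboardService.get_execution_progress is likewise not shown here; a sketch of the plan-vs-actual roll-up its inputs suggest (the status values "completed" and "delivered" are assumptions, not confirmed by the diff):

```python
from typing import Any, Dict, List

def summarize_progress(todays_batches: List[Dict[str, Any]],
                       expected_deliveries: List[Dict[str, Any]],
                       pending_approvals: int) -> Dict[str, Any]:
    """Assumed shape: counts of done vs. planned per category."""
    batches_done = sum(1 for b in todays_batches if b.get("status") == "completed")
    deliveries_done = sum(1 for d in expected_deliveries if d.get("status") == "delivered")
    return {
        "production": {"completed": batches_done, "planned": len(todays_batches)},
        "deliveries": {"received": deliveries_done, "expected": len(expected_deliveries)},
        "approvals": {"pending": pending_approvals},
    }
```
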
@router.get("/insights", response_model=InsightsResponse)
async def get_insights(
tenant_id: str,
@@ -575,35 +700,32 @@ async def get_insights(
async def fetch_savings():
try:
# Get recent POs (last 7 days) and sum up optimization savings
seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
# Get prevented issue savings from alert analytics
analytics = await alerts_client.get_dashboard_analytics(tenant_id, days=7)
pos_result = await procurement_client.get_pending_purchase_orders(tenant_id, limit=200)
if pos_result and isinstance(pos_result, list):
weekly_savings = 0
# Calculate savings from price optimization
for po in pos_result:
# Check if PO was created in last 7 days
created_at = po.get("created_at")
if created_at:
if isinstance(created_at, str):
created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
if created_at >= seven_days_ago:
# Sum up savings from optimization
optimization_data = po.get("optimization_data", {})
if isinstance(optimization_data, dict):
savings = optimization_data.get("savings", 0) or 0
weekly_savings += float(savings)
if analytics:
weekly_savings = analytics.get('estimated_savings_eur', 0)
prevented_count = analytics.get('prevented_issues_count', 0)
# Calculate trend from period comparison
period_comparison = analytics.get('period_comparison', {})
current_prevented = period_comparison.get('current_prevented', 0)
previous_prevented = period_comparison.get('previous_prevented', 0)
trend_percentage = 0
if previous_prevented > 0:
trend_percentage = ((current_prevented - previous_prevented) / previous_prevented) * 100
# Default trend percentage (would need historical data for real trend)
return {
"weekly_savings": round(weekly_savings, 2),
"trend_percentage": 12 if weekly_savings > 0 else 0
"trend_percentage": round(trend_percentage, 1),
"prevented_count": prevented_count
}
return {"weekly_savings": 0, "trend_percentage": 0}
return {"weekly_savings": 0, "trend_percentage": 0, "prevented_count": 0}
except Exception as e:
logger.warning(f"Failed to calculate savings data: {e}")
return {"weekly_savings": 0, "trend_percentage": 0}
return {"weekly_savings": 0, "trend_percentage": 0, "prevented_count": 0}
# Execute all fetches in parallel
sustainability_data, inventory_data, delivery_data, savings_data = await asyncio.gather(
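
The trend now comes from the analytics period comparison rather than the previous hardcoded 12%. The core arithmetic, with the zero-division guard made explicit:

```python
def trend_pct(current: int, previous: int) -> float:
    """Percentage change in prevented issues vs. the previous period.

    Returns 0 when there is no previous baseline, matching the diff's guard.
    """
    if previous <= 0:
        return 0.0
    return round(((current - previous) / previous) * 100, 1)

assert trend_pct(6, 5) == 20.0   # 5 -> 6 prevented issues: +20%
assert trend_pct(3, 0) == 0.0    # no baseline: trend suppressed
```
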

View File

@@ -0,0 +1,181 @@
"""
Internal API for Alert Intelligence Service
Provides orchestrator context for alert enrichment
"""
from fastapi import APIRouter, Header, HTTPException, Query
from typing import Optional, List, Dict, Any
from datetime import datetime, timedelta
from uuid import UUID
from pydantic import BaseModel
router = APIRouter(prefix="/api/internal", tags=["internal"])
class OrchestrationAction(BaseModel):
"""Recent orchestration action"""
id: str
type: str # purchase_order, production_batch
status: str # created, pending_approval, approved, completed
delivery_date: Optional[datetime]
reasoning: Optional[Dict[str, Any]]
estimated_resolution: Optional[datetime]
created_at: datetime
class RecentActionsResponse(BaseModel):
"""Response with recent orchestrator actions"""
actions: List[OrchestrationAction]
count: int
@router.get("/recent-actions", response_model=RecentActionsResponse)
async def get_recent_actions(
tenant_id: str = Query(..., description="Tenant ID"),
ingredient_id: Optional[str] = Query(None, description="Filter by ingredient"),
product_id: Optional[str] = Query(None, description="Filter by product"),
hours_ago: int = Query(24, description="Look-back window in hours"),
x_internal_service: str = Header(None, description="Internal service authentication")
):
"""
Get recent orchestrator actions for alert context enrichment.
Only accessible by internal services (alert-intelligence).
Returns orchestration runs with details about POs created, batches adjusted, etc.
This helps the alert system understand if AI already addressed similar issues.
"""
from shared.database.base import create_database_manager
from ..core.config import get_settings
from ..models.orchestration_run import OrchestrationRun, OrchestrationStatus
from sqlalchemy import select, and_, desc
import structlog
logger = structlog.get_logger()
# Simple internal service authentication
if x_internal_service != "alert-intelligence":
raise HTTPException(status_code=403, detail="Access denied")
try:
settings = get_settings()
db_manager = create_database_manager(settings.DATABASE_URL, "orchestrator")
async with db_manager.get_session() as session:
cutoff_time = datetime.utcnow() - timedelta(hours=hours_ago)
# Query recent orchestration runs
query = select(OrchestrationRun).where(
and_(
OrchestrationRun.tenant_id == UUID(tenant_id),
OrchestrationRun.created_at >= cutoff_time,
OrchestrationRun.status.in_([
OrchestrationStatus.completed,
OrchestrationStatus.partial_success
])
)
).order_by(desc(OrchestrationRun.created_at))
result = await session.execute(query)
runs = result.scalars().all()
actions = []
for run in runs:
run_metadata = run.run_metadata or {}
# Add purchase order actions
if run.purchase_orders_created > 0:
po_details = run_metadata.get('purchase_orders', [])
# If metadata has PO details, use them
if po_details:
for po in po_details:
# Filter by ingredient if specified
if ingredient_id:
po_items = po.get('items', [])
has_ingredient = any(
item.get('ingredient_id') == ingredient_id
for item in po_items
)
if not has_ingredient:
continue
actions.append(OrchestrationAction(
id=po.get('id', str(run.id)),
type="purchase_order",
status=po.get('status', 'created'),
delivery_date=po.get('delivery_date'),
reasoning=run_metadata.get('reasoning'),
estimated_resolution=po.get('delivery_date'),
created_at=run.created_at
))
else:
# Fallback: create generic action from run
actions.append(OrchestrationAction(
id=str(run.id),
type="purchase_order",
status="created",
delivery_date=None,
reasoning=run_metadata.get('reasoning'),
estimated_resolution=None,
created_at=run.created_at
))
# Add production batch actions
if run.production_batches_created > 0:
batch_details = run_metadata.get('production_batches', [])
if batch_details:
for batch in batch_details:
# Filter by product if specified
if product_id and batch.get('product_id') != product_id:
continue
actions.append(OrchestrationAction(
id=batch.get('id', str(run.id)),
type="production_batch",
status=batch.get('status', 'created'),
delivery_date=None,
reasoning=run_metadata.get('reasoning'),
estimated_resolution=batch.get('scheduled_date'),
created_at=run.created_at
))
else:
# Fallback: create generic action from run
if not product_id: # Only add if no product filter
actions.append(OrchestrationAction(
id=str(run.id),
type="production_batch",
status="created",
delivery_date=None,
reasoning=run_metadata.get('reasoning'),
estimated_resolution=None,
created_at=run.created_at
))
logger.info(
"recent_actions_fetched",
tenant_id=tenant_id,
hours_ago=hours_ago,
action_count=len(actions),
ingredient_id=ingredient_id,
product_id=product_id
)
return RecentActionsResponse(
actions=actions,
count=len(actions)
)
except Exception as e:
logger.error("error_fetching_recent_actions", error=str(e), tenant_id=tenant_id)
raise HTTPException(
status_code=500,
detail=f"Failed to fetch recent actions: {str(e)}"
)
@router.get("/health")
async def internal_health():
"""Internal health check"""
return {"status": "healthy", "api": "internal"}

View File

@@ -16,6 +16,12 @@ from app.models.orchestration_run import OrchestrationRun
import uuid
from datetime import datetime, timezone, timedelta
from typing import Optional
import sys
from pathlib import Path
# Add shared utilities to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from shared.utils.demo_dates import adjust_date_for_demo, BASE_REFERENCE_DATE
router = APIRouter()
logger = structlog.get_logger()
@@ -24,6 +30,27 @@ logger = structlog.get_logger()
INTERNAL_API_KEY = os.getenv("INTERNAL_API_KEY", "dev-internal-key-change-in-production")
async def ensure_unique_run_number(db: AsyncSession, base_run_number: str) -> str:
"""Ensure the run number is unique by appending a suffix if needed"""
proposed_run_number = base_run_number
# Check if the proposed run number already exists in the database
while True:
result = await db.execute(
select(OrchestrationRun)
.where(OrchestrationRun.run_number == proposed_run_number)
)
existing_run = result.scalar_one_or_none()
if not existing_run:
# Run number is unique, return it
return proposed_run_number
# Generate a new run number with an additional random suffix
random_suffix = str(uuid.uuid4())[:4].upper()
proposed_run_number = f"{base_run_number[:50-len(random_suffix)-1]}-{random_suffix}"
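
The slice in the fallback keeps the suffixed run number within 50 characters, presumably a column width constraint (the schema is not shown in this diff). A quick check of the arithmetic:

```python
import uuid

base = "ORCH-DEMO-TST-2025-0001-AAAABBBB-EXTRA-LONG-SEGMENT"  # 51 chars, over the cap
suffix = str(uuid.uuid4())[:4].upper()                        # e.g. "9F3C"
candidate = f"{base[:50 - len(suffix) - 1]}-{suffix}"
assert len(candidate) <= 50  # at most 45 chars of base + "-" + 4-char suffix
```
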
def verify_internal_api_key(x_internal_api_key: str = Header(...)):
"""Verify internal API key for service-to-service communication"""
if x_internal_api_key != INTERNAL_API_KEY:
@@ -86,38 +113,60 @@ async def clone_demo_data(
# Clone each orchestration run with date adjustment
for base_run in base_runs:
# Calculate time offset: how old was this run relative to when it was created
# We'll adjust all timestamps to be relative to the session creation time
# Use the shared date adjustment utility to ensure dates are always in the past
# This calculates the offset from BASE_REFERENCE_DATE and applies it to session creation time
if base_run.started_at:
# Calculate how many days ago this run was from a reference point
# Use a fixed reference date for consistency
reference_date = datetime(2025, 1, 15, 12, 0, 0, tzinfo=timezone.utc)
time_offset = base_run.started_at - reference_date
# Apply this offset to the current reference time
new_started_at = reference_time + time_offset
new_started_at = adjust_date_for_demo(
base_run.started_at, reference_time, BASE_REFERENCE_DATE
)
else:
new_started_at = reference_time - timedelta(hours=2)
# Adjust completed_at if it exists
if base_run.completed_at and base_run.started_at:
duration = base_run.completed_at - base_run.started_at
new_completed_at = new_started_at + duration
# Adjust completed_at using the same utility
if base_run.completed_at:
new_completed_at = adjust_date_for_demo(
base_run.completed_at, reference_time, BASE_REFERENCE_DATE
)
# Ensure completion is after start (guards against offset edge cases)
if new_completed_at and new_started_at and new_completed_at < new_started_at:
# Preserve original duration
duration = base_run.completed_at - base_run.started_at
new_completed_at = new_started_at + duration
else:
new_completed_at = None
# Adjust all step timestamps proportionally
# Adjust all step timestamps using the shared utility
def adjust_timestamp(original_timestamp):
if not original_timestamp or not base_run.started_at:
if not original_timestamp:
return None
step_offset = original_timestamp - base_run.started_at
return new_started_at + step_offset
return adjust_date_for_demo(original_timestamp, reference_time, BASE_REFERENCE_DATE)
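
shared.utils.demo_dates itself is not part of this diff. Based on the comments above, a plausible reading of adjust_date_for_demo is an offset-preserving shift; this is an assumption about the shared utility, not its actual source:

```python
from datetime import datetime, timezone

# Assumed behaviour: preserve each timestamp's offset from BASE_REFERENCE_DATE
# and re-apply it relative to the demo session's reference time, so cloned
# dates always land in the past relative to "now".
BASE_REFERENCE_DATE = datetime(2025, 1, 15, 12, 0, 0, tzinfo=timezone.utc)

def adjust_date_for_demo_sketch(original: datetime,
                                reference_time: datetime,
                                base_reference: datetime = BASE_REFERENCE_DATE) -> datetime:
    return reference_time + (original - base_reference)

# A run that started 3 days before the base reference lands 3 days
# before the session's reference time after cloning.
```
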
# Create new orchestration run for virtual tenant
# Update run_number to have current year instead of original year, and make it unique
current_year = reference_time.year
# Extract type from original run number and create new format
parts = base_run.run_number.split('-')
if len(parts) >= 4:
tenant_prefix = parts[1] if len(parts) > 1 else "DEMO"
type_code = parts[2] if len(parts) > 2 else "TST"
original_index = parts[3] if len(parts) > 3 else "001"
# Generate a more robust unique suffix to avoid collisions
# Use UUID instead of just session_id substring to ensure uniqueness
unique_suffix = str(uuid.uuid4())[:8].upper()
proposed_run_number = f"ORCH-{tenant_prefix}-{type_code}-{current_year}-{original_index}-{unique_suffix}"
else:
unique_suffix = str(uuid.uuid4())[:12].upper()
proposed_run_number = f"{base_run.run_number}-{unique_suffix}"
# Ensure the run number is truly unique by checking against existing entries
# This prevents collisions, especially in high-concurrency scenarios
run_number = await ensure_unique_run_number(db, proposed_run_number)
new_run = OrchestrationRun(
id=uuid.uuid4(),
tenant_id=virtual_uuid,
run_number=f"{base_run.run_number}-DEMO",
run_number=run_number,
status=base_run.status,
run_type=base_run.run_type,
priority=base_run.priority,