New alert system and panel de control page

This commit is contained in:
Urtzi Alfaro
2025-11-27 15:52:40 +01:00
parent 1a2f4602f3
commit e902419b6e
178 changed files with 20982 additions and 6944 deletions

View File

@@ -41,7 +41,7 @@ spec:
cpu: "500m"
livenessProbe:
httpGet:
path: /
path: /health
port: 3000
initialDelaySeconds: 60
timeoutSeconds: 10
@@ -49,7 +49,7 @@ spec:
failureThreshold: 3
readinessProbe:
httpGet:
path: /
path: /health
port: 3000
initialDelaySeconds: 20
timeoutSeconds: 5

View File

@@ -187,6 +187,33 @@ data:
ALERT_DEDUPLICATION_WINDOW_MINUTES: "15"
RECOMMENDATION_DEDUPLICATION_WINDOW_MINUTES: "60"
# Alert Enrichment Configuration (Unified Alert Service)
# Priority scoring weights (must sum to 1.0)
BUSINESS_IMPACT_WEIGHT: "0.4"
URGENCY_WEIGHT: "0.3"
USER_AGENCY_WEIGHT: "0.2"
CONFIDENCE_WEIGHT: "0.1"
# Priority thresholds (0-100 scale)
CRITICAL_THRESHOLD: "90"
IMPORTANT_THRESHOLD: "70"
STANDARD_THRESHOLD: "50"
# Timing intelligence
BUSINESS_HOURS_START: "6"
BUSINESS_HOURS_END: "22"
PEAK_HOURS_START: "7"
PEAK_HOURS_END: "11"
PEAK_HOURS_EVENING_START: "17"
PEAK_HOURS_EVENING_END: "19"
# Alert grouping
GROUPING_TIME_WINDOW_MINUTES: "15"
MAX_ALERTS_PER_GROUP: "5"
# Email digest
DIGEST_SEND_TIME: "18:00"
# ================================================================
# CHECK FREQUENCIES (CRON EXPRESSIONS)
# ================================================================

View File

@@ -0,0 +1,120 @@
# CronJob: runs the alert-processor image's `app.jobs.priority_recalculation`
# module once per hour (minute 15) in the bakery-ia namespace.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: alert-priority-recalculation
  namespace: bakery-ia
  labels:
    app: alert-priority-recalculation
    component: cron
    service: alert-processor
spec:
  # Schedule: every hour at minute 15
  schedule: "15 * * * *"

  # Keep last 3 successful jobs and 1 failed job for debugging
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 1

  # Don't start a new job if the previous one is still running
  concurrencyPolicy: Forbid

  # A run that misses its scheduled time may still be started up to
  # 10 minutes late; after that the run is skipped. This does NOT bound
  # how long the job may run (see activeDeadlineSeconds below).
  startingDeadlineSeconds: 600

  jobTemplate:
    spec:
      # Retry up to 2 times if job fails
      backoffLimit: 2

      # Job must complete within 30 minutes
      activeDeadlineSeconds: 1800

      template:
        metadata:
          labels:
            app: alert-priority-recalculation
            component: cron
        spec:
          restartPolicy: OnFailure

          # Use alert-processor service image
          # NOTE(review): `latest` + `Always` means each run may execute a
          # different build; consider pinning a tag or digest.
          containers:
            - name: priority-recalc
              image: bakery/alert-processor:latest
              imagePullPolicy: Always
              command:
                - python3
                - -m
                - app.jobs.priority_recalculation
              env:
                # Database connection
                - name: DATABASE_URL
                  valueFrom:
                    secretKeyRef:
                      name: database-secrets
                      key: ALERT_PROCESSOR_DATABASE_URL

                # Redis connection
                # NOTE(review): hardcoded URL with no credentials and with
                # certificate verification disabled (ssl_cert_reqs=none).
                # Consider sourcing this from a Secret — confirm what the
                # Redis deployment actually requires.
                - name: REDIS_URL
                  value: rediss://redis-service:6379/0?ssl_cert_reqs=none

                # Priority scoring weights (these four sum to 1.0)
                - name: BUSINESS_IMPACT_WEIGHT
                  value: "0.40"
                - name: URGENCY_WEIGHT
                  value: "0.30"
                - name: USER_AGENCY_WEIGHT
                  value: "0.20"
                - name: CONFIDENCE_WEIGHT
                  value: "0.10"

                # Priority thresholds
                - name: CRITICAL_THRESHOLD
                  value: "90"
                - name: IMPORTANT_THRESHOLD
                  value: "70"
                - name: STANDARD_THRESHOLD
                  value: "50"

                # Escalation thresholds (hours)
                - name: ESCALATION_THRESHOLD_48H
                  value: "48"
                - name: ESCALATION_THRESHOLD_72H
                  value: "72"

                # Service settings
                - name: LOG_LEVEL
                  value: "INFO"
                - name: PYTHONUNBUFFERED
                  value: "1"

              resources:
                requests:
                  memory: "256Mi"
                  cpu: "100m"
                limits:
                  memory: "512Mi"
                  cpu: "500m"
---
# ConfigMap describing the alert-priority-recalculation schedule and the
# boost values associated with escalation and deadlines.
# NOTE(review): the CronJob defined in this file does not reference this
# ConfigMap (no envFrom / configMapKeyRef); presumably informational or read
# by the application directly — confirm a consumer exists.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-priority-recalculation-config
  namespace: bakery-ia
data:
  schedule: "Hourly at minute 15"
  description: "Recalculates alert priorities with time-based escalation"
  escalation_48h_boost: "10"
  escalation_72h_boost: "20"
  deadline_24h_boost: "15"
  deadline_6h_boost: "30"
  max_boost: "30"

View File

@@ -0,0 +1,176 @@
# CronJob: once per hour (minute 30) runs an inline Python script inside the
# orchestrator image that calls DeliveryTrackingService.check_expected_deliveries
# for each tenant listed in ACTIVE_TENANT_IDS.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: delivery-tracking
  namespace: bakery-ia
  labels:
    app: delivery-tracking
    component: cron
    service: orchestrator
spec:
  # Schedule: every hour at minute 30
  schedule: "30 * * * *"

  # Keep last 3 successful jobs and 1 failed job for debugging
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 1

  # Don't start a new job if the previous one is still running
  concurrencyPolicy: Forbid

  # A run that misses its scheduled time may still be started up to
  # 10 minutes late; after that the run is skipped. This does NOT bound
  # how long the job may run (see activeDeadlineSeconds below).
  startingDeadlineSeconds: 600

  jobTemplate:
    spec:
      # Retry up to 2 times if job fails
      backoffLimit: 2

      # Job must complete within 30 minutes
      activeDeadlineSeconds: 1800

      template:
        metadata:
          labels:
            app: delivery-tracking
            component: cron
        spec:
          restartPolicy: OnFailure

          # Use orchestrator service image
          containers:
            - name: delivery-tracker
              image: bakery/orchestrator-service:latest
              imagePullPolicy: Always
              command:
                - python3
                - -c
                - |
                  import asyncio
                  import os

                  from app.services.delivery_tracking_service import DeliveryTrackingService
                  from shared.database.base import create_database_manager
                  from app.core.config import settings
                  from shared.messaging.rabbitmq import RabbitMQClient
                  import structlog

                  logger = structlog.get_logger()


                  async def run_delivery_tracking():
                      """Run delivery tracking for each tenant in ACTIVE_TENANT_IDS.

                      Tenant IDs come from the comma-separated ACTIVE_TENANT_IDS
                      environment variable; when it is empty a single default
                      tenant ID is used. A failure for one tenant is logged and
                      the loop continues with the next tenant.
                      """
                      from shared.redis_utils import initialize_redis, get_redis_client

                      config = settings  # Use the global settings instance
                      db_manager = create_database_manager(config.DATABASE_URL, "orchestrator")

                      try:
                          # Initialize Redis - This is an async function
                          await initialize_redis(config.REDIS_URL, db=2, max_connections=10)  # Using db 2 for orchestrator
                          redis_client = await get_redis_client()
                      except Exception as e:
                          logger.error("Failed to initialize Redis", error=str(e))
                          raise

                      try:
                          # NOTE(review): the client is constructed but never explicitly
                          # connected or closed in this script — confirm the service
                          # handles the connection lifecycle.
                          rabbitmq_client = RabbitMQClient(config.RABBITMQ_URL, "delivery-tracking-job")
                          service = DeliveryTrackingService(
                              config=config,
                              db_manager=db_manager,
                              redis_client=redis_client,
                              rabbitmq_client=rabbitmq_client
                          )

                          logger.info("Starting delivery tracking job")

                          # Get active tenant IDs from environment variable
                          active_tenant_ids = os.environ.get('ACTIVE_TENANT_IDS', '')
                          if active_tenant_ids:
                              tenant_ids = [tid.strip() for tid in active_tenant_ids.split(',') if tid.strip()]
                          else:
                              tenant_ids = ['00000000-0000-0000-0000-000000000001']  # Default single tenant

                          for tenant_id in tenant_ids:
                              try:
                                  result = await service.check_expected_deliveries(tenant_id)
                                  logger.info("Delivery tracking completed", tenant_id=tenant_id, **result)
                              except Exception as e:
                                  # Per-tenant failures are logged only; they do not
                                  # change the job's exit status.
                                  logger.error("Delivery tracking failed", tenant_id=tenant_id, error=str(e))

                          logger.info("Delivery tracking job completed")
                      except Exception as e:
                          logger.error("Delivery tracking service error", error=str(e))
                          raise


                  if __name__ == "__main__":
                      asyncio.run(run_delivery_tracking())
              env:
                # Database connection
                - name: DATABASE_URL
                  valueFrom:
                    secretKeyRef:
                      name: database-secrets
                      key: ORCHESTRATOR_DATABASE_URL

                # Redis connection
                - name: REDIS_URL
                  valueFrom:
                    secretKeyRef:
                      name: database-secrets
                      key: REDIS_URL

                # NOTE(review): the script reads settings.RABBITMQ_URL, but no
                # RABBITMQ_URL variable is set here — confirm the settings
                # object supplies a usable default inside this pod.

                # Service URLs
                - name: ALERT_PROCESSOR_URL
                  value: "http://alert-processor-api:8000"
                - name: PROCUREMENT_SERVICE_URL
                  value: "http://procurement-service:8000"

                # Active tenants (comma-separated UUIDs)
                - name: ACTIVE_TENANT_IDS
                  value: "00000000-0000-0000-0000-000000000001"

                # Orchestrator settings
                - name: ORCHESTRATOR_CONTEXT_CACHE_TTL
                  value: "300"

                # Delivery tracking settings
                - name: ARRIVING_SOON_HOURS_BEFORE
                  value: "2"
                - name: OVERDUE_MINUTES_AFTER
                  value: "30"
                - name: DEFAULT_DELIVERY_WINDOW_HOURS
                  value: "4"

                # Service settings
                - name: LOG_LEVEL
                  value: "INFO"
                - name: PYTHONUNBUFFERED
                  value: "1"

              resources:
                requests:
                  memory: "256Mi"
                  cpu: "100m"
                limits:
                  memory: "512Mi"
                  cpu: "500m"
---
# ConfigMap describing the delivery-tracking schedule and its timing values.
# NOTE(review): the CronJob defined in this file sets the equivalent values
# directly in its env list and does not reference this ConfigMap — confirm a
# consumer exists, otherwise the two copies can drift apart.
apiVersion: v1
kind: ConfigMap
metadata:
  name: delivery-tracking-config
  namespace: bakery-ia
data:
  schedule: "Hourly at minute 30"
  description: "Checks expected deliveries and generates proactive alerts"
  arriving_soon_hours: "2"
  overdue_minutes: "30"
  delivery_window_hours: "4"

View File

@@ -0,0 +1,67 @@
# Job: seeds demo alerts by running /app/scripts/demo/seed_demo_alerts.py from
# the alert-processor image, after two init containers have waited for the
# alert-processor migration (fixed sleep) and API (HTTP health poll).
apiVersion: batch/v1
kind: Job
metadata:
  name: demo-seed-alerts
  namespace: bakery-ia
  labels:
    app: demo-seed
    component: initialization
  annotations:
    # NOTE(review): hook annotations are acted on by Helm only; if this
    # manifest is applied via kubectl/kustomize the ordering below is not
    # enforced — confirm the deployment path.
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "28"  # After orchestration runs (27), as alerts reference recent data
spec:
  ttlSecondsAfterFinished: 3600
  template:
    metadata:
      labels:
        app: demo-seed-alerts
    spec:
      initContainers:
        # Fixed 30-second delay; does not actually check migration status.
        - name: wait-for-alert-processor-migration
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              echo "Waiting 30 seconds for alert-processor-migration to complete..."
              sleep 30

        # Polls the API health endpoint every 5 seconds until it responds.
        # NOTE(review): the loop has no upper bound and the Job sets no
        # activeDeadlineSeconds, so this can wait indefinitely.
        # NOTE(review): port 8010 here vs. 8000 used for alert-processor-api
        # in the delivery-tracking CronJob — confirm which is correct.
        - name: wait-for-alert-processor-api
          image: curlimages/curl:latest
          command:
            - sh
            - -c
            - |
              echo "Waiting for alert-processor-api to be ready..."
              until curl -f http://alert-processor-api.bakery-ia.svc.cluster.local:8010/health > /dev/null 2>&1; do
                echo "alert-processor-api not ready yet, waiting..."
                sleep 5
              done
              echo "alert-processor-api is ready!"

      containers:
        - name: seed-alerts
          image: bakery/alert-processor:latest
          command: ["python", "/app/scripts/demo/seed_demo_alerts.py"]
          env:
            # Both variables are populated from the same secret key.
            - name: ALERT_PROCESSOR_DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: database-secrets
                  key: ALERT_PROCESSOR_DATABASE_URL
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: database-secrets
                  key: ALERT_PROCESSOR_DATABASE_URL
            - name: DEMO_MODE
              value: "production"
            - name: LOG_LEVEL
              value: "INFO"
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"

      restartPolicy: OnFailure
      serviceAccountName: demo-seed-sa

View File

@@ -62,6 +62,7 @@ resources:
- jobs/demo-seed-forecasts-job.yaml
- jobs/demo-seed-pos-configs-job.yaml
- jobs/demo-seed-orchestration-runs-job.yaml
- jobs/demo-seed-alerts-job.yaml
# External data initialization job (v2.0)
- jobs/external-data-init-job.yaml
@@ -70,6 +71,8 @@ resources:
- cronjobs/demo-cleanup-cronjob.yaml
- cronjobs/external-data-rotation-cronjob.yaml
- cronjobs/usage-tracker-cronjob.yaml
- cronjobs/alert-priority-recalculation-cronjob.yaml
- cronjobs/delivery-tracking-cronjob.yaml
# Infrastructure components
- components/databases/redis.yaml

View File

@@ -172,11 +172,11 @@ patches:
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "64Mi"
cpu: "25m"
memory: "512Mi"
cpu: "200m"
limits:
memory: "128Mi"
cpu: "100m"
memory: "1Gi"
cpu: "1000m"
- target:
group: apps
version: v1