From 4b4268d64064a944af2645d700db1d176a456425 Mon Sep 17 00:00:00 2001 From: Urtzi Alfaro Date: Sat, 23 Aug 2025 10:19:58 +0200 Subject: [PATCH] Add new alert architecture --- README.md | 136 --- docker-compose.yml | 47 +- frontend/src/components/alerts/AlertCard.tsx | 304 +++++++ .../src/components/alerts/AlertDashboard.tsx | 347 ++++++++ .../src/components/alerts/AlertFilters.tsx | 148 ++++ frontend/src/components/alerts/AlertStats.tsx | 102 +++ .../components/alerts/ConnectionStatus.tsx | 70 ++ frontend/src/hooks/useAlertStream.ts | 359 ++++++++ frontend/src/types/alerts.ts | 126 +++ .../dashboards/alert-system-dashboard.json | 644 ++++++++++++++ .../prometheus/rules/alert-system-rules.yml | 243 ++++++ migrations/001_create_alert_tables.sql | 197 +++++ services/alert_processor/Dockerfile | 26 + services/alert_processor/app/__init__.py | 1 + services/alert_processor/app/config.py | 49 ++ services/alert_processor/app/main.py | 360 ++++++++ services/alert_processor/requirements.txt | 12 + services/auth/README.md | 129 --- services/forecasting/README.md | 169 ---- services/inventory/app/main.py | 14 + .../app/services/inventory_alert_service.py | 710 ++++++++++++++++ services/inventory/requirements.txt | 6 +- services/notification/README.md | 321 ------- services/notification/app/api/sse_routes.py | 189 +++++ services/notification/app/main.py | 78 +- .../app/services/email_service.py | 22 +- .../app/services/notification_orchestrator.py | 279 ++++++ .../notification/app/services/sse_service.py | 256 ++++++ .../app/services/whatsapp_service.py | 58 +- services/notification/requirements.txt | 4 +- services/orders/README.md | 248 ------ services/orders/app/core/database.py | 4 +- services/pos/README.md | 138 --- services/production/README.md | 187 ---- services/production/app/main.py | 18 + .../app/services/production_alert_service.py | 795 ++++++++++++++++++ services/production/requirements.txt | 8 + services/training/README.md | 220 ----- shared/alerts/__init__.py | 1 + shared/alerts/base_service.py | 353 ++++++++ shared/alerts/templates.py | 218 +++++ shared/config/rabbitmq_config.py | 82 ++ shared/database/utils.py | 2 +- shared/monitoring/alert_metrics.py | 420 +++++++++ shared/notifications/__init__.py | 8 - 45 files changed, 6518 insertions(+), 1590 deletions(-) delete mode 100644 README.md create mode 100644 frontend/src/components/alerts/AlertCard.tsx create mode 100644 frontend/src/components/alerts/AlertDashboard.tsx create mode 100644 frontend/src/components/alerts/AlertFilters.tsx create mode 100644 frontend/src/components/alerts/AlertStats.tsx create mode 100644 frontend/src/components/alerts/ConnectionStatus.tsx create mode 100644 frontend/src/hooks/useAlertStream.ts create mode 100644 frontend/src/types/alerts.ts create mode 100644 infrastructure/monitoring/grafana/dashboards/alert-system-dashboard.json create mode 100644 infrastructure/monitoring/prometheus/rules/alert-system-rules.yml create mode 100644 migrations/001_create_alert_tables.sql create mode 100644 services/alert_processor/Dockerfile create mode 100644 services/alert_processor/app/__init__.py create mode 100644 services/alert_processor/app/config.py create mode 100644 services/alert_processor/app/main.py create mode 100644 services/alert_processor/requirements.txt delete mode 100644 services/auth/README.md delete mode 100644 services/forecasting/README.md create mode 100644 services/inventory/app/services/inventory_alert_service.py delete mode 100644 services/notification/README.md create mode 100644 
services/notification/app/api/sse_routes.py create mode 100644 services/notification/app/services/notification_orchestrator.py create mode 100644 services/notification/app/services/sse_service.py delete mode 100644 services/orders/README.md delete mode 100644 services/pos/README.md delete mode 100644 services/production/README.md create mode 100644 services/production/app/services/production_alert_service.py delete mode 100644 services/training/README.md create mode 100644 shared/alerts/__init__.py create mode 100644 shared/alerts/base_service.py create mode 100644 shared/alerts/templates.py create mode 100644 shared/config/rabbitmq_config.py create mode 100644 shared/monitoring/alert_metrics.py delete mode 100644 shared/notifications/__init__.py diff --git a/README.md b/README.md deleted file mode 100644 index 8817bb45..00000000 --- a/README.md +++ /dev/null @@ -1,136 +0,0 @@ -# Bakery Forecasting Platform - Microservices - -## Overview -AI-powered demand forecasting platform for bakeries in Madrid, Spain using microservices architecture. - -## Architecture -- **API Gateway**: Central entry point for all client requests -- **Auth Service**: User authentication and authorization -- **Training Service**: ML model training for demand forecasting -- **Forecasting Service**: Generate predictions using trained models -- **Data Service**: External data integration (weather, traffic, events) -- **Tenant Service**: Multi-tenant management -- **Notification Service**: Email and WhatsApp notifications - -## Quick Start - -### Prerequisites -- Docker and Docker Compose -- Python 3.11+ -- Node.js 18+ - -### Setup -```bash -# Run setup script (this script!) -./scripts/setup.sh - -# Start services -docker-compose up -d - -# Check service health -curl http://localhost:8000/health -``` - -### Services -- **Gateway**: http://localhost:8000 -- **API Docs**: http://localhost:8000/docs -- **Grafana**: http://localhost:3002 -- **Prometheus**: http://localhost:9090 -- **RabbitMQ Management**: http://localhost:15672 - -### Development - -#### Running Tests -```bash -./scripts/test.sh -``` - -#### Building Services -```bash -docker-compose build -``` - -#### Viewing Logs -```bash -# All services -docker-compose logs -f - -# Specific service -docker-compose logs -f auth-service -``` - -#### Service URLs (Development) -- Gateway: http://localhost:8000 -- Auth Service: http://localhost:8001 -- Training Service: http://localhost:8002 -- Forecasting Service: http://localhost:8003 -- Data Service: http://localhost:8004 -- Tenant Service: http://localhost:8005 -- Notification Service: http://localhost:8006 - -## Environment Variables - -Copy `.env.example` to `.env` and update the following: - -```bash -# External API Keys -AEMET_API_KEY=your-aemet-api-key -MADRID_OPENDATA_API_KEY=your-madrid-opendata-key - -# Email Configuration -SMTP_USER=your-email@gmail.com -SMTP_PASSWORD=your-email-password - -# WhatsApp API -WHATSAPP_API_KEY=your-whatsapp-api-key - -# JWT Secret (change in production!) 
-JWT_SECRET_KEY=your-super-secret-jwt-key-change-in-production -``` - -## Troubleshooting - -### Services won't start -```bash -# Check if ports are available -docker-compose ps -netstat -tulpn | grep :8000 - -# Restart services -docker-compose down -docker-compose up -d -``` - -### Database connection issues -```bash -# Check database containers -docker-compose logs auth-db -docker-compose logs training-db - -# Reset databases -docker-compose down -v -docker-compose up -d -``` - -### Service communication issues -```bash -# Check service health -curl http://localhost:8000/health -curl http://localhost:8001/health -curl http://localhost:8002/health - -# Check RabbitMQ -open http://localhost:15672 -# User: bakery, Password: forecast123 -``` - -## Next Steps - -1. **Configure External APIs**: Add your AEMET and Madrid Open Data API keys -2. **Test Authentication**: Register a user and test login -3. **Upload Sales Data**: Import historical sales data -4. **Train Models**: Start your first training job -5. **Generate Forecasts**: Create demand predictions - -## License -MIT License diff --git a/docker-compose.yml b/docker-compose.yml index c03f1bc4..45d3cbfc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -730,6 +730,43 @@ services: timeout: 10s retries: 3 + alert-processor: + build: + context: . + dockerfile: ./services/alert_processor/Dockerfile + args: + - ENVIRONMENT=${ENVIRONMENT} + - BUILD_DATE=${BUILD_DATE} + image: bakery/alert-processor:${IMAGE_TAG} + restart: unless-stopped + env_file: .env + depends_on: + redis: + condition: service_healthy + rabbitmq: + condition: service_healthy + notification-service: + condition: service_healthy + networks: + - bakery-network + volumes: + - log_storage:/app/logs + - ./services/alert_processor:/app + - ./shared:/app/shared + deploy: + replicas: 2 + resources: + limits: + memory: 512M + reservations: + memory: 256M + # No health check needed - this is a background worker service + # healthcheck: + # test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] + # interval: 30s + # timeout: 10s + # retries: 3 + inventory-service: build: context: . 
@@ -760,7 +797,7 @@ services: - ./services/inventory:/app - ./shared:/app/shared healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health/')"] interval: 30s timeout: 10s retries: 3 @@ -797,7 +834,7 @@ services: - ./services/recipes:/app - ./shared:/app/shared healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] interval: 30s timeout: 10s retries: 3 @@ -835,7 +872,7 @@ services: - ./services/suppliers:/app - ./shared:/app/shared healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] interval: 30s timeout: 10s retries: 3 @@ -911,7 +948,7 @@ services: - ./services/orders:/app - ./shared:/app/shared healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] interval: 30s timeout: 10s retries: 3 @@ -950,7 +987,7 @@ services: - ./services/production:/app - ./shared:/app/shared healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')"] interval: 30s timeout: 10s retries: 3 diff --git a/frontend/src/components/alerts/AlertCard.tsx b/frontend/src/components/alerts/AlertCard.tsx new file mode 100644 index 00000000..77ecab5e --- /dev/null +++ b/frontend/src/components/alerts/AlertCard.tsx @@ -0,0 +1,304 @@ +// frontend/src/components/alerts/AlertCard.tsx +/** + * Individual alert/recommendation card component + * Displays alert details with appropriate styling and actions + */ + +import React, { useState } from 'react'; +import { AlertItem, ItemSeverity, ItemType } from '../../types/alerts'; +import { formatDistanceToNow } from 'date-fns'; +import { es } from 'date-fns/locale'; + +interface AlertCardProps { + item: AlertItem; + onAcknowledge: (itemId: string) => void; + onResolve: (itemId: string) => void; + compact?: boolean; + showActions?: boolean; +} + +const getSeverityConfig = (severity: ItemSeverity, itemType: ItemType) => { + if (itemType === 'recommendation') { + switch (severity) { + case 'high': + return { + color: 'bg-blue-50 border-blue-200 text-blue-900', + icon: '💡', + badge: 'bg-blue-100 text-blue-800' + }; + case 'medium': + return { + color: 'bg-blue-50 border-blue-100 text-blue-800', + icon: '💡', + badge: 'bg-blue-50 text-blue-600' + }; + case 'low': + return { + color: 'bg-gray-50 border-gray-200 text-gray-700', + icon: '💡', + badge: 'bg-gray-100 text-gray-600' + }; + default: + return { + color: 'bg-blue-50 border-blue-200 text-blue-900', + icon: '💡', + badge: 'bg-blue-100 text-blue-800' + }; + } + } else { + switch (severity) { + case 'urgent': + return { + color: 'bg-red-50 border-red-300 text-red-900', + icon: '🚨', + badge: 'bg-red-100 text-red-800', + pulse: true + }; + case 'high': + return { + color: 'bg-orange-50 border-orange-200 text-orange-900', + icon: '⚠️', + badge: 'bg-orange-100 text-orange-800' + }; + case 'medium': + return { + color: 'bg-yellow-50 border-yellow-200 text-yellow-900', + icon: '🔔', + badge: 'bg-yellow-100 text-yellow-800' + }; + case 'low': + return { + color: 'bg-green-50 border-green-200 text-green-900', + 
icon: 'ℹ️', + badge: 'bg-green-100 text-green-800' + }; + default: + return { + color: 'bg-gray-50 border-gray-200 text-gray-700', + icon: '📋', + badge: 'bg-gray-100 text-gray-600' + }; + } + } +}; + +const getStatusConfig = (status: string) => { + switch (status) { + case 'acknowledged': + return { + color: 'bg-blue-100 text-blue-800', + label: 'Reconocido' + }; + case 'resolved': + return { + color: 'bg-green-100 text-green-800', + label: 'Resuelto' + }; + default: + return { + color: 'bg-gray-100 text-gray-800', + label: 'Activo' + }; + } +}; + +export const AlertCard: React.FC = ({ + item, + onAcknowledge, + onResolve, + compact = false, + showActions = true +}) => { + const [isExpanded, setIsExpanded] = useState(false); + const [actionLoading, setActionLoading] = useState(null); + + const severityConfig = getSeverityConfig(item.severity, item.item_type); + const statusConfig = getStatusConfig(item.status); + + const handleAction = async (action: () => void, actionType: string) => { + setActionLoading(actionType); + try { + await action(); + } finally { + setActionLoading(null); + } + }; + + const timeAgo = formatDistanceToNow(new Date(item.timestamp), { + addSuffix: true, + locale: es + }); + + return ( +
+ {/* Header */} +
+
+
+ {/* Icon and Type Badge */} +
+ {severityConfig.icon} +
+ +
+ {/* Title and Badges */} +
+
+

+ {item.title} +

+
+ + {item.item_type === 'alert' ? 'Alerta' : 'Recomendación'} - {item.severity} + + + {statusConfig.label} + + + {item.service} + +
+
+ + {/* Expand Button */} + {!compact && ( + + )} +
+ + {/* Message */} +

+ {item.message} +

+ + {/* Timestamp */} +

+ {timeAgo} • {new Date(item.timestamp).toLocaleString('es-ES')} +

+
+
+
+ + {/* Quick Actions */} + {showActions && item.status === 'active' && ( +
+ + + +
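{/* A minimal sketch of the quick actions, assuming they route through handleAction so the
    buttons can reflect actionLoading; labels and styling are assumptions, not from the patch. */}
<button
  onClick={() => handleAction(() => onAcknowledge(item.id), 'acknowledge')}
  disabled={actionLoading !== null}
>
  {actionLoading === 'acknowledge' ? 'Procesando...' : 'Reconocer'}
</button>
<button
  onClick={() => handleAction(() => onResolve(item.id), 'resolve')}
  disabled={actionLoading !== null}
>
  {actionLoading === 'resolve' ? 'Procesando...' : 'Resolver'}
</button>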
+ )} +
+ + {/* Expanded Details */} + {isExpanded && ( +
+ {/* Actions */} + {item.actions.length > 0 && ( +
+

Acciones sugeridas:

+
    + {item.actions.map((action, index) => ( +
  • + {action} +
  • + ))} +
+
+ )} + + {/* Metadata */} + {Object.keys(item.metadata).length > 0 && ( +
+

Detalles técnicos:

+
+ {Object.entries(item.metadata).map(([key, value]) => ( +
+ {key}:{' '} + + {typeof value === 'object' ? JSON.stringify(value) : String(value)} + +
+ ))} +
+
+ )} + + {/* Acknowledgment/Resolution Info */} + {(item.acknowledged_at || item.resolved_at) && ( +
+ {item.acknowledged_at && ( +

+ Reconocido: {new Date(item.acknowledged_at).toLocaleString('es-ES')} + {item.acknowledged_by && ` por ${item.acknowledged_by}`} +

+ )} + {item.resolved_at && ( +

+ Resuelto: {new Date(item.resolved_at).toLocaleString('es-ES')} + {item.resolved_by && ` por ${item.resolved_by}`} +

+ )} +
+ )} +
+ )} +
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/alerts/AlertDashboard.tsx b/frontend/src/components/alerts/AlertDashboard.tsx new file mode 100644 index 00000000..32df033c --- /dev/null +++ b/frontend/src/components/alerts/AlertDashboard.tsx @@ -0,0 +1,347 @@ +// frontend/src/components/alerts/AlertDashboard.tsx +/** + * Main dashboard component for alerts and recommendations + * Provides filtering, bulk actions, and real-time updates + */ + +import React, { useState, useEffect, useMemo } from 'react'; +import { AlertItem, ItemFilters, ItemType, ItemSeverity, ItemStatus } from '../../types/alerts'; +import { useAlertStream } from '../../hooks/useAlertStream'; +import { AlertCard } from './AlertCard'; +import { AlertFilters } from './AlertFilters'; +import { AlertStats } from './AlertStats'; +import { ConnectionStatus } from './ConnectionStatus'; +import { useTenantId } from '../../hooks/useTenantId'; + +interface AlertDashboardProps { + className?: string; + maxItems?: number; + autoRequestNotifications?: boolean; +} + +export const AlertDashboard: React.FC = ({ + className = '', + maxItems = 50, + autoRequestNotifications = true +}) => { + const tenantId = useTenantId(); + const { + items, + connectionState, + urgentCount, + highCount, + recCount, + acknowledgeItem, + resolveItem, + notificationPermission, + requestNotificationPermission + } = useAlertStream({ tenantId }); + + const [filters, setFilters] = useState({ + item_type: 'all', + severity: 'all', + status: 'all', + service: 'all', + search: '' + }); + + const [selectedItems, setSelectedItems] = useState([]); + const [bulkActionsOpen, setBulkActionsOpen] = useState(false); + const [viewMode, setViewMode] = useState<'list' | 'compact'>('list'); + + // Request notification permission on mount if needed + useEffect(() => { + if (autoRequestNotifications && notificationPermission === 'default') { + // Delay request to avoid immediate popup + const timer = setTimeout(() => { + requestNotificationPermission(); + }, 2000); + return () => clearTimeout(timer); + } + }, [autoRequestNotifications, notificationPermission, requestNotificationPermission]); + + // Filter items based on current filters + const filteredItems = useMemo(() => { + let filtered = items; + + // Filter by type + if (filters.item_type !== 'all') { + filtered = filtered.filter(item => item.item_type === filters.item_type); + } + + // Filter by severity + if (filters.severity !== 'all') { + filtered = filtered.filter(item => item.severity === filters.severity); + } + + // Filter by status + if (filters.status !== 'all') { + filtered = filtered.filter(item => item.status === filters.status); + } + + // Filter by service + if (filters.service !== 'all') { + filtered = filtered.filter(item => item.service === filters.service); + } + + // Filter by search text + if (filters.search.trim()) { + const searchLower = filters.search.toLowerCase(); + filtered = filtered.filter(item => + item.title.toLowerCase().includes(searchLower) || + item.message.toLowerCase().includes(searchLower) || + item.type.toLowerCase().includes(searchLower) + ); + } + + return filtered.slice(0, maxItems); + }, [items, filters, maxItems]); + + // Get unique services for filter dropdown + const availableServices = useMemo(() => { + const services = [...new Set(items.map(item => item.service))].sort(); + return services; + }, [items]); + + // Handle bulk actions + const handleBulkAcknowledge = async () => { + await Promise.all(selectedItems.map(id => acknowledgeItem(id))); + 
setSelectedItems([]);
+    setBulkActionsOpen(false);
+  };
+
+  const handleBulkResolve = async () => {
+    await Promise.all(selectedItems.map(id => resolveItem(id)));
+    setSelectedItems([]);
+    setBulkActionsOpen(false);
+  };
+
+  const handleSelectAll = () => {
+    const selectableItems = filteredItems
+      .filter(item => item.status === 'active')
+      .map(item => item.id);
+    setSelectedItems(selectableItems);
+  };
+
+  const handleClearSelection = () => {
+    setSelectedItems([]);
+    setBulkActionsOpen(false);
+  };
+
+  const toggleItemSelection = (itemId: string) => {
+    setSelectedItems(prev =>
+      prev.includes(itemId)
+        ? prev.filter(id => id !== itemId)
+        : [...prev, itemId]
+    );
+  };
+
+  const activeItems = filteredItems.filter(item => item.status === 'active');
+  const hasSelection = selectedItems.length > 0;
+
+  return (
+
+ {/* Header */} +
+
+
+

+ Sistema de Alertas y Recomendaciones +

+

+ Monitoreo en tiempo real de operaciones de panadería +

+
+ + {/* Connection Status */} + +
+
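{/* A minimal sketch, assuming the header renders the connection indicator here and the
    stats strip under the "Stats" comment below; wrappers and classes are not shown in this hunk. */}
<ConnectionStatus connectionState={connectionState} />
{/* Presumed below: <AlertStats urgentCount={urgentCount} highCount={highCount}
    recCount={recCount} totalItems={filteredItems.length} activeItems={activeItems.length} /> */}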
+ + {/* Stats */} + + + {/* Notification Permission Banner */} + {notificationPermission === 'denied' && ( +
+
+
+ + + +
+
+

+ Notificaciones bloqueadas +

+

+ Las notificaciones del navegador están deshabilitadas. No recibirás alertas urgentes en tiempo real. +

+
+
+
+ )} + + {/* Filters and View Controls */} +
+
+ + +
+ {/* View Mode Toggle */} +
+ + +
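{/* A minimal sketch, assuming the toggle is a pair of buttons bound to setViewMode;
    the original labels and active-state styling are assumptions. */}
<button onClick={() => setViewMode('list')}>Lista</button>
<button onClick={() => setViewMode('compact')}>Compacta</button>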
+ + {/* Bulk Actions */} + {activeItems.length > 0 && ( +
+ +
+ )} +
+
+ + {/* Bulk Actions Panel */} + {bulkActionsOpen && activeItems.length > 0 && ( +
+
+
+ + {selectedItems.length} elementos seleccionados + + + +
+ + {hasSelection && ( +
+ + +
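{/* A minimal sketch, assuming the selection actions call the bulk handlers defined above;
    labels and styling are assumptions. */}
<button onClick={handleBulkAcknowledge}>Reconocer seleccionados</button>
<button onClick={handleBulkResolve}>Resolver seleccionados</button>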
+ )} +
+
+ )} +
+ + {/* Items List */} +
+ {filteredItems.length === 0 ? ( +
+ {items.length === 0 ? ( +
+ + + +

+ Sistema operativo +

+

+ No hay alertas activas en este momento. Todas las operaciones funcionan correctamente. +

+
+ ) : ( +
+ + + +

+ No se encontraron elementos +

+

+ Intenta ajustar los filtros para ver más elementos. +

+
+ )} +
+ ) : ( +
+ {filteredItems.map((item) => ( +
+ {/* Selection Checkbox */} + {bulkActionsOpen && item.status === 'active' && ( +
+ toggleItemSelection(item.id)} + className="h-4 w-4 text-blue-600 focus:ring-blue-500 border-gray-300 rounded" + /> +
+ )} + +
+ +
+
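{/* A minimal sketch, assuming the card itself is rendered here with the stream handlers;
    compact is presumed to follow the dashboard's view mode. */}
<AlertCard
  item={item}
  onAcknowledge={acknowledgeItem}
  onResolve={resolveItem}
  compact={viewMode === 'compact'}
/>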
+ ))} +
+ )} +
+
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/alerts/AlertFilters.tsx b/frontend/src/components/alerts/AlertFilters.tsx new file mode 100644 index 00000000..47867d98 --- /dev/null +++ b/frontend/src/components/alerts/AlertFilters.tsx @@ -0,0 +1,148 @@ +// frontend/src/components/alerts/AlertFilters.tsx +/** + * Filter controls for the alert dashboard + */ + +import React from 'react'; +import { ItemFilters, ItemType, ItemSeverity, ItemStatus } from '../../types/alerts'; + +interface AlertFiltersProps { + filters: ItemFilters; + onFiltersChange: (filters: ItemFilters) => void; + availableServices: string[]; +} + +export const AlertFilters: React.FC = ({ + filters, + onFiltersChange, + availableServices +}) => { + const updateFilter = (key: keyof ItemFilters, value: string) => { + onFiltersChange({ + ...filters, + [key]: value + }); + }; + + return ( +
+ {/* Search */} +
+ +
+
+ + + +
+ updateFilter('search', e.target.value)} + className="block w-full pl-10 pr-3 py-2 border border-gray-300 rounded-md leading-5 bg-white placeholder-gray-500 focus:outline-none focus:placeholder-gray-400 focus:ring-1 focus:ring-blue-500 focus:border-blue-500 sm:text-sm" + /> +
+
+ + {/* Type Filter */} +
+ + +
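{/* A minimal sketch, assuming each dropdown in this bar is a controlled select wired to
    updateFilter; the severity, status and service selects follow the same pattern.
    Option labels are assumptions. */}
<select
  value={filters.item_type}
  onChange={(e) => updateFilter('item_type', e.target.value)}
>
  <option value="all">Todos</option>
  <option value="alert">Alertas</option>
  <option value="recommendation">Recomendaciones</option>
</select>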
+ + {/* Severity Filter */} +
+ + +
+ + {/* Status Filter */} +
+ + +
+ + {/* Service Filter */} + {availableServices.length > 0 && ( +
+ + +
+ )} + + {/* Clear Filters */} + {(filters.search || filters.item_type !== 'all' || filters.severity !== 'all' || + filters.status !== 'all' || filters.service !== 'all') && ( + + )} +
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/alerts/AlertStats.tsx b/frontend/src/components/alerts/AlertStats.tsx new file mode 100644 index 00000000..e76e3afa --- /dev/null +++ b/frontend/src/components/alerts/AlertStats.tsx @@ -0,0 +1,102 @@ +// frontend/src/components/alerts/AlertStats.tsx +/** + * Statistics display for alerts and recommendations + */ + +import React from 'react'; + +interface AlertStatsProps { + urgentCount: number; + highCount: number; + recCount: number; + totalItems: number; + activeItems: number; +} + +export const AlertStats: React.FC = ({ + urgentCount, + highCount, + recCount, + totalItems, + activeItems +}) => { + const stats = [ + { + name: 'Alertas Urgentes', + value: urgentCount, + icon: '🚨', + color: urgentCount > 0 ? 'text-red-600' : 'text-gray-600', + bgColor: urgentCount > 0 ? 'bg-red-50' : 'bg-gray-50', + borderColor: urgentCount > 0 ? 'border-red-200' : 'border-gray-200' + }, + { + name: 'Alertas Altas', + value: highCount, + icon: '⚠️', + color: highCount > 0 ? 'text-orange-600' : 'text-gray-600', + bgColor: highCount > 0 ? 'bg-orange-50' : 'bg-gray-50', + borderColor: highCount > 0 ? 'border-orange-200' : 'border-gray-200' + }, + { + name: 'Recomendaciones', + value: recCount, + icon: '💡', + color: recCount > 0 ? 'text-blue-600' : 'text-gray-600', + bgColor: recCount > 0 ? 'bg-blue-50' : 'bg-gray-50', + borderColor: recCount > 0 ? 'border-blue-200' : 'border-gray-200' + }, + { + name: 'Total Activos', + value: activeItems, + icon: '📊', + color: 'text-gray-600', + bgColor: 'bg-gray-50', + borderColor: 'border-gray-200' + } + ]; + + return ( +
+
+
+ {stats.map((stat) => ( +
+
+ {stat.icon} + {stat.name} +
+
+ {stat.value} +
+ + {/* Pulse animation for urgent alerts */} + {stat.name === 'Alertas Urgentes' && urgentCount > 0 && ( +
+ )} +
+ ))} +
+ + {/* Summary text */} +
+ {totalItems === 0 ? ( +

+ + Todos los sistemas funcionan correctamente +

+ ) : ( +

+ Mostrando {totalItems} elementos total{totalItems !== 1 ? 'es' : ''} + {activeItems > 0 && ( + <>, {activeItems} activo{activeItems !== 1 ? 's' : ''} + )} +

+ )} +
+
+
+ ); +}; \ No newline at end of file diff --git a/frontend/src/components/alerts/ConnectionStatus.tsx b/frontend/src/components/alerts/ConnectionStatus.tsx new file mode 100644 index 00000000..9bd35c58 --- /dev/null +++ b/frontend/src/components/alerts/ConnectionStatus.tsx @@ -0,0 +1,70 @@ +// frontend/src/components/alerts/ConnectionStatus.tsx +/** + * Displays the current SSE connection status with appropriate styling + */ + +import React from 'react'; +import { SSEConnectionState } from '../../types/alerts'; + +interface ConnectionStatusProps { + connectionState: SSEConnectionState; +} + +export const ConnectionStatus: React.FC = ({ + connectionState +}) => { + const getStatusConfig = (state: SSEConnectionState) => { + switch (state.status) { + case 'connected': + return { + color: 'bg-green-100 text-green-800 border-green-200', + icon: '🟢', + label: 'Conectado', + description: 'Actualizaciones en tiempo real' + }; + case 'connecting': + return { + color: 'bg-yellow-100 text-yellow-800 border-yellow-200', + icon: '🟡', + label: 'Conectando...', + description: 'Estableciendo conexión' + }; + case 'error': + return { + color: 'bg-red-100 text-red-800 border-red-200', + icon: '🔴', + label: 'Error de conexión', + description: state.reconnectAttempts > 0 ? `Reintento ${state.reconnectAttempts}` : 'Fallo en la conexión' + }; + case 'disconnected': + default: + return { + color: 'bg-gray-100 text-gray-800 border-gray-200', + icon: '⚪', + label: 'Desconectado', + description: 'Sin actualizaciones en tiempo real' + }; + } + }; + + const config = getStatusConfig(connectionState); + + return ( +
+ {config.icon} +
+ {config.label} + {config.description} +
+ + {connectionState.status === 'connecting' && ( +
+ + + + +
+ )} +
+ ); +}; \ No newline at end of file diff --git a/frontend/src/hooks/useAlertStream.ts b/frontend/src/hooks/useAlertStream.ts new file mode 100644 index 00000000..a2ecf8a1 --- /dev/null +++ b/frontend/src/hooks/useAlertStream.ts @@ -0,0 +1,359 @@ +// frontend/src/hooks/useAlertStream.ts +/** + * React hook for managing SSE connection to alert and recommendation stream + * Handles connection management, reconnection, and real-time updates + */ + +import { useEffect, useState, useCallback, useRef } from 'react'; +import { AlertItem, ItemSeverity, ItemType, SSEConnectionState, NotificationPermission } from '../types/alerts'; +import { useAuth } from './useAuth'; + +interface UseAlertStreamProps { + tenantId: string; + autoConnect?: boolean; + maxReconnectAttempts?: number; +} + +interface UseAlertStreamReturn { + items: AlertItem[]; + connectionState: SSEConnectionState; + urgentCount: number; + highCount: number; + recCount: number; + acknowledgeItem: (itemId: string) => Promise; + resolveItem: (itemId: string) => Promise; + connect: () => void; + disconnect: () => void; + clearItems: () => void; + notificationPermission: NotificationPermission; + requestNotificationPermission: () => Promise; +} + +export const useAlertStream = ({ + tenantId, + autoConnect = true, + maxReconnectAttempts = 10 +}: UseAlertStreamProps): UseAlertStreamReturn => { + const [items, setItems] = useState([]); + const [connectionState, setConnectionState] = useState({ + status: 'disconnected', + reconnectAttempts: 0 + }); + const [notificationPermission, setNotificationPermission] = useState('default'); + + const eventSourceRef = useRef(null); + const reconnectTimeoutRef = useRef(); + const isManuallyDisconnected = useRef(false); + const { token } = useAuth(); + + // Initialize notification permission state + useEffect(() => { + if ('Notification' in window) { + setNotificationPermission(Notification.permission); + } + }, []); + + const requestNotificationPermission = useCallback(async (): Promise => { + if (!('Notification' in window)) { + return 'denied'; + } + + const permission = await Notification.requestPermission(); + setNotificationPermission(permission); + return permission; + }, []); + + const showBrowserNotification = useCallback((item: AlertItem) => { + if (notificationPermission !== 'granted') return; + + // Only show notifications for urgent/high alerts, not recommendations + if (item.item_type === 'recommendation') return; + if (!['urgent', 'high'].includes(item.severity)) return; + + const notification = new Notification(item.title, { + body: item.message, + icon: '/favicon.ico', + badge: '/badge-icon.png', + tag: item.id, + renotify: true, + requireInteraction: item.severity === 'urgent', + data: { + itemId: item.id, + itemType: item.item_type, + severity: item.severity + } + }); + + // Auto-close non-urgent notifications after 5 seconds + if (item.severity !== 'urgent') { + setTimeout(() => notification.close(), 5000); + } + + notification.onclick = () => { + window.focus(); + notification.close(); + // Could navigate to specific alert details + }; + }, [notificationPermission]); + + const playAlertSound = useCallback((severity: ItemSeverity) => { + // Only play sounds for urgent alerts + if (severity !== 'urgent') return; + + try { + const audio = new Audio('/sounds/alert-urgent.mp3'); + audio.volume = 0.5; + audio.play().catch(() => { + // Silently fail if audio can't play (user interaction required) + }); + } catch (error) { + console.warn('Could not play alert sound:', error); + } + }, []); + + 
const addAndSortItems = useCallback((newItem: AlertItem) => { + setItems(prev => { + // Prevent duplicates + if (prev.some(i => i.id === newItem.id)) return prev; + + const updated = [newItem, ...prev]; + + // Sort by severity weight, then by timestamp + const severityWeight = { urgent: 4, high: 3, medium: 2, low: 1 }; + + return updated.sort((a, b) => { + const weightDiff = severityWeight[b.severity] - severityWeight[a.severity]; + if (weightDiff !== 0) return weightDiff; + return new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime(); + }).slice(0, 100); // Keep only latest 100 items + }); + }, []); + + const connect = useCallback(() => { + if (!token || !tenantId) { + console.warn('Cannot connect to alert stream: missing token or tenantId'); + return; + } + + // Clean up existing connection + if (eventSourceRef.current) { + eventSourceRef.current.close(); + } + + isManuallyDisconnected.current = false; + setConnectionState(prev => ({ ...prev, status: 'connecting' })); + + // Create SSE connection + const url = `${process.env.REACT_APP_NOTIFICATION_SERVICE_URL || 'http://localhost:8002'}/api/v1/sse/alerts/stream/${tenantId}`; + + const eventSource = new EventSource(url, { + withCredentials: true + }); + + // Add auth header (if supported by browser) + if ('headers' in eventSource) { + (eventSource as any).headers = { + 'Authorization': `Bearer ${token}` + }; + } + + eventSource.onopen = () => { + setConnectionState(prev => ({ + ...prev, + status: 'connected', + lastConnected: new Date(), + reconnectAttempts: 0 + })); + console.log('Alert stream connected'); + }; + + eventSource.addEventListener('connected', (event) => { + console.log('Alert stream handshake completed:', event.data); + }); + + eventSource.addEventListener('initial_items', (event) => { + try { + const initialItems = JSON.parse(event.data); + setItems(initialItems); + console.log(`Loaded ${initialItems.length} initial items`); + } catch (error) { + console.error('Error parsing initial items:', error); + } + }); + + eventSource.addEventListener('alert', (event) => { + try { + const newItem = JSON.parse(event.data); + addAndSortItems(newItem); + + // Show browser notification for urgent/high alerts + showBrowserNotification(newItem); + + // Play sound for urgent alerts + if (newItem.severity === 'urgent') { + playAlertSound(newItem.severity); + } + + console.log('New alert received:', newItem.type, newItem.severity); + } catch (error) { + console.error('Error processing alert event:', error); + } + }); + + eventSource.addEventListener('recommendation', (event) => { + try { + const newItem = JSON.parse(event.data); + addAndSortItems(newItem); + console.log('New recommendation received:', newItem.type); + } catch (error) { + console.error('Error processing recommendation event:', error); + } + }); + + eventSource.addEventListener('ping', (event) => { + // Handle keepalive pings + console.debug('SSE keepalive ping received'); + }); + + eventSource.onerror = (error) => { + console.error('SSE error:', error); + setConnectionState(prev => ({ + ...prev, + status: 'error' + })); + + eventSource.close(); + + // Attempt reconnection with exponential backoff + if (!isManuallyDisconnected.current && + connectionState.reconnectAttempts < maxReconnectAttempts) { + + const backoffTime = Math.min(1000 * Math.pow(2, connectionState.reconnectAttempts), 30000); + + setConnectionState(prev => ({ + ...prev, + reconnectAttempts: prev.reconnectAttempts + 1 + })); + + console.log(`Reconnecting in ${backoffTime}ms (attempt 
${connectionState.reconnectAttempts + 1})`); + + reconnectTimeoutRef.current = setTimeout(() => { + connect(); + }, backoffTime); + } + }; + + eventSourceRef.current = eventSource; + }, [token, tenantId, connectionState.reconnectAttempts, maxReconnectAttempts, addAndSortItems, showBrowserNotification, playAlertSound]); + + const disconnect = useCallback(() => { + isManuallyDisconnected.current = true; + + if (eventSourceRef.current) { + eventSourceRef.current.close(); + eventSourceRef.current = null; + } + + if (reconnectTimeoutRef.current) { + clearTimeout(reconnectTimeoutRef.current); + } + + setConnectionState({ + status: 'disconnected', + reconnectAttempts: 0 + }); + }, []); + + const acknowledgeItem = useCallback(async (itemId: string) => { + try { + const response = await fetch( + `${process.env.REACT_APP_NOTIFICATION_SERVICE_URL || 'http://localhost:8002'}/api/v1/sse/items/${itemId}/acknowledge`, + { + method: 'POST', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json' + } + } + ); + + if (response.ok) { + setItems(prev => prev.map(item => + item.id === itemId + ? { ...item, status: 'acknowledged' as const, acknowledged_at: new Date().toISOString() } + : item + )); + } + } catch (error) { + console.error('Failed to acknowledge item:', error); + } + }, [token]); + + const resolveItem = useCallback(async (itemId: string) => { + try { + const response = await fetch( + `${process.env.REACT_APP_NOTIFICATION_SERVICE_URL || 'http://localhost:8002'}/api/v1/sse/items/${itemId}/resolve`, + { + method: 'POST', + headers: { + 'Authorization': `Bearer ${token}`, + 'Content-Type': 'application/json' + } + } + ); + + if (response.ok) { + setItems(prev => prev.map(item => + item.id === itemId + ? { ...item, status: 'resolved' as const, resolved_at: new Date().toISOString() } + : item + )); + } + } catch (error) { + console.error('Failed to resolve item:', error); + } + }, [token]); + + const clearItems = useCallback(() => { + setItems([]); + }, []); + + // Auto-connect on mount if enabled + useEffect(() => { + if (autoConnect && token && tenantId) { + connect(); + } + + return () => { + disconnect(); + }; + }, [autoConnect, token, tenantId]); // Don't include connect/disconnect to avoid loops + + // Calculate counts + const urgentCount = items.filter(i => + i.severity === 'urgent' && i.status === 'active' && i.item_type === 'alert' + ).length; + + const highCount = items.filter(i => + i.severity === 'high' && i.status === 'active' && i.item_type === 'alert' + ).length; + + const recCount = items.filter(i => + i.item_type === 'recommendation' && i.status === 'active' + ).length; + + return { + items, + connectionState, + urgentCount, + highCount, + recCount, + acknowledgeItem, + resolveItem, + connect, + disconnect, + clearItems, + notificationPermission, + requestNotificationPermission + }; +}; \ No newline at end of file diff --git a/frontend/src/types/alerts.ts b/frontend/src/types/alerts.ts new file mode 100644 index 00000000..31c9d24c --- /dev/null +++ b/frontend/src/types/alerts.ts @@ -0,0 +1,126 @@ +// frontend/src/types/alerts.ts +/** + * TypeScript types for the unified alert and recommendation system + */ + +export type ItemType = 'alert' | 'recommendation'; + +export type ItemSeverity = 'urgent' | 'high' | 'medium' | 'low'; + +export type ItemStatus = 'active' | 'acknowledged' | 'resolved'; + +export interface AlertItem { + id: string; + tenant_id: string; + item_type: ItemType; + type: string; // Specific alert/recommendation type + severity: 
ItemSeverity; + status: ItemStatus; + service: string; + title: string; + message: string; + actions: string[]; + metadata: Record; + created_at: string; + acknowledged_at?: string; + acknowledged_by?: string; + resolved_at?: string; + resolved_by?: string; + timestamp: string; +} + +export interface SSEEvent { + event: string; + data: string; + id?: string; +} + +export interface ItemFilters { + item_type: ItemType | 'all'; + severity: ItemSeverity | 'all'; + status: ItemStatus | 'all'; + service: string | 'all'; + search: string; +} + +export interface ItemCounts { + total: number; + alerts: { + urgent: number; + high: number; + medium: number; + low: number; + }; + recommendations: { + high: number; + medium: number; + low: number; + }; + by_status: { + active: number; + acknowledged: number; + resolved: number; + }; +} + +export interface NotificationSettings { + browser_notifications: boolean; + sound_enabled: boolean; + auto_acknowledge_timeout: number; // minutes + show_recommendations: boolean; + urgent_only: boolean; +} + +export interface SSEConnectionState { + status: 'connecting' | 'connected' | 'disconnected' | 'error'; + lastConnected?: Date; + reconnectAttempts: number; + latency?: number; +} + +// Notification permission states +export type NotificationPermission = 'default' | 'granted' | 'denied'; + +// UI state +export interface AlertUIState { + filters: ItemFilters; + selectedItems: string[]; + sortBy: 'created_at' | 'severity' | 'type'; + sortOrder: 'asc' | 'desc'; + viewMode: 'list' | 'grid' | 'compact'; + sidebarOpen: boolean; + bulkActionsOpen: boolean; +} + +// Action types for alert responses +export interface AlertAction { + id: string; + label: string; + type: 'acknowledge' | 'resolve' | 'custom'; + icon?: string; + variant?: 'primary' | 'secondary' | 'danger'; + requires_confirmation?: boolean; +} + +// Metrics for dashboard +export interface AlertMetrics { + response_time_avg: number; // seconds + false_positive_rate: number; + recommendation_adoption_rate: number; + items_last_24h: number; + top_alert_types: Array<{ + type: string; + count: number; + }>; + service_health: Record; +} + +// Template for creating new alerts (development/testing) +export interface AlertTemplate { + type: string; + severity: ItemSeverity; + title: string; + message: string; + actions: string[]; + metadata?: Record; +} \ No newline at end of file diff --git a/infrastructure/monitoring/grafana/dashboards/alert-system-dashboard.json b/infrastructure/monitoring/grafana/dashboards/alert-system-dashboard.json new file mode 100644 index 00000000..f78cd6be --- /dev/null +++ b/infrastructure/monitoring/grafana/dashboards/alert-system-dashboard.json @@ -0,0 +1,644 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Comprehensive monitoring dashboard for the Bakery Alert and Recommendation System", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(alert_items_published_total[5m])", + "interval": "", + "legendFormat": "{{item_type}} - {{severity}}", + "refId": "A" + } + ], + "title": "Alert/Recommendation Publishing Rate", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "sum(alert_sse_active_connections)", + "interval": "", + "legendFormat": "Active SSE Connections", + "refId": "A" + } + ], + "title": "Active SSE Connections", + "type": "gauge" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (item_type) (alert_items_published_total)", + "interval": "", + "legendFormat": "{{item_type}}", + "refId": "A" + } + ], + "title": "Items by Type", + "type": "piechart" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "right" + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "sum by (severity) (alert_items_published_total)", + "interval": "", + "legendFormat": "{{severity}}", + "refId": "A" + } + ], + "title": "Items by Severity", + "type": "piechart" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(alert_notifications_sent_total[5m])", + "interval": "", + "legendFormat": "{{channel}}", + "refId": "A" + } + ], + "title": "Notification Delivery Rate by Channel", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m]))", + "interval": "", + "legendFormat": "95th percentile", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, rate(alert_processing_duration_seconds_bucket[5m]))", + "interval": "", + "legendFormat": "50th percentile (median)", + "refId": "B" + } + ], + "title": "Processing Duration", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "vis": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "expr": "rate(alert_processing_errors_total[5m])", + 
"interval": "", + "legendFormat": "{{error_type}}", + "refId": "A" + }, + { + "expr": "rate(alert_delivery_failures_total[5m])", + "interval": "", + "legendFormat": "Delivery: {{channel}}", + "refId": "B" + } + ], + "title": "Error Rates", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Health" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "Unhealthy" + }, + "1": { + "color": "green", + "index": 1, + "text": "Healthy" + } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 8, + "options": { + "showHeader": true + }, + "pluginVersion": "8.0.0", + "targets": [ + { + "expr": "alert_system_component_health", + "format": "table", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "System Component Health", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "__name__": true, + "instance": true, + "job": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Health", + "component": "Component", + "service": "Service" + } + } + } + ], + "type": "table" + } + ], + "schemaVersion": 27, + "style": "dark", + "tags": [ + "bakery", + "alerts", + "recommendations", + "monitoring" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "Europe/Madrid", + "title": "Bakery Alert & Recommendation System", + "uid": "bakery-alert-system", + "version": 1 +} \ No newline at end of file diff --git a/infrastructure/monitoring/prometheus/rules/alert-system-rules.yml b/infrastructure/monitoring/prometheus/rules/alert-system-rules.yml new file mode 100644 index 00000000..c2d9f437 --- /dev/null +++ b/infrastructure/monitoring/prometheus/rules/alert-system-rules.yml @@ -0,0 +1,243 @@ +# infrastructure/monitoring/prometheus/rules/alert-system-rules.yml +# Prometheus alerting rules for the Bakery Alert and Recommendation System + +groups: + - name: alert_system_health + rules: + # System component health alerts + - alert: AlertSystemComponentDown + expr: alert_system_component_health == 0 + for: 2m + labels: + severity: critical + service: "{{ $labels.service }}" + component: "{{ $labels.component }}" + annotations: + summary: "Alert system component {{ $labels.component }} is unhealthy" + description: "Component {{ $labels.component }} in service {{ $labels.service }} has been unhealthy for more than 2 minutes." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#component-health" + + # Connection health alerts + - alert: RabbitMQConnectionDown + expr: alert_rabbitmq_connection_status == 0 + for: 1m + labels: + severity: critical + service: "{{ $labels.service }}" + annotations: + summary: "RabbitMQ connection down for {{ $labels.service }}" + description: "Service {{ $labels.service }} has lost connection to RabbitMQ for more than 1 minute." 
+ runbook_url: "https://docs.bakery.local/runbooks/alert-system#rabbitmq-connection" + + - alert: RedisConnectionDown + expr: alert_redis_connection_status == 0 + for: 1m + labels: + severity: critical + service: "{{ $labels.service }}" + annotations: + summary: "Redis connection down for {{ $labels.service }}" + description: "Service {{ $labels.service }} has lost connection to Redis for more than 1 minute." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#redis-connection" + + # Leader election issues + - alert: NoSchedulerLeader + expr: sum(alert_scheduler_leader_status) == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "No scheduler leader elected" + description: "No service has been elected as scheduler leader for more than 5 minutes. Scheduled checks may not be running." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#leader-election" + + - name: alert_system_performance + rules: + # High error rates + - alert: HighAlertProcessingErrorRate + expr: rate(alert_processing_errors_total[5m]) > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "High alert processing error rate" + description: "Alert processing error rate is {{ $value | humanizePercentage }} over the last 5 minutes." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-errors" + + - alert: HighNotificationDeliveryFailureRate + expr: rate(alert_delivery_failures_total[5m]) / rate(alert_notifications_sent_total[5m]) > 0.05 + for: 3m + labels: + severity: warning + channel: "{{ $labels.channel }}" + annotations: + summary: "High notification delivery failure rate for {{ $labels.channel }}" + description: "Notification delivery failure rate for {{ $labels.channel }} is {{ $value | humanizePercentage }} over the last 5 minutes." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#delivery-failures" + + # Processing latency + - alert: HighAlertProcessingLatency + expr: histogram_quantile(0.95, rate(alert_processing_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High alert processing latency" + description: "95th percentile alert processing latency is {{ $value }}s, exceeding 5s threshold." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#processing-latency" + + # SSE connection issues + - alert: TooManySSEConnections + expr: sum(alert_sse_active_connections) > 1000 + for: 2m + labels: + severity: warning + annotations: + summary: "Too many active SSE connections" + description: "Number of active SSE connections ({{ $value }}) exceeds 1000. This may impact performance." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-connections" + + - alert: SSEConnectionErrors + expr: rate(alert_sse_connection_errors_total[5m]) > 0.5 + for: 3m + labels: + severity: warning + annotations: + summary: "High SSE connection error rate" + description: "SSE connection error rate is {{ $value }} errors/second over the last 5 minutes." 
+ runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-errors" + + - name: alert_system_business + rules: + # Alert volume anomalies + - alert: UnusuallyHighAlertVolume + expr: rate(alert_items_published_total{item_type="alert"}[10m]) > 2 + for: 5m + labels: + severity: warning + service: "{{ $labels.service }}" + annotations: + summary: "Unusually high alert volume from {{ $labels.service }}" + description: "Service {{ $labels.service }} is generating alerts at {{ $value }} alerts/second, which is above normal levels." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#high-volume" + + - alert: NoAlertsGenerated + expr: rate(alert_items_published_total[30m]) == 0 + for: 15m + labels: + severity: warning + annotations: + summary: "No alerts generated recently" + description: "No alerts have been generated in the last 30 minutes. This may indicate a problem with detection systems." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#no-alerts" + + # Response time issues + - alert: SlowAlertResponseTime + expr: histogram_quantile(0.95, rate(alert_item_response_time_seconds_bucket[1h])) > 3600 + for: 10m + labels: + severity: warning + annotations: + summary: "Slow alert response times" + description: "95th percentile alert response time is {{ $value | humanizeDuration }}, exceeding 1 hour." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#response-times" + + # Critical alerts not acknowledged + - alert: CriticalAlertsUnacknowledged + expr: sum(alert_active_items_current{item_type="alert",severity="urgent"}) > 5 + for: 10m + labels: + severity: critical + annotations: + summary: "Multiple critical alerts unacknowledged" + description: "{{ $value }} critical alerts remain unacknowledged for more than 10 minutes." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#critical-unacked" + + - name: alert_system_capacity + rules: + # Queue size monitoring + - alert: LargeSSEMessageQueues + expr: alert_sse_message_queue_size > 100 + for: 5m + labels: + severity: warning + tenant_id: "{{ $labels.tenant_id }}" + annotations: + summary: "Large SSE message queue for tenant {{ $labels.tenant_id }}" + description: "SSE message queue for tenant {{ $labels.tenant_id }} has {{ $value }} messages, indicating potential client issues." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#sse-queues" + + # Database storage issues + - alert: SlowDatabaseStorage + expr: histogram_quantile(0.95, rate(alert_database_storage_duration_seconds_bucket[5m])) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Slow database storage for alerts" + description: "95th percentile database storage time is {{ $value }}s, exceeding 1s threshold." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#database-storage" + + - name: alert_system_effectiveness + rules: + # False positive rate monitoring + - alert: HighFalsePositiveRate + expr: alert_false_positive_rate > 0.2 + for: 30m + labels: + severity: warning + service: "{{ $labels.service }}" + alert_type: "{{ $labels.alert_type }}" + annotations: + summary: "High false positive rate for {{ $labels.alert_type }}" + description: "False positive rate for {{ $labels.alert_type }} in {{ $labels.service }} is {{ $value | humanizePercentage }}." 
+ runbook_url: "https://docs.bakery.local/runbooks/alert-system#false-positives" + + # Low recommendation adoption + - alert: LowRecommendationAdoption + expr: rate(alert_recommendations_implemented_total[24h]) / rate(alert_items_published_total{item_type="recommendation"}[24h]) < 0.1 + for: 1h + labels: + severity: info + service: "{{ $labels.service }}" + annotations: + summary: "Low recommendation adoption rate" + description: "Recommendation adoption rate for {{ $labels.service }} is {{ $value | humanizePercentage }} over the last 24 hours." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#recommendation-adoption" + +# Additional alerting rules for specific scenarios + - name: alert_system_critical_scenarios + rules: + # Complete system failure + - alert: AlertSystemDown + expr: up{job=~"alert-processor|notification-service"} == 0 + for: 1m + labels: + severity: critical + service: "{{ $labels.job }}" + annotations: + summary: "Alert system service {{ $labels.job }} is down" + description: "Critical alert system service {{ $labels.job }} has been down for more than 1 minute." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#service-down" + + # Data loss prevention + - alert: AlertDataNotPersisted + expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_database_storage_duration_seconds_count[5m]) == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Alert data not being persisted to database" + description: "Alerts are being processed but not stored in database, potential data loss." + runbook_url: "https://docs.bakery.local/runbooks/alert-system#data-persistence" + + # Notification blackhole + - alert: NotificationsNotDelivered + expr: rate(alert_items_processed_total[5m]) > 0 and rate(alert_notifications_sent_total[5m]) == 0 + for: 3m + labels: + severity: critical + annotations: + summary: "Notifications not being delivered" + description: "Alerts are being processed but no notifications are being sent." 
+ runbook_url: "https://docs.bakery.local/runbooks/alert-system#notification-delivery" \ No newline at end of file diff --git a/migrations/001_create_alert_tables.sql b/migrations/001_create_alert_tables.sql new file mode 100644 index 00000000..e4631f82 --- /dev/null +++ b/migrations/001_create_alert_tables.sql @@ -0,0 +1,197 @@ +-- migrations/001_create_alert_tables.sql +-- Database schema for unified alerts and recommendations system + +-- Main alerts table (stores both alerts and recommendations) +CREATE TABLE IF NOT EXISTS alerts ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id UUID NOT NULL, + item_type VARCHAR(20) NOT NULL DEFAULT 'alert' CHECK (item_type IN ('alert', 'recommendation')), + alert_type VARCHAR(50) NOT NULL, -- Specific type like 'critical_stock_shortage', 'inventory_optimization' + severity VARCHAR(20) NOT NULL CHECK (severity IN ('urgent', 'high', 'medium', 'low')), + status VARCHAR(20) NOT NULL DEFAULT 'active' CHECK (status IN ('active', 'acknowledged', 'resolved')), + service VARCHAR(50) NOT NULL, + + title VARCHAR(255) NOT NULL, + message TEXT NOT NULL, + actions JSONB DEFAULT '[]', + metadata JSONB DEFAULT '{}', + + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + acknowledged_at TIMESTAMP WITH TIME ZONE, + acknowledged_by UUID, + resolved_at TIMESTAMP WITH TIME ZONE, + resolved_by UUID, + + -- Add severity weight for sorting + severity_weight INT GENERATED ALWAYS AS ( + CASE severity + WHEN 'urgent' THEN 4 + WHEN 'high' THEN 3 + WHEN 'medium' THEN 2 + WHEN 'low' THEN 1 + END + ) STORED +); + +-- Indexes for performance +CREATE INDEX IF NOT EXISTS idx_alerts_tenant_status ON alerts(tenant_id, status); +CREATE INDEX IF NOT EXISTS idx_alerts_created_at ON alerts(created_at DESC); +CREATE INDEX IF NOT EXISTS idx_alerts_severity ON alerts(severity_weight DESC); +CREATE INDEX IF NOT EXISTS idx_alerts_tenant_active ON alerts(tenant_id, status) WHERE status = 'active'; +CREATE INDEX IF NOT EXISTS idx_alerts_item_type ON alerts(item_type); +CREATE INDEX IF NOT EXISTS idx_alerts_service ON alerts(service); + +-- Composite index for common queries +CREATE INDEX IF NOT EXISTS idx_alerts_tenant_type_status ON alerts(tenant_id, item_type, status); + +-- Alert history for audit trail (applies to both alerts and recommendations) +CREATE TABLE IF NOT EXISTS alert_history ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + alert_id UUID REFERENCES alerts(id) ON DELETE CASCADE, + tenant_id UUID NOT NULL, + action VARCHAR(50) NOT NULL, + performed_by UUID, + performed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + details JSONB DEFAULT '{}' +); + +CREATE INDEX IF NOT EXISTS idx_alert_history_alert ON alert_history(alert_id); +CREATE INDEX IF NOT EXISTS idx_alert_history_tenant ON alert_history(tenant_id); + +-- Database triggers for immediate alerts (recommendations typically not triggered this way) +-- Stock critical trigger +CREATE OR REPLACE FUNCTION notify_stock_critical() +RETURNS TRIGGER AS $$ +BEGIN + -- Only trigger for alerts, not recommendations + IF NEW.current_stock < NEW.minimum_stock AND + OLD.current_stock >= OLD.minimum_stock THEN + PERFORM pg_notify( + 'stock_alerts', + json_build_object( + 'tenant_id', NEW.tenant_id, + 'ingredient_id', NEW.id, + 'name', NEW.name, + 'current_stock', NEW.current_stock, + 'minimum_stock', NEW.minimum_stock, + 'alert_type', 'critical_stock_shortage' + )::text + ); + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Temperature breach trigger +CREATE OR REPLACE FUNCTION notify_temperature_breach() +RETURNS 
TRIGGER AS $$ +BEGIN + IF NEW.temperature > NEW.max_threshold AND + NEW.breach_duration_minutes > 30 THEN + PERFORM pg_notify( + 'temperature_alerts', + json_build_object( + 'tenant_id', NEW.tenant_id, + 'sensor_id', NEW.sensor_id, + 'location', NEW.location, + 'temperature', NEW.temperature, + 'duration', NEW.breach_duration_minutes, + 'alert_type', 'temperature_breach' + )::text + ); + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Production delay trigger +CREATE OR REPLACE FUNCTION notify_production_delay() +RETURNS TRIGGER AS $$ +BEGIN + IF NEW.status = 'delayed' AND OLD.status != 'delayed' THEN + PERFORM pg_notify( + 'production_alerts', + json_build_object( + 'tenant_id', NEW.tenant_id, + 'batch_id', NEW.id, + 'product_name', NEW.product_name, + 'planned_completion', NEW.planned_completion_time, + 'delay_minutes', EXTRACT(EPOCH FROM (NOW() - NEW.planned_completion_time))/60, + 'alert_type', 'production_delay' + )::text + ); + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +-- Create placeholder tables for triggers (these would exist in respective services) +-- This is just for reference - actual tables should be in service-specific migrations + +-- Inventory items table structure (for reference) +CREATE TABLE IF NOT EXISTS inventory_items ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id UUID NOT NULL, + name VARCHAR(255) NOT NULL, + current_stock DECIMAL(10,2) DEFAULT 0, + minimum_stock DECIMAL(10,2) DEFAULT 0, + maximum_stock DECIMAL(10,2), + unit VARCHAR(50) DEFAULT 'kg', + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +-- Temperature readings table structure (for reference) +CREATE TABLE IF NOT EXISTS temperature_readings ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id UUID NOT NULL, + sensor_id VARCHAR(100) NOT NULL, + location VARCHAR(255) NOT NULL, + temperature DECIMAL(5,2) NOT NULL, + max_threshold DECIMAL(5,2) DEFAULT 25.0, + breach_duration_minutes INT DEFAULT 0, + recorded_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +-- Production batches table structure (for reference) +CREATE TABLE IF NOT EXISTS production_batches ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id UUID NOT NULL, + product_name VARCHAR(255) NOT NULL, + status VARCHAR(50) DEFAULT 'planned', + planned_completion_time TIMESTAMP WITH TIME ZONE, + actual_completion_time TIMESTAMP WITH TIME ZONE, + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() +); + +-- Apply triggers (only if tables exist) +DO $$ +BEGIN + -- Stock critical trigger + IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'inventory_items') THEN + DROP TRIGGER IF EXISTS stock_critical_trigger ON inventory_items; + CREATE TRIGGER stock_critical_trigger + AFTER UPDATE ON inventory_items + FOR EACH ROW + EXECUTE FUNCTION notify_stock_critical(); + END IF; + + -- Temperature breach trigger + IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'temperature_readings') THEN + DROP TRIGGER IF EXISTS temperature_breach_trigger ON temperature_readings; + CREATE TRIGGER temperature_breach_trigger + AFTER INSERT OR UPDATE ON temperature_readings + FOR EACH ROW + EXECUTE FUNCTION notify_temperature_breach(); + END IF; + + -- Production delay trigger + IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'production_batches') THEN + DROP TRIGGER IF EXISTS production_delay_trigger ON production_batches; + CREATE TRIGGER production_delay_trigger + AFTER UPDATE ON production_batches + FOR 
EACH ROW + EXECUTE FUNCTION notify_production_delay(); + END IF; +END +$$; \ No newline at end of file diff --git a/services/alert_processor/Dockerfile b/services/alert_processor/Dockerfile new file mode 100644 index 00000000..207e0005 --- /dev/null +++ b/services/alert_processor/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements and install dependencies +COPY services/alert_processor/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy shared libraries +COPY shared/ /app/shared/ + +# Copy application code +COPY services/alert_processor/app/ /app/app/ + +# Create non-root user +RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app +USER appuser + +EXPOSE 8000 + +CMD ["python", "-m", "app.main"] \ No newline at end of file diff --git a/services/alert_processor/app/__init__.py b/services/alert_processor/app/__init__.py new file mode 100644 index 00000000..e8ff18d5 --- /dev/null +++ b/services/alert_processor/app/__init__.py @@ -0,0 +1 @@ +# Alert Processor Service \ No newline at end of file diff --git a/services/alert_processor/app/config.py b/services/alert_processor/app/config.py new file mode 100644 index 00000000..20ec06b6 --- /dev/null +++ b/services/alert_processor/app/config.py @@ -0,0 +1,49 @@ +# services/alert_processor/app/config.py +""" +Alert Processor Service Configuration +""" + +import os +from typing import List +from shared.config.base import BaseServiceSettings + +class AlertProcessorConfig(BaseServiceSettings): + """Configuration for Alert Processor Service""" + SERVICE_NAME: str = "alert-processor" + APP_NAME: str = "Alert Processor Service" + DESCRIPTION: str = "Central alert and recommendation processor" + + # Use the notification database for alert storage + # This makes sense since alerts and notifications are closely related + DATABASE_URL: str = os.getenv( + "NOTIFICATION_DATABASE_URL", + "postgresql+asyncpg://notification_user:notification_pass123@notification-db:5432/notification_db" + ) + + # Use dedicated Redis DB for alert processing + REDIS_DB: int = int(os.getenv("ALERT_PROCESSOR_REDIS_DB", "6")) + + # Alert processing configuration + BATCH_SIZE: int = int(os.getenv("ALERT_BATCH_SIZE", "10")) + PROCESSING_TIMEOUT: int = int(os.getenv("ALERT_PROCESSING_TIMEOUT", "30")) + + # Deduplication settings + ALERT_DEDUPLICATION_WINDOW_MINUTES: int = int(os.getenv("ALERT_DEDUPLICATION_WINDOW_MINUTES", "15")) + RECOMMENDATION_DEDUPLICATION_WINDOW_MINUTES: int = int(os.getenv("RECOMMENDATION_DEDUPLICATION_WINDOW_MINUTES", "60")) + + # Alert severity channel mappings (hardcoded for now to avoid config parsing issues) + @property + def urgent_channels(self) -> List[str]: + return ["whatsapp", "email", "push", "dashboard"] + + @property + def high_channels(self) -> List[str]: + return ["whatsapp", "email", "dashboard"] + + @property + def medium_channels(self) -> List[str]: + return ["email", "dashboard"] + + @property + def low_channels(self) -> List[str]: + return ["dashboard"] \ No newline at end of file diff --git a/services/alert_processor/app/main.py b/services/alert_processor/app/main.py new file mode 100644 index 00000000..dd847151 --- /dev/null +++ b/services/alert_processor/app/main.py @@ -0,0 +1,360 @@ +# services/alert_processor/app/main.py +""" +Alert Processor Service - Central hub for processing alerts and recommendations +Consumes from RabbitMQ, stores in database, 
and routes to notification service +""" + +import asyncio +import json +import signal +import sys +from datetime import datetime +from typing import Dict, Any +import structlog +import redis.asyncio as aioredis +from aio_pika import connect_robust, IncomingMessage, ExchangeType + +from app.config import AlertProcessorConfig +from shared.database.base import create_database_manager +from shared.clients.base_service_client import BaseServiceClient +from shared.config.rabbitmq_config import RABBITMQ_CONFIG + +# Setup logging +structlog.configure( + processors=[ + structlog.stdlib.filter_by_level, + structlog.stdlib.add_logger_name, + structlog.stdlib.add_log_level, + structlog.stdlib.PositionalArgumentsFormatter(), + structlog.processors.TimeStamper(fmt="ISO"), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.JSONRenderer() + ], + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + wrapper_class=structlog.stdlib.BoundLogger, + cache_logger_on_first_use=True, +) + +logger = structlog.get_logger() + + +class NotificationServiceClient(BaseServiceClient): + """Client for notification service""" + + def __init__(self, config: AlertProcessorConfig): + super().__init__("notification-service", config) + self.config = config + + def get_service_base_path(self) -> str: + """Return the base path for notification service APIs""" + return "/api/v1" + + async def send_notification(self, tenant_id: str, notification: Dict[str, Any], channels: list) -> Dict[str, Any]: + """Send notification via notification service""" + try: + response = await self.post( + "/api/v1/notifications/send", + json={ + "tenant_id": tenant_id, + "notification": notification, + "channels": channels + } + ) + return response + except Exception as e: + logger.error("Failed to send notification", error=str(e), tenant_id=tenant_id) + return {"status": "failed", "error": str(e)} + +class AlertProcessorService: + """ + Central service for processing and routing alerts and recommendations + Integrates with notification service for multi-channel delivery + """ + + def __init__(self, config: AlertProcessorConfig): + self.config = config + self.db_manager = create_database_manager(config.DATABASE_URL, "alert-processor") + self.notification_client = NotificationServiceClient(config) + self.redis = None + self.connection = None + self.channel = None + self.running = False + + # Metrics + self.items_processed = 0 + self.items_stored = 0 + self.notifications_sent = 0 + self.errors_count = 0 + + async def start(self): + """Start the alert processor service""" + try: + logger.info("Starting Alert Processor Service") + + # Connect to Redis for SSE publishing + self.redis = aioredis.from_url(self.config.REDIS_URL) + logger.info("Connected to Redis") + + # Connect to RabbitMQ + await self._setup_rabbitmq() + + # Start consuming messages + await self._start_consuming() + + self.running = True + logger.info("Alert Processor Service started successfully") + + except Exception as e: + logger.error("Failed to start Alert Processor Service", error=str(e)) + raise + + async def _setup_rabbitmq(self): + """Setup RabbitMQ connection and configuration""" + self.connection = await connect_robust( + self.config.RABBITMQ_URL, + heartbeat=30, + connection_attempts=5 + ) + self.channel = await self.connection.channel() + await self.channel.set_qos(prefetch_count=10) # Process 10 messages at a time + + # Setup exchange and queue based on config + exchange_config = 
RABBITMQ_CONFIG["exchanges"]["alerts"] + self.exchange = await self.channel.declare_exchange( + exchange_config["name"], + getattr(ExchangeType, exchange_config["type"].upper()), + durable=exchange_config["durable"] + ) + + queue_config = RABBITMQ_CONFIG["queues"]["alert_processing"] + self.queue = await self.channel.declare_queue( + queue_config["name"], + durable=queue_config["durable"], + arguments=queue_config["arguments"] + ) + + # Bind to all alert and recommendation routing keys + await self.queue.bind(self.exchange, routing_key="*.*.*") + + logger.info("RabbitMQ setup completed") + + async def _start_consuming(self): + """Start consuming messages from RabbitMQ""" + await self.queue.consume(self.process_item) + logger.info("Started consuming alert messages") + + async def process_item(self, message: IncomingMessage): + """Process incoming alert or recommendation""" + async with message.process(): + try: + # Parse message + item = json.loads(message.body.decode()) + + logger.info("Processing item", + item_type=item.get('item_type'), + alert_type=item.get('type'), + severity=item.get('severity'), + tenant_id=item.get('tenant_id')) + + # Store in database + stored_item = await self.store_item(item) + self.items_stored += 1 + + # Determine delivery channels based on severity and type + channels = self.get_channels_by_severity_and_type( + item['severity'], + item['item_type'] + ) + + # Send via notification service if channels are specified + if channels: + notification_result = await self.notification_client.send_notification( + tenant_id=item['tenant_id'], + notification={ + 'type': item['item_type'], # 'alert' or 'recommendation' + 'id': item['id'], + 'title': item['title'], + 'message': item['message'], + 'severity': item['severity'], + 'metadata': item.get('metadata', {}), + 'actions': item.get('actions', []), + 'email': item.get('email'), + 'phone': item.get('phone'), + 'user_id': item.get('user_id') + }, + channels=channels + ) + + if notification_result.get('status') == 'success': + self.notifications_sent += 1 + + # Stream to SSE for real-time dashboard (always) + await self.stream_to_sse(item['tenant_id'], stored_item) + + self.items_processed += 1 + + logger.info("Item processed successfully", + item_id=item['id'], + channels=len(channels)) + + except Exception as e: + self.errors_count += 1 + logger.error("Item processing failed", error=str(e)) + raise + + async def store_item(self, item: dict) -> dict: + """Store alert or recommendation in database""" + from sqlalchemy import text + + query = text(""" + INSERT INTO alerts ( + id, tenant_id, item_type, alert_type, severity, status, + service, title, message, actions, metadata, + created_at + ) VALUES (:id, :tenant_id, :item_type, :alert_type, :severity, :status, + :service, :title, :message, :actions, :metadata, :created_at) + RETURNING * + """) + + async with self.db_manager.get_session() as session: + result = await session.execute( + query, + { + 'id': item['id'], + 'tenant_id': item['tenant_id'], + 'item_type': item['item_type'], # 'alert' or 'recommendation' + 'alert_type': item['type'], + 'severity': item['severity'], + 'status': 'active', + 'service': item['service'], + 'title': item['title'], + 'message': item['message'], + 'actions': json.dumps(item.get('actions', [])), + 'metadata': json.dumps(item.get('metadata', {})), + 'created_at': item['timestamp'] + } + ) + + row = result.fetchone() + await session.commit() + + logger.debug("Item stored in database", item_id=item['id']) + return dict(row._mapping) + + async 
def stream_to_sse(self, tenant_id: str, item: dict): + """Publish item to Redis for SSE streaming""" + channel = f"alerts:{tenant_id}" + + # Prepare message for SSE + sse_message = { + 'id': item['id'], + 'item_type': item['item_type'], + 'type': item['alert_type'], + 'severity': item['severity'], + 'title': item['title'], + 'message': item['message'], + 'actions': json.loads(item['actions']) if isinstance(item['actions'], str) else item['actions'], + 'metadata': json.loads(item['metadata']) if isinstance(item['metadata'], str) else item['metadata'], + 'timestamp': item['created_at'].isoformat() if hasattr(item['created_at'], 'isoformat') else item['created_at'], + 'status': item['status'] + } + + # Publish to Redis channel for SSE + await self.redis.publish(channel, json.dumps(sse_message)) + + logger.debug("Item published to SSE", tenant_id=tenant_id, item_id=item['id']) + + def get_channels_by_severity_and_type(self, severity: str, item_type: str) -> list: + """Determine notification channels based on severity, type, and time""" + current_hour = datetime.now().hour + + channels = ['dashboard'] # Always include dashboard (SSE) + + if item_type == 'alert': + if severity == 'urgent': + # Urgent alerts: All channels immediately + channels.extend(['whatsapp', 'email', 'push']) + elif severity == 'high': + # High alerts: WhatsApp and email during extended hours + if 6 <= current_hour <= 22: + channels.extend(['whatsapp', 'email']) + else: + channels.append('email') # Email only during night + elif severity == 'medium': + # Medium alerts: Email during business hours + if 7 <= current_hour <= 20: + channels.append('email') + # Low severity: Dashboard only + + elif item_type == 'recommendation': + # Recommendations: Less urgent, limit channels and respect business hours + if severity in ['medium', 'high']: + if 8 <= current_hour <= 19: # Business hours for recommendations + channels.append('email') + # Low/urgent (rare for recs): Dashboard only + + return channels + + async def stop(self): + """Stop the alert processor service""" + self.running = False + logger.info("Stopping Alert Processor Service") + + try: + # Close RabbitMQ connection + if self.connection and not self.connection.is_closed: + await self.connection.close() + + # Close Redis connection + if self.redis: + await self.redis.close() + + logger.info("Alert Processor Service stopped") + + except Exception as e: + logger.error("Error stopping service", error=str(e)) + + def get_metrics(self) -> Dict[str, Any]: + """Get service metrics""" + return { + "items_processed": self.items_processed, + "items_stored": self.items_stored, + "notifications_sent": self.notifications_sent, + "errors_count": self.errors_count, + "running": self.running + } + +async def main(): + """Main entry point""" + config = AlertProcessorConfig() + service = AlertProcessorService(config) + + # Setup signal handlers for graceful shutdown + async def shutdown(): + logger.info("Received shutdown signal") + await service.stop() + sys.exit(0) + + # Register signal handlers + for sig in (signal.SIGTERM, signal.SIGINT): + signal.signal(sig, lambda s, f: asyncio.create_task(shutdown())) + + try: + # Start the service + await service.start() + + # Keep running + while service.running: + await asyncio.sleep(1) + + except KeyboardInterrupt: + logger.info("Received keyboard interrupt") + except Exception as e: + logger.error("Service failed", error=str(e)) + finally: + await service.stop() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff 
--git a/services/alert_processor/requirements.txt b/services/alert_processor/requirements.txt new file mode 100644 index 00000000..d506968f --- /dev/null +++ b/services/alert_processor/requirements.txt @@ -0,0 +1,12 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +aio-pika==9.3.1 +redis==5.0.1 +asyncpg==0.29.0 +sqlalchemy==2.0.23 +structlog==23.2.0 +prometheus-client==0.19.0 +pydantic-settings==2.1.0 +pydantic==2.5.2 +httpx==0.25.2 +python-jose[cryptography]==3.3.0 \ No newline at end of file diff --git a/services/auth/README.md b/services/auth/README.md deleted file mode 100644 index 51754588..00000000 --- a/services/auth/README.md +++ /dev/null @@ -1,129 +0,0 @@ -# ================================================================ -# services/auth/README.md -# ================================================================ -# Authentication Service - -Microservice for user authentication and authorization in the bakery forecasting platform. - -## Features - -- User registration and login -- JWT access and refresh tokens -- Password security validation -- Rate limiting and login attempt tracking -- Multi-tenant user management -- Session management -- Event publishing for user actions - -## Quick Start - -### Development - -```bash -# Start dependencies -docker-compose up -d auth-db redis rabbitmq - -# Install dependencies -pip install -r requirements.txt - -# Run migrations -alembic upgrade head - -# Start service -uvicorn app.main:app --reload --host 0.0.0.0 --port 8001 -``` - -### With Docker - -```bash -# Start everything -docker-compose up -d - -# View logs -docker-compose logs -f auth-service - -# Run tests -docker-compose exec auth-service pytest -``` - -## API Endpoints - -### Authentication -- `POST /api/v1/auth/register` - Register new user -- `POST /api/v1/auth/login` - User login -- `POST /api/v1/auth/refresh` - Refresh access token -- `POST /api/v1/auth/verify` - Verify token -- `POST /api/v1/auth/logout` - Logout user - -### User Management -- `GET /api/v1/users/me` - Get current user -- `PUT /api/v1/users/me` - Update current user -- `POST /api/v1/users/change-password` - Change password - -### Health -- `GET /health` - Health check -- `GET /metrics` - Prometheus metrics - -## Configuration - -Set these environment variables: - -```bash -DATABASE_URL=postgresql+asyncpg://auth_user:auth_pass123@auth-db:5432/auth_db -REDIS_URL=redis://redis:6379/0 -RABBITMQ_URL=amqp://bakery:forecast123@rabbitmq:5672/ -JWT_SECRET_KEY=your-super-secret-jwt-key-change-in-production -JWT_ACCESS_TOKEN_EXPIRE_MINUTES=30 -JWT_REFRESH_TOKEN_EXPIRE_DAYS=7 -MAX_LOGIN_ATTEMPTS=5 -LOCKOUT_DURATION_MINUTES=30 -``` - -## Testing - -```bash -# Run all tests -pytest - -# Run with coverage -pytest --cov=app - -# Run specific test file -pytest tests/test_auth.py -v -``` - -## Database Migrations - -```bash -# Create migration -alembic revision --autogenerate -m "description" - -# Apply migrations -alembic upgrade head - -# Rollback -alembic downgrade -1 -``` - -## Monitoring - -- Health endpoint: `/health` -- Metrics endpoint: `/metrics` (Prometheus format) -- Logs: Structured JSON logging -- Tracing: Request ID tracking - -## Security Features - -- Bcrypt password hashing -- JWT tokens with expiration -- Rate limiting on login attempts -- Account lockout protection -- IP and user agent tracking -- Token revocation support - -## Events Published - -- `user.registered` - When user registers -- `user.login` - When user logs in -- `user.logout` - When user logs out -- `user.password_changed` - When password 
changes \ No newline at end of file diff --git a/services/forecasting/README.md b/services/forecasting/README.md deleted file mode 100644 index ad0bcd25..00000000 --- a/services/forecasting/README.md +++ /dev/null @@ -1,169 +0,0 @@ - ================================================================ -# Documentation: services/forecasting/README.md -# ================================================================ - -# Forecasting Service - -AI-powered demand prediction service for bakery operations in Madrid, Spain. - -## Overview - -The Forecasting Service is a specialized microservice responsible for generating accurate demand predictions for bakery products. It integrates trained ML models with real-time weather and traffic data to provide actionable forecasts for business planning. - -## Features - -### Core Functionality -- **Single Product Forecasting**: Generate predictions for individual products -- **Batch Forecasting**: Process multiple products and time periods -- **Real-time Predictions**: On-demand forecasting with external data -- **Business Rules**: Spanish bakery-specific adjustments -- **Alert System**: Automated notifications for demand anomalies - -### Integration Points -- **Training Service**: Loads trained Prophet models -- **Data Service**: Retrieves weather and traffic data -- **Notification Service**: Sends alerts and reports -- **Gateway Service**: Authentication and request routing - -## API Endpoints - -### Forecasts -- `POST /api/v1/forecasts/single` - Generate single forecast -- `POST /api/v1/forecasts/batch` - Generate batch forecasts -- `GET /api/v1/forecasts/list` - List historical forecasts -- `GET /api/v1/forecasts/alerts` - Get forecast alerts -- `PUT /api/v1/forecasts/alerts/{id}/acknowledge` - Acknowledge alert - -### Predictions -- `POST /api/v1/predictions/realtime` - Real-time prediction -- `GET /api/v1/predictions/quick/{product}` - Quick multi-day forecast - -## Business Logic - -### Spanish Bakery Rules -- **Siesta Impact**: Reduced afternoon activity consideration -- **Weather Adjustments**: Rain reduces traffic, extreme temperatures affect product mix -- **Holiday Handling**: Spanish holiday calendar integration -- **Weekend Patterns**: Different demand patterns for weekends - -### Business Types -- **Individual Bakery**: Single location with direct sales -- **Central Workshop**: Production facility supplying multiple locations - -## Configuration - -### Environment Variables -```bash -# Database -DATABASE_URL=postgresql+asyncpg://user:pass@host:port/db - -# External Services -TRAINING_SERVICE_URL=http://training-service:8000 -DATA_SERVICE_URL=http://data-service:8000 - -# Business Rules -WEEKEND_ADJUSTMENT_FACTOR=0.8 -HOLIDAY_ADJUSTMENT_FACTOR=0.5 -RAIN_IMPACT_FACTOR=0.7 -``` - -### Performance Settings -```bash -MAX_FORECAST_DAYS=30 -PREDICTION_CACHE_TTL_HOURS=6 -FORECAST_BATCH_SIZE=100 -``` - -## Development - -### Setup -```bash -cd services/forecasting -pip install -r requirements.txt -``` - -### Testing -```bash -pytest tests/ -v --cov=app -``` - -### Running Locally -```bash -uvicorn app.main:app --reload --port 8000 -``` - -## Deployment - -### Docker -```bash -docker build -t forecasting-service . 
-docker run -p 8000:8000 forecasting-service -``` - -### Kubernetes -```bash -kubectl apply -f infrastructure/kubernetes/base/forecasting-service.yaml -``` - -## Monitoring - -### Metrics -- `forecasts_generated_total` - Total forecasts generated -- `predictions_served_total` - Total predictions served -- `forecast_processing_time_seconds` - Processing time histogram -- `active_models_count` - Number of active models - -### Health Checks -- `/health` - Service health status -- `/metrics` - Prometheus metrics endpoint - -## Performance - -### Benchmarks -- **Single Forecast**: < 2 seconds average -- **Batch Forecasting**: 100 products in < 30 seconds -- **Concurrent Load**: 95%+ success rate at 20 concurrent requests - -### Optimization -- Model caching for faster predictions -- Feature preparation optimization -- Database query optimization -- Asynchronous external API calls - -## Troubleshooting - -### Common Issues - -1. **No Model Found Error** - - Ensure training service has models for tenant/product - - Check model training logs in training service - -2. **High Prediction Latency** - - Monitor model cache hit rate - - Check external service response times - - Review database query performance - -3. **Inaccurate Predictions** - - Verify external data quality (weather/traffic) - - Check model performance metrics - - Review business rule configurations - -### Logging -```bash -# View service logs -docker logs forecasting-service - -# Debug level logging -LOG_LEVEL=DEBUG uvicorn app.main:app -``` - -## Contributing - -1. Follow the existing code structure and patterns -2. Add tests for new functionality -3. Update documentation for API changes -4. Ensure performance benchmarks are maintained - -## License - -This service is part of the Bakery Forecasting Platform - MIT License \ No newline at end of file diff --git a/services/inventory/app/main.py b/services/inventory/app/main.py index bd1bda32..3f513bfa 100644 --- a/services/inventory/app/main.py +++ b/services/inventory/app/main.py @@ -14,6 +14,7 @@ import structlog from app.core.config import settings from app.core.database import init_db, close_db from app.api import ingredients, stock, classification +from app.services.inventory_alert_service import InventoryAlertService from shared.monitoring.health import router as health_router from shared.monitoring.metrics import setup_metrics_early # Auth decorators are used in endpoints, no global setup needed @@ -32,6 +33,14 @@ async def lifespan(app: FastAPI): await init_db() logger.info("Database initialized successfully") + # Initialize alert service + alert_service = InventoryAlertService(settings) + await alert_service.start() + logger.info("Inventory alert service started") + + # Store alert service in app state + app.state.alert_service = alert_service + # Setup metrics is already done early - no need to do it here logger.info("Metrics setup completed") @@ -44,6 +53,11 @@ async def lifespan(app: FastAPI): # Shutdown logger.info("Shutting down Inventory Service") try: + # Stop alert service + if hasattr(app.state, 'alert_service'): + await app.state.alert_service.stop() + logger.info("Alert service stopped") + await close_db() logger.info("Database connections closed") except Exception as e: diff --git a/services/inventory/app/services/inventory_alert_service.py b/services/inventory/app/services/inventory_alert_service.py new file mode 100644 index 00000000..0d582c9e --- /dev/null +++ b/services/inventory/app/services/inventory_alert_service.py @@ -0,0 +1,710 @@ +# 
services/inventory/app/services/inventory_alert_service.py +""" +Inventory-specific alert and recommendation detection service +Implements hybrid detection patterns for critical stock issues and optimization opportunities +""" + +import asyncio +import json +from typing import List, Dict, Any, Optional +from uuid import UUID +from datetime import datetime, timedelta +import structlog +from apscheduler.triggers.cron import CronTrigger + +from shared.alerts.base_service import BaseAlertService, AlertServiceMixin +from shared.alerts.templates import format_item_message + +logger = structlog.get_logger() + +class InventoryAlertService(BaseAlertService, AlertServiceMixin): + """Inventory service alert and recommendation detection""" + + def setup_scheduled_checks(self): + """Inventory-specific scheduled checks for alerts and recommendations""" + + # Critical stock checks - every 5 minutes (alerts) + self.scheduler.add_job( + self.check_stock_levels, + CronTrigger(minute='*/5'), + id='stock_levels', + misfire_grace_time=30, + max_instances=1 + ) + + # Expiry checks - every 2 minutes (food safety critical, alerts) + self.scheduler.add_job( + self.check_expiring_products, + CronTrigger(minute='*/2'), + id='expiry_check', + misfire_grace_time=30, + max_instances=1 + ) + + # Temperature checks - every 2 minutes (alerts) + self.scheduler.add_job( + self.check_temperature_breaches, + CronTrigger(minute='*/2'), + id='temperature_check', + misfire_grace_time=30, + max_instances=1 + ) + + # Inventory optimization - every 30 minutes (recommendations) + self.scheduler.add_job( + self.generate_inventory_recommendations, + CronTrigger(minute='*/30'), + id='inventory_recs', + misfire_grace_time=120, + max_instances=1 + ) + + # Waste reduction analysis - every hour (recommendations) + self.scheduler.add_job( + self.generate_waste_reduction_recommendations, + CronTrigger(minute='0'), + id='waste_reduction_recs', + misfire_grace_time=300, + max_instances=1 + ) + + logger.info("Inventory alert schedules configured", + service=self.config.SERVICE_NAME) + + async def check_stock_levels(self): + """Batch check all stock levels for critical shortages (alerts)""" + try: + self._checks_performed += 1 + + query = """ + WITH stock_analysis AS ( + SELECT + i.*, + COALESCE(p.scheduled_quantity, 0) as tomorrow_needed, + COALESCE(s.avg_daily_usage, 0) as avg_daily_usage, + COALESCE(s.lead_time_days, 7) as lead_time_days, + CASE + WHEN i.current_stock < i.minimum_stock THEN 'critical' + WHEN i.current_stock < i.minimum_stock * 1.2 THEN 'low' + WHEN i.current_stock > i.maximum_stock THEN 'overstock' + ELSE 'normal' + END as status, + GREATEST(0, i.minimum_stock - i.current_stock) as shortage_amount + FROM inventory_items i + LEFT JOIN production_schedule p ON p.ingredient_id = i.id + AND p.date = CURRENT_DATE + INTERVAL '1 day' + LEFT JOIN supplier_items s ON s.ingredient_id = i.id + WHERE i.tenant_id = $1 AND i.active = true + ) + SELECT * FROM stock_analysis WHERE status != 'normal' + ORDER BY + CASE status + WHEN 'critical' THEN 1 + WHEN 'low' THEN 2 + WHEN 'overstock' THEN 3 + END, + shortage_amount DESC + """ + + tenants = await self.get_active_tenants() + + for tenant_id in tenants: + try: + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query), {"tenant_id": tenant_id}) + issues = result.fetchall() + + for issue in issues: + await self._process_stock_issue(tenant_id, issue) + + except Exception as e: + logger.error("Error checking stock for 
tenant", + tenant_id=str(tenant_id), + error=str(e)) + + logger.debug("Stock level check completed", + tenants_checked=len(tenants)) + + except Exception as e: + logger.error("Stock level check failed", error=str(e)) + self._errors_count += 1 + + async def _process_stock_issue(self, tenant_id: UUID, issue: Dict[str, Any]): + """Process individual stock issue""" + try: + if issue['status'] == 'critical': + # Critical stock shortage - immediate alert + template_data = self.format_spanish_message( + 'critical_stock_shortage', + ingredient_name=issue["name"], + current_stock=issue["current_stock"], + required_stock=issue["tomorrow_needed"] or issue["minimum_stock"], + shortage_amount=issue["shortage_amount"] + ) + + await self.publish_item(tenant_id, { + 'type': 'critical_stock_shortage', + 'severity': 'urgent', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'ingredient_id': str(issue['id']), + 'current_stock': float(issue['current_stock']), + 'minimum_stock': float(issue['minimum_stock']), + 'shortage_amount': float(issue['shortage_amount']), + 'tomorrow_needed': float(issue['tomorrow_needed'] or 0), + 'lead_time_days': issue['lead_time_days'] + } + }, item_type='alert') + + elif issue['status'] == 'low': + # Low stock - high priority alert + template_data = self.format_spanish_message( + 'critical_stock_shortage', + ingredient_name=issue["name"], + current_stock=issue["current_stock"], + required_stock=issue["minimum_stock"] + ) + + severity = self.get_business_hours_severity('high') + + await self.publish_item(tenant_id, { + 'type': 'low_stock_warning', + 'severity': severity, + 'title': f'⚠️ Stock Bajo: {issue["name"]}', + 'message': f'Stock actual {issue["current_stock"]}kg, mínimo {issue["minimum_stock"]}kg. Considerar pedido pronto.', + 'actions': ['Revisar consumo', 'Programar pedido', 'Contactar proveedor'], + 'metadata': { + 'ingredient_id': str(issue['id']), + 'current_stock': float(issue['current_stock']), + 'minimum_stock': float(issue['minimum_stock']) + } + }, item_type='alert') + + elif issue['status'] == 'overstock': + # Overstock - medium priority alert + severity = self.get_business_hours_severity('medium') + + await self.publish_item(tenant_id, { + 'type': 'overstock_warning', + 'severity': severity, + 'title': f'📦 Exceso de Stock: {issue["name"]}', + 'message': f'Stock actual {issue["current_stock"]}kg excede máximo {issue["maximum_stock"]}kg. 
Revisar para evitar caducidad.', + 'actions': ['Revisar caducidades', 'Aumentar producción', 'Ofertas especiales', 'Ajustar pedidos'], + 'metadata': { + 'ingredient_id': str(issue['id']), + 'current_stock': float(issue['current_stock']), + 'maximum_stock': float(issue['maximum_stock']) + } + }, item_type='alert') + + except Exception as e: + logger.error("Error processing stock issue", + ingredient_id=str(issue.get('id')), + error=str(e)) + + async def check_expiring_products(self): + """Check for products approaching expiry (alerts)""" + try: + self._checks_performed += 1 + + query = """ + SELECT + i.id, i.name, i.current_stock, i.tenant_id, + b.id as batch_id, b.expiry_date, b.quantity, + EXTRACT(days FROM (b.expiry_date - CURRENT_DATE)) as days_to_expiry + FROM inventory_items i + JOIN inventory_batches b ON b.ingredient_id = i.id + WHERE b.expiry_date <= CURRENT_DATE + INTERVAL '7 days' + AND b.quantity > 0 + AND b.status = 'active' + ORDER BY b.expiry_date ASC + """ + + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query)) + expiring_items = result.fetchall() + + # Group by tenant + by_tenant = {} + for item in expiring_items: + tenant_id = item['tenant_id'] + if tenant_id not in by_tenant: + by_tenant[tenant_id] = [] + by_tenant[tenant_id].append(item) + + for tenant_id, items in by_tenant.items(): + await self._process_expiring_items(tenant_id, items) + + except Exception as e: + logger.error("Expiry check failed", error=str(e)) + self._errors_count += 1 + + async def _process_expiring_items(self, tenant_id: UUID, items: List[Dict[str, Any]]): + """Process expiring items for a tenant""" + try: + # Group by urgency + expired = [i for i in items if i['days_to_expiry'] <= 0] + urgent = [i for i in items if 0 < i['days_to_expiry'] <= 2] + warning = [i for i in items if 2 < i['days_to_expiry'] <= 7] + + # Process expired products (urgent alerts) + if expired: + product_count = len(expired) + product_names = [i['name'] for i in expired[:3]] # First 3 names + if len(expired) > 3: + product_names.append(f"y {len(expired) - 3} más") + + template_data = self.format_spanish_message( + 'expired_products', + product_count=product_count, + product_names=", ".join(product_names) + ) + + await self.publish_item(tenant_id, { + 'type': 'expired_products', + 'severity': 'urgent', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'expired_items': [ + { + 'id': str(item['id']), + 'name': item['name'], + 'batch_id': str(item['batch_id']), + 'quantity': float(item['quantity']), + 'days_expired': abs(item['days_to_expiry']) + } for item in expired + ] + } + }, item_type='alert') + + # Process urgent expiry (high alerts) + if urgent: + for item in urgent: + await self.publish_item(tenant_id, { + 'type': 'urgent_expiry', + 'severity': 'high', + 'title': f'⏰ Caducidad Urgente: {item["name"]}', + 'message': f'{item["name"]} caduca en {item["days_to_expiry"]} día(s). 
Usar prioritariamente.', + 'actions': ['Usar inmediatamente', 'Promoción especial', 'Revisar recetas', 'Documentar'], + 'metadata': { + 'ingredient_id': str(item['id']), + 'batch_id': str(item['batch_id']), + 'days_to_expiry': item['days_to_expiry'], + 'quantity': float(item['quantity']) + } + }, item_type='alert') + + except Exception as e: + logger.error("Error processing expiring items", + tenant_id=str(tenant_id), + error=str(e)) + + async def check_temperature_breaches(self): + """Check for temperature breaches (alerts)""" + try: + self._checks_performed += 1 + + query = """ + SELECT + t.id, t.sensor_id, t.location, t.temperature, + t.max_threshold, t.tenant_id, + EXTRACT(minutes FROM (NOW() - t.first_breach_time)) as breach_duration_minutes + FROM temperature_readings t + WHERE t.temperature > t.max_threshold + AND t.breach_duration_minutes >= 30 -- Only after 30 minutes + AND t.last_alert_sent < NOW() - INTERVAL '15 minutes' -- Avoid spam + ORDER BY t.temperature DESC, t.breach_duration_minutes DESC + """ + + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query)) + breaches = result.fetchall() + + for breach in breaches: + await self._process_temperature_breach(breach) + + except Exception as e: + logger.error("Temperature check failed", error=str(e)) + self._errors_count += 1 + + async def _process_temperature_breach(self, breach: Dict[str, Any]): + """Process temperature breach""" + try: + # Determine severity based on duration and temperature + duration_minutes = breach['breach_duration_minutes'] + temp_excess = breach['temperature'] - breach['max_threshold'] + + if duration_minutes > 120 or temp_excess > 10: + severity = 'urgent' + elif duration_minutes > 60 or temp_excess > 5: + severity = 'high' + else: + severity = 'medium' + + template_data = self.format_spanish_message( + 'temperature_breach', + location=breach['location'], + temperature=breach['temperature'], + duration=duration_minutes + ) + + await self.publish_item(breach['tenant_id'], { + 'type': 'temperature_breach', + 'severity': severity, + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'sensor_id': breach['sensor_id'], + 'location': breach['location'], + 'temperature': float(breach['temperature']), + 'max_threshold': float(breach['max_threshold']), + 'duration_minutes': duration_minutes, + 'temperature_excess': temp_excess + } + }, item_type='alert') + + # Update last alert sent time to avoid spam + await self.db_manager.execute( + "UPDATE temperature_readings SET last_alert_sent = NOW() WHERE id = $1", + breach['id'] + ) + + except Exception as e: + logger.error("Error processing temperature breach", + sensor_id=breach.get('sensor_id'), + error=str(e)) + + async def generate_inventory_recommendations(self): + """Generate optimization recommendations based on usage patterns""" + try: + self._checks_performed += 1 + + # Analyze stock levels vs usage patterns + query = """ + WITH usage_analysis AS ( + SELECT + i.id, i.name, i.tenant_id, i.minimum_stock, i.maximum_stock, + i.current_stock, + AVG(sm.quantity) FILTER (WHERE sm.movement_type = 'out' + AND sm.created_at > CURRENT_DATE - INTERVAL '30 days') as avg_daily_usage, + COUNT(sm.id) FILTER (WHERE sm.movement_type = 'out' + AND sm.created_at > CURRENT_DATE - INTERVAL '30 days') as usage_days, + MAX(sm.created_at) FILTER (WHERE sm.movement_type = 'out') as last_used + FROM inventory_items i + LEFT JOIN 
stock_movements sm ON sm.ingredient_id = i.id + WHERE i.active = true AND i.tenant_id = $1 + GROUP BY i.id + HAVING COUNT(sm.id) FILTER (WHERE sm.movement_type = 'out' + AND sm.created_at > CURRENT_DATE - INTERVAL '30 days') >= 5 + ), + recommendations AS ( + SELECT *, + CASE + WHEN avg_daily_usage * 7 > maximum_stock THEN 'increase_max' + WHEN avg_daily_usage * 3 < minimum_stock THEN 'decrease_min' + WHEN current_stock / NULLIF(avg_daily_usage, 0) > 14 THEN 'reduce_stock' + WHEN avg_daily_usage > 0 AND minimum_stock / avg_daily_usage < 3 THEN 'increase_min' + ELSE null + END as recommendation_type + FROM usage_analysis + WHERE avg_daily_usage > 0 + ) + SELECT * FROM recommendations WHERE recommendation_type IS NOT NULL + ORDER BY avg_daily_usage DESC + """ + + tenants = await self.get_active_tenants() + + for tenant_id in tenants: + try: + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query), {"tenant_id": tenant_id}) + recommendations = result.fetchall() + + for rec in recommendations: + await self._generate_stock_recommendation(tenant_id, rec) + + except Exception as e: + logger.error("Error generating recommendations for tenant", + tenant_id=str(tenant_id), + error=str(e)) + + except Exception as e: + logger.error("Inventory recommendations failed", error=str(e)) + self._errors_count += 1 + + async def _generate_stock_recommendation(self, tenant_id: UUID, rec: Dict[str, Any]): + """Generate specific stock recommendation""" + try: + if not self.should_send_recommendation(tenant_id, rec['recommendation_type']): + return + + rec_type = rec['recommendation_type'] + + if rec_type == 'increase_max': + suggested_max = rec['avg_daily_usage'] * 10 # 10 days supply + template_data = self.format_spanish_message( + 'inventory_optimization', + ingredient_name=rec['name'], + period=30, + suggested_increase=suggested_max - rec['maximum_stock'] + ) + + await self.publish_item(tenant_id, { + 'type': 'inventory_optimization', + 'severity': 'medium', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'ingredient_id': str(rec['id']), + 'current_max': float(rec['maximum_stock']), + 'suggested_max': float(suggested_max), + 'avg_daily_usage': float(rec['avg_daily_usage']), + 'recommendation_type': rec_type + } + }, item_type='recommendation') + + elif rec_type == 'decrease_min': + suggested_min = rec['avg_daily_usage'] * 3 # 3 days safety stock + + await self.publish_item(tenant_id, { + 'type': 'inventory_optimization', + 'severity': 'low', + 'title': f'📉 Optimización de Stock Mínimo: {rec["name"]}', + 'message': f'Uso promedio sugiere reducir stock mínimo de {rec["minimum_stock"]}kg a {suggested_min:.1f}kg.', + 'actions': ['Revisar niveles mínimos', 'Analizar tendencias', 'Ajustar configuración'], + 'metadata': { + 'ingredient_id': str(rec['id']), + 'current_min': float(rec['minimum_stock']), + 'suggested_min': float(suggested_min), + 'avg_daily_usage': float(rec['avg_daily_usage']), + 'recommendation_type': rec_type + } + }, item_type='recommendation') + + except Exception as e: + logger.error("Error generating stock recommendation", + ingredient_id=str(rec.get('id')), + error=str(e)) + + async def generate_waste_reduction_recommendations(self): + """Generate waste reduction recommendations""" + try: + # Analyze waste patterns + query = """ + SELECT + i.id, i.name, i.tenant_id, + SUM(w.quantity) as total_waste_30d, + COUNT(w.id) as waste_incidents, + 
AVG(w.quantity) as avg_waste_per_incident, + w.waste_reason + FROM inventory_items i + JOIN waste_logs w ON w.ingredient_id = i.id + WHERE w.created_at > CURRENT_DATE - INTERVAL '30 days' + AND i.tenant_id = $1 + GROUP BY i.id, w.waste_reason + HAVING SUM(w.quantity) > 5 -- More than 5kg wasted + ORDER BY total_waste_30d DESC + """ + + tenants = await self.get_active_tenants() + + for tenant_id in tenants: + try: + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query), {"tenant_id": tenant_id}) + waste_data = result.fetchall() + + for waste in waste_data: + await self._generate_waste_recommendation(tenant_id, waste) + + except Exception as e: + logger.error("Error generating waste recommendations", + tenant_id=str(tenant_id), + error=str(e)) + + except Exception as e: + logger.error("Waste reduction recommendations failed", error=str(e)) + self._errors_count += 1 + + async def _generate_waste_recommendation(self, tenant_id: UUID, waste: Dict[str, Any]): + """Generate waste reduction recommendation""" + try: + waste_percentage = (waste['total_waste_30d'] / (waste['total_waste_30d'] + 100)) * 100 # Simplified calculation + + template_data = self.format_spanish_message( + 'waste_reduction', + product=waste['name'], + waste_reduction_percent=waste_percentage + ) + + await self.publish_item(tenant_id, { + 'type': 'waste_reduction', + 'severity': 'low', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'ingredient_id': str(waste['id']), + 'total_waste_30d': float(waste['total_waste_30d']), + 'waste_incidents': waste['waste_incidents'], + 'waste_reason': waste['waste_reason'], + 'estimated_reduction_percent': waste_percentage + } + }, item_type='recommendation') + + except Exception as e: + logger.error("Error generating waste recommendation", + ingredient_id=str(waste.get('id')), + error=str(e)) + + async def register_db_listeners(self, conn): + """Register inventory-specific database listeners""" + try: + await conn.add_listener('stock_alerts', self.handle_stock_db_alert) + await conn.add_listener('temperature_alerts', self.handle_temperature_db_alert) + + logger.info("Database listeners registered", + service=self.config.SERVICE_NAME) + except Exception as e: + logger.error("Failed to register database listeners", + service=self.config.SERVICE_NAME, + error=str(e)) + + async def handle_stock_db_alert(self, connection, pid, channel, payload): + """Handle stock alert from database trigger""" + try: + data = json.loads(payload) + tenant_id = UUID(data['tenant_id']) + + template_data = self.format_spanish_message( + 'critical_stock_shortage', + ingredient_name=data['name'], + current_stock=data['current_stock'], + required_stock=data['minimum_stock'] + ) + + await self.publish_item(tenant_id, { + 'type': 'critical_stock_shortage', + 'severity': 'urgent', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'ingredient_id': data['ingredient_id'], + 'current_stock': data['current_stock'], + 'minimum_stock': data['minimum_stock'], + 'trigger_source': 'database' + } + }, item_type='alert') + + except Exception as e: + logger.error("Error handling stock DB alert", error=str(e)) + + async def handle_temperature_db_alert(self, connection, pid, channel, payload): + """Handle temperature alert from database trigger""" + try: + data = json.loads(payload) + tenant_id = 
UUID(data['tenant_id']) + + template_data = self.format_spanish_message( + 'temperature_breach', + location=data['location'], + temperature=data['temperature'], + duration=data['duration'] + ) + + await self.publish_item(tenant_id, { + 'type': 'temperature_breach', + 'severity': 'high', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'sensor_id': data['sensor_id'], + 'location': data['location'], + 'temperature': data['temperature'], + 'duration': data['duration'], + 'trigger_source': 'database' + } + }, item_type='alert') + + except Exception as e: + logger.error("Error handling temperature DB alert", error=str(e)) + + async def start_event_listener(self): + """Listen for inventory-affecting events""" + try: + # Subscribe to order events that might affect inventory + await self.rabbitmq_client.consume_events( + "bakery_events", + f"inventory.orders.{self.config.SERVICE_NAME}", + "orders.placed", + self.handle_order_placed + ) + + logger.info("Event listeners started", + service=self.config.SERVICE_NAME) + except Exception as e: + logger.error("Failed to start event listeners", + service=self.config.SERVICE_NAME, + error=str(e)) + + async def handle_order_placed(self, message): + """Check if order critically affects stock""" + try: + order = json.loads(message.body) + tenant_id = UUID(order['tenant_id']) + + for item in order.get('items', []): + # Check stock impact + stock_info = await self.get_stock_after_order(item['ingredient_id'], item['quantity']) + + if stock_info and stock_info['remaining'] < stock_info['minimum_stock']: + await self.publish_item(tenant_id, { + 'type': 'stock_depleted_by_order', + 'severity': 'high', + 'title': f'⚠️ Pedido Agota Stock: {stock_info["name"]}', + 'message': f'Pedido #{order["id"]} dejará stock en {stock_info["remaining"]}kg (mínimo {stock_info["minimum_stock"]}kg)', + 'actions': ['Revisar pedido', 'Contactar proveedor', 'Ajustar producción', 'Usar stock reserva'], + 'metadata': { + 'order_id': order['id'], + 'ingredient_id': item['ingredient_id'], + 'order_quantity': item['quantity'], + 'remaining_stock': stock_info['remaining'], + 'minimum_stock': stock_info['minimum_stock'] + } + }, item_type='alert') + + except Exception as e: + logger.error("Error handling order placed event", error=str(e)) + + async def get_stock_after_order(self, ingredient_id: str, order_quantity: float) -> Optional[Dict[str, Any]]: + """Get stock information after hypothetical order""" + try: + query = """ + SELECT id, name, current_stock, minimum_stock, + (current_stock - $2) as remaining + FROM inventory_items + WHERE id = $1 + """ + + result = await self.db_manager.fetchrow(query, ingredient_id, order_quantity) + return dict(result) if result else None + + except Exception as e: + logger.error("Error getting stock after order", + ingredient_id=ingredient_id, + error=str(e)) + return None \ No newline at end of file diff --git a/services/inventory/requirements.txt b/services/inventory/requirements.txt index 36d5b1b4..3b1b1048 100644 --- a/services/inventory/requirements.txt +++ b/services/inventory/requirements.txt @@ -30,8 +30,12 @@ passlib[bcrypt]==1.7.4 structlog==23.2.0 prometheus-client==0.19.0 -# Message queues +# Message queues and Redis aio-pika==9.3.1 +redis>=4.0.0 + +# Scheduling +APScheduler==3.10.4 # Additional for inventory management python-barcode==0.15.1 diff --git a/services/notification/README.md b/services/notification/README.md deleted file mode 100644 index 
bba9a03e..00000000 --- a/services/notification/README.md +++ /dev/null @@ -1,321 +0,0 @@ -## 🎯 **Complete Notification Service Implementation** - -### **📁 File Structure Created** - -``` -services/notification/ -├── app/ -│ ├── main.py ✅ Complete FastAPI application -│ ├── core/ -│ │ ├── config.py ✅ Configuration settings -│ │ └── database.py ✅ Database initialization -│ ├── models/ -│ │ ├── notifications.py ✅ Core notification models -│ │ └── templates.py ✅ Template-specific models -│ ├── schemas/ -│ │ └── notifications.py ✅ Pydantic schemas -│ ├── services/ -│ │ ├── notification_service.py ✅ Main business logic -│ │ ├── email_service.py ✅ Email delivery -│ │ ├── whatsapp_service.py ✅ WhatsApp delivery -│ │ └── messaging.py ✅ RabbitMQ integration -│ └── api/ -│ └── notifications.py ✅ Complete API routes -├── requirements.txt ✅ Python dependencies -├── Dockerfile ✅ Container configuration -└── .env.example ✅ Environment variables -``` - -### **🔧 Key Features Implemented** - -#### **1. Complete Business Logic** - -- ✅ **NotificationService**: Core orchestration of all notification operations -- ✅ **Multi-channel support**: Email, WhatsApp, Push (extensible) -- ✅ **Template processing**: Jinja2-based template rendering -- ✅ **Bulk notifications**: Batch processing with rate limiting -- ✅ **User preferences**: Granular notification controls -- ✅ **Scheduling**: Delayed notification delivery - -#### **2. Email Service Integration** - -- ✅ **SMTP support**: Configurable email providers (Gmail, SendGrid, etc -- ✅ **HTML + Text emails**: Rich email templates with fallbacks -- ✅ **Bulk email processing**: Rate-limited batch sending -- ✅ **Template system**: Pre-built Spanish templates for bakeries -- ✅ **Health checks**: SMTP connection monitoring -- ✅ **Attachment support**: File attachment capabilities - -#### **3. WhatsApp Service Integration** - -- ✅ **Twilio integration**: WhatsApp Business API support -- ✅ **Spanish phone formatting**: Automatic +34 country code handling -- ✅ **Template messages**: WhatsApp Business template support -- ✅ **Bulk WhatsApp**: Rate-limited batch messaging -- ✅ **Delivery status**: Webhook handling for delivery confirmations - -#### **4. Database Models & Schemas** - -- ✅ **Complete data model**: Notifications, templates, preferences, logs -- ✅ **Multi-tenant support**: Tenant-scoped notifications -- ✅ **Audit trail**: Detailed delivery attempt logging -- ✅ **Template management**: System and custom templates -- ✅ **User preferences**: Granular notification controls - -#### **5. API Integration with Gateway** - -- ✅ **Gateway authentication**: Uses shared auth decorators -- ✅ **Tenant isolation**: Automatic tenant scoping -- ✅ **Role-based access**: Admin/manager/user permissions -- ✅ **Complete CRUD**: Full notification management API -- ✅ **Webhook endpoints**: External delivery status handling - -#### **6. RabbitMQ Event Integration** - -- ✅ **Event consumers**: Listens for user registration, forecasts, training -- ✅ **Event publishers**: Publishes notification status events -- ✅ **Auto-notifications**: Triggers welcome emails, alerts, reports -- ✅ **Error handling**: Robust message processing with retry logic - -#### **7. 
Spanish Bakery Templates** - -- ✅ **Welcome email**: Professional onboarding email -- ✅ **Forecast alerts**: Demand variation notifications -- ✅ **Weekly reports**: Performance summary emails -- ✅ **Responsive HTML**: Mobile-optimized email designs -- ✅ **Spanish localization**: All content in Spanish - -### **🚀 Integration with Your Architecture** - -#### **Seamless Gateway Integration** - -```python -# Gateway already routes to notification service -app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"]) - -# Authentication handled by gateway middleware -# Tenant isolation automatic -# User context passed via headers -``` - -#### **Shared Library Usage** - -```python -# Uses your existing shared components -from shared.auth.decorators import get_current_user_dep, get_current_tenant_id_dep -from shared.messaging.rabbitmq import RabbitMQClient -from shared.monitoring.metrics import MetricsCollector -from shared.database.base import DatabaseManager -``` - -#### **Event-Driven Architecture** - -```python -# Automatic notifications triggered by: -# - User registration → Welcome email -# - Forecast alerts → Alert emails + WhatsApp -# - Training completion → Status notifications -# - Data imports → Import confirmations -``` - -### **📊 Production Features** - -#### **Health Monitoring** - -- ✅ **Database health checks**: Connection monitoring -- ✅ **SMTP health checks**: Email service validation -- ✅ **WhatsApp health checks**: API connectivity tests -- ✅ **Prometheus metrics**: Delivery rates, response times -- ✅ **Structured logging**: Comprehensive error tracking - -#### **Rate Limiting & Scaling** - -- ✅ **Email rate limits**: 1000/hour configurable -- ✅ **WhatsApp rate limits**: 100/hour (Twilio limits) -- ✅ **Batch processing**: Configurable batch sizes -- ✅ **Retry logic**: Automatic retry with exponential backoff -- ✅ **Queue management**: Background task processing - -#### **Security & Compliance** - -- ✅ **User consent**: Preference-based opt-in/out -- ✅ **Tenant isolation**: Multi-tenant data separation -- ✅ **GDPR compliance**: User data control -- ✅ **Rate limiting**: DoS protection -- ✅ **Input validation**: Pydantic schema validation - -### **🎯 Business-Specific Features** - -#### **Bakery Use Cases** - -```python -# Forecast alerts when demand varies >20% -# Daily production recommendations -# Weekly performance reports -# Stock shortage notifications -# Weather impact alerts -# Holiday/event notifications -``` - -#### **Spanish Localization** - -- ✅ **Spanish templates**: Native Spanish content -- ✅ **Madrid timezone**: Europe/Madrid default -- ✅ **Spanish phone format**: +34 prefix handling -- ✅ **Local business hours**: Quiet hours support -- ✅ **Cultural context**: Bakery-specific terminology - -### **🔄 How to Deploy** - -#### **1. Add to Docker Compose** - -```yaml -# Already integrated in your docker-compose.yml -notification-service: - build: ./services/notification - ports: - - "8006:8000" - environment: - - DATABASE_URL=postgresql+asyncpg://notification_user:notification_pass123@notification-db:5432/notification_db - depends_on: - - notification-db - - redis - - rabbitmq -``` - -#### **2. Environment Setup** - -```bash -# Copy environment template -cp services/notification/.env.example services/notification/.env - -# Configure email provider -SMTP_USER=your-email@gmail.com -SMTP_PASSWORD=your-app-password - -# Configure WhatsApp (optional) -WHATSAPP_API_KEY=your-twilio-sid:your-twilio-token -``` - -#### **3. 
Start Service** - -```bash -# Service starts automatically with -docker-compose up -d - -# Check health -curl http://localhost:8006/health - -# View API docs -open http://localhost:8006/docs -``` - -### **📈 API Usage Examples** - -#### **Send Welcome Email** - -```python -POST /api/v1/notifications/send -{ - "type": "email", - "recipient_email": "usuario@panaderia.com", - "template_id": "welcome_email", - "template_data": { - "user_name": "Juan Carlos", - "dashboard_url": "https://app.bakeryforecast.es/dashboard" - } -} -``` - -#### **Send Forecast Alert** - -```python -POST /api/v1/notifications/send -{ - "type": "email", - "template_id": "forecast_alert_email", - "template_data": { - "bakery_name": "Panadería San Miguel", - "product_name": "Pan integral", - "forecast_date": "2025-01-25", - "predicted_demand": 120, - "variation_percentage": 35, - "alert_message": "Aumento significativo esperado. Se recomienda incrementar producción." - }, - "broadcast": true, - "priority": "high" -} -``` - -#### **Update User Preferences** - -```python -PATCH /api/v1/notifications/preferences -{ - "email_alerts": true, - "whatsapp_enabled": false, - "quiet_hours_start": "22:00", - "quiet_hours_end": "08:00", - "language": "es" -} -``` - -### **🎉 Key Benefits** - -#### **✅ Production Ready** - -- Complete error handling and logging -- Health checks and monitoring -- Rate limiting and security -- Multi-tenant architecture -- Scalable event-driven design - -#### **✅ Business Focused** - -- Spanish bakery templates -- Madrid timezone/localization -- Forecast-specific notifications -- Professional email designs -- WhatsApp support for urgent alerts - -#### **✅ Developer Friendly** - -- Comprehensive API documentation -- Type-safe Pydantic schemas -- Async/await throughout -- Structured logging -- Easy testing and debugging - -#### **✅ Seamless Integration** - -- Uses your shared libraries -- Integrates with gateway auth -- Follows your architectural patterns -- Maintains tenant isolation -- Publishes events to RabbitMQ - -### **🚀 Next Steps** - -#### **Immediate (Week 2)** - -1. **Deploy the service**: Add to your docker-compose and start -2. **Configure SMTP**: Set up email provider credentials -3. **Test integration**: Send test notifications via API -4. **Event integration**: Verify RabbitMQ event handling - -#### **Production Optimization** - -1. **Email provider**: Consider SendGrid/Mailgun for production -2. **WhatsApp setup**: Configure Twilio Business API -3. **Template customization**: Add tenant-specific templates -4. **Analytics dashboard**: Add notification analytics to frontend - -### **💡 Advanced Features Ready for Extension** - -- ✅ **Push notifications**: Framework ready for mobile push -- ✅ **SMS support**: Easy to add SMS providers -- ✅ **A/B testing**: Template variant testing -- ✅ **Scheduled campaigns**: Marketing email campaigns -- ✅ **Analytics integration**: Detailed delivery analytics - -**This notification service is now a complete, production-ready microservice that fully integrates with your bakery forecasting platform! 
It handles all notification needs from welcome emails to urgent forecast alerts, with proper Spanish localization and bakery-specific templates.** 🎯 \ No newline at end of file diff --git a/services/notification/app/api/sse_routes.py b/services/notification/app/api/sse_routes.py new file mode 100644 index 00000000..437283c4 --- /dev/null +++ b/services/notification/app/api/sse_routes.py @@ -0,0 +1,189 @@ +# services/notification/app/api/sse_routes.py +""" +SSE routes for real-time alert and recommendation streaming +""" + +import asyncio +import json +from datetime import datetime +from typing import Optional +from fastapi import APIRouter, Request, Depends, HTTPException, BackgroundTasks +from sse_starlette.sse import EventSourceResponse +import structlog + +from shared.auth.decorators import get_current_user + +router = APIRouter(prefix="/sse", tags=["sse"]) +logger = structlog.get_logger() + +@router.get("/alerts/stream/{tenant_id}") +async def stream_alerts( + tenant_id: str, + request: Request, + background_tasks: BackgroundTasks, + current_user = Depends(get_current_user) +): + """ + SSE endpoint for real-time alert and recommendation streaming + Supports both alerts and recommendations through unified stream + """ + + # Verify user has access to this tenant + if not hasattr(current_user, 'has_access_to_tenant') or not current_user.has_access_to_tenant(tenant_id): + raise HTTPException(403, "Access denied to this tenant") + + # Get SSE service from app state + sse_service = getattr(request.app.state, 'sse_service', None) + if not sse_service: + raise HTTPException(500, "SSE service not available") + + async def event_generator(): + """Generate SSE events for the client""" + client_queue = asyncio.Queue(maxsize=100) # Limit queue size + + try: + # Register client + await sse_service.add_client(tenant_id, client_queue) + + logger.info("SSE client connected", + tenant_id=tenant_id, + user_id=getattr(current_user, 'id', 'unknown')) + + # Stream events + while True: + # Check if client disconnected + if await request.is_disconnected(): + logger.info("SSE client disconnected", tenant_id=tenant_id) + break + + try: + # Wait for events with timeout for keepalive + event = await asyncio.wait_for( + client_queue.get(), + timeout=30.0 + ) + + yield event + + except asyncio.TimeoutError: + # Send keepalive ping + yield { + "event": "ping", + "data": json.dumps({ + "timestamp": datetime.utcnow().isoformat(), + "status": "keepalive" + }), + "id": f"ping_{int(datetime.now().timestamp())}" + } + + except Exception as e: + logger.error("Error in SSE event generator", + tenant_id=tenant_id, + error=str(e)) + break + + except Exception as e: + logger.error("SSE connection error", + tenant_id=tenant_id, + error=str(e)) + finally: + # Clean up on disconnect + try: + await sse_service.remove_client(tenant_id, client_queue) + logger.info("SSE client cleanup completed", tenant_id=tenant_id) + except Exception as e: + logger.error("Error cleaning up SSE client", + tenant_id=tenant_id, + error=str(e)) + + return EventSourceResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", # Disable nginx buffering + } + ) + +@router.post("/items/{item_id}/acknowledge") +async def acknowledge_item( + item_id: str, + current_user = Depends(get_current_user) +): + """Acknowledge an alert or recommendation""" + try: + # This would update the database + # For now, just return success + + logger.info("Item 
acknowledged", + item_id=item_id, + user_id=getattr(current_user, 'id', 'unknown')) + + return { + "status": "success", + "item_id": item_id, + "acknowledged_by": getattr(current_user, 'id', 'unknown'), + "acknowledged_at": datetime.utcnow().isoformat() + } + + except Exception as e: + logger.error("Failed to acknowledge item", item_id=item_id, error=str(e)) + raise HTTPException(500, "Failed to acknowledge item") + +@router.post("/items/{item_id}/resolve") +async def resolve_item( + item_id: str, + current_user = Depends(get_current_user) +): + """Resolve an alert or recommendation""" + try: + # This would update the database + # For now, just return success + + logger.info("Item resolved", + item_id=item_id, + user_id=getattr(current_user, 'id', 'unknown')) + + return { + "status": "success", + "item_id": item_id, + "resolved_by": getattr(current_user, 'id', 'unknown'), + "resolved_at": datetime.utcnow().isoformat() + } + + except Exception as e: + logger.error("Failed to resolve item", item_id=item_id, error=str(e)) + raise HTTPException(500, "Failed to resolve item") + +@router.get("/status/{tenant_id}") +async def get_sse_status( + tenant_id: str, + current_user = Depends(get_current_user) +): + """Get SSE connection status for a tenant""" + + # Verify user has access to this tenant + if not hasattr(current_user, 'has_access_to_tenant') or not current_user.has_access_to_tenant(tenant_id): + raise HTTPException(403, "Access denied to this tenant") + + try: + # Get SSE service from app state + sse_service = getattr(request.app.state, 'sse_service', None) + if not sse_service: + return {"status": "unavailable", "message": "SSE service not initialized"} + + metrics = sse_service.get_metrics() + tenant_connections = len(sse_service.active_connections.get(tenant_id, set())) + + return { + "status": "available", + "tenant_id": tenant_id, + "connections": tenant_connections, + "total_connections": metrics["total_connections"], + "active_tenants": metrics["active_tenants"] + } + + except Exception as e: + logger.error("Failed to get SSE status", tenant_id=tenant_id, error=str(e)) + raise HTTPException(500, "Failed to get SSE status") \ No newline at end of file diff --git a/services/notification/app/main.py b/services/notification/app/main.py index a430f426..647bd67e 100644 --- a/services/notification/app/main.py +++ b/services/notification/app/main.py @@ -1,9 +1,9 @@ # ================================================================ -# services/notification/app/main.py - COMPLETE IMPLEMENTATION +# services/notification/app/main.py - ENHANCED WITH SSE SUPPORT # ================================================================ """ Notification Service Main Application -Handles email and WhatsApp notifications with full integration +Handles email, WhatsApp notifications and SSE for real-time alerts/recommendations """ import structlog @@ -15,7 +15,12 @@ from fastapi.responses import JSONResponse from app.core.config import settings from app.core.database import init_db from app.api.notifications import router as notification_router +from app.api.sse_routes import router as sse_router from app.services.messaging import setup_messaging, cleanup_messaging +from app.services.sse_service import SSEService +from app.services.notification_orchestrator import NotificationOrchestrator +from app.services.email_service import EmailService +from app.services.whatsapp_service import WhatsAppService from shared.monitoring import setup_logging, HealthChecker from shared.monitoring.metrics import 
setup_metrics_early @@ -30,8 +35,8 @@ health_checker = None # Create FastAPI app FIRST app = FastAPI( title="Bakery Notification Service", - description="Email and WhatsApp notification service for bakery forecasting platform", - version="1.0.0", + description="Email, WhatsApp and SSE notification service for bakery alerts and recommendations", + version="2.0.0", docs_url="/docs", redoc_url="/redoc" ) @@ -56,12 +61,36 @@ async def lifespan(app: FastAPI): await setup_messaging() logger.info("Messaging initialized") + # Initialize services + email_service = EmailService() + whatsapp_service = WhatsAppService() + + # Initialize SSE service + sse_service = SSEService(settings.REDIS_URL) + await sse_service.initialize() + logger.info("SSE service initialized") + + # Create orchestrator + orchestrator = NotificationOrchestrator( + email_service=email_service, + whatsapp_service=whatsapp_service, + sse_service=sse_service + ) + + # Store services in app state + app.state.orchestrator = orchestrator + app.state.sse_service = sse_service + app.state.email_service = email_service + app.state.whatsapp_service = whatsapp_service + # Register custom metrics (metrics_collector already exists) - metrics_collector.register_counter("notifications_sent_total", "Total notifications sent", labels=["type", "status"]) + metrics_collector.register_counter("notifications_sent_total", "Total notifications sent", labels=["type", "status", "channel"]) metrics_collector.register_counter("emails_sent_total", "Total emails sent", labels=["status"]) metrics_collector.register_counter("whatsapp_sent_total", "Total WhatsApp messages sent", labels=["status"]) + metrics_collector.register_counter("sse_events_sent_total", "Total SSE events sent", labels=["tenant", "event_type"]) metrics_collector.register_histogram("notification_processing_duration_seconds", "Time spent processing notifications") metrics_collector.register_gauge("notification_queue_size", "Current notification queue size") + metrics_collector.register_gauge("sse_active_connections", "Number of active SSE connections") # Setup health checker health_checker = HealthChecker("notification-service") @@ -93,14 +122,22 @@ async def lifespan(app: FastAPI): # Add WhatsApp service health check async def check_whatsapp_service(): try: - from app.services.whatsapp_service import WhatsAppService - whatsapp_service = WhatsAppService() return await whatsapp_service.health_check() except Exception as e: return f"WhatsApp service error: {e}" health_checker.add_check("whatsapp_service", check_whatsapp_service, timeout=10.0, critical=False) + # Add SSE service health check + async def check_sse_service(): + try: + metrics = sse_service.get_metrics() + return "healthy" if metrics["redis_connected"] else "Redis connection failed" + except Exception as e: + return f"SSE service error: {e}" + + health_checker.add_check("sse_service", check_sse_service, timeout=5.0, critical=True) + # Add messaging health check def check_messaging(): try: @@ -115,7 +152,7 @@ async def lifespan(app: FastAPI): # Store health checker in app state app.state.health_checker = health_checker - logger.info("Notification Service started successfully") + logger.info("Notification Service with SSE support started successfully") except Exception as e: logger.error(f"Failed to start Notification Service: {e}") @@ -126,10 +163,15 @@ async def lifespan(app: FastAPI): # Shutdown logger.info("Shutting down Notification Service...") try: + # Shutdown SSE service + if hasattr(app.state, 'sse_service'): + await 
app.state.sse_service.shutdown() + logger.info("SSE service shutdown completed") + await cleanup_messaging() logger.info("Messaging cleanup completed") except Exception as e: - logger.error(f"Error during messaging cleanup: {e}") + logger.error(f"Error during shutdown: {e}") # Set lifespan AFTER metrics setup app.router.lifespan_context = lifespan @@ -145,18 +187,30 @@ app.add_middleware( # Include routers app.include_router(notification_router, prefix="/api/v1", tags=["notifications"]) +app.include_router(sse_router, prefix="/api/v1", tags=["sse"]) # Health check endpoint @app.get("/health") async def health_check(): - """Comprehensive health check endpoint""" + """Comprehensive health check endpoint including SSE""" if health_checker: - return await health_checker.check_health() + health_result = await health_checker.check_health() + + # Add SSE metrics to health check + if hasattr(app.state, 'sse_service'): + try: + sse_metrics = app.state.sse_service.get_metrics() + health_result['sse_metrics'] = sse_metrics + except Exception as e: + health_result['sse_error'] = str(e) + + return health_result else: return { "service": "notification-service", "status": "healthy", - "version": "1.0.0" + "version": "2.0.0", + "features": ["email", "whatsapp", "sse", "alerts", "recommendations"] } # Metrics endpoint diff --git a/services/notification/app/services/email_service.py b/services/notification/app/services/email_service.py index d1575a19..5422f57d 100644 --- a/services/notification/app/services/email_service.py +++ b/services/notification/app/services/email_service.py @@ -276,14 +276,26 @@ class EmailService: # Test SMTP connection if self.smtp_ssl: + # Use implicit TLS/SSL connection (port 465 typically) server = aiosmtplib.SMTP(hostname=self.smtp_host, port=self.smtp_port, use_tls=True) + await server.connect() + # No need for starttls() when using implicit TLS else: + # Use plain connection, optionally upgrade with STARTTLS server = aiosmtplib.SMTP(hostname=self.smtp_host, port=self.smtp_port) - - await server.connect() - - if self.smtp_tls: - await server.starttls() + await server.connect() + + if self.smtp_tls: + # Try STARTTLS, but handle case where connection is already secure + try: + await server.starttls() + except Exception as starttls_error: + # If STARTTLS fails because connection is already using TLS, that's okay + if "already using TLS" in str(starttls_error) or "already secure" in str(starttls_error): + logger.debug("SMTP connection already secure, skipping STARTTLS") + else: + # Re-raise other STARTTLS errors + raise starttls_error await server.login(self.smtp_user, self.smtp_password) await server.quit() diff --git a/services/notification/app/services/notification_orchestrator.py b/services/notification/app/services/notification_orchestrator.py new file mode 100644 index 00000000..60ba56da --- /dev/null +++ b/services/notification/app/services/notification_orchestrator.py @@ -0,0 +1,279 @@ +# services/notification/app/services/notification_orchestrator.py +""" +Notification orchestrator for managing delivery across all channels +Includes SSE integration for real-time dashboard updates +""" + +from typing import List, Dict, Any +from datetime import datetime +import structlog + +from .email_service import EmailService +from .whatsapp_service import WhatsAppService +from .sse_service import SSEService + +logger = structlog.get_logger() + +class NotificationOrchestrator: + """ + Orchestrates delivery across all notification channels + Now includes SSE for real-time dashboard 
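The email health-check change above boils down to one rule: with implicit TLS the connection is already encrypted, so STARTTLS must be skipped. A hedged standalone sketch of that decision, using aiosmtplib as the service does; host and credentials are placeholders.

```python
# Hedged sketch of the SMTP connection policy: implicit TLS (typically port 465)
# connects encrypted and skips STARTTLS; plain connections (typically port 587)
# upgrade with STARTTLS before logging in.
import asyncio

import aiosmtplib


async def smtp_probe(host: str, port: int, user: str, password: str,
                     use_ssl: bool, use_starttls: bool) -> bool:
    if use_ssl:
        server = aiosmtplib.SMTP(hostname=host, port=port, use_tls=True)
        await server.connect()
    else:
        server = aiosmtplib.SMTP(hostname=host, port=port)
        await server.connect()
        if use_starttls:
            await server.starttls()
    await server.login(user, password)
    await server.quit()
    return True


# asyncio.run(smtp_probe("smtp.gmail.com", 465, "user@example.com", "app-password", True, False))
```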
updates, with support for recommendations + """ + + def __init__( + self, + email_service: EmailService, + whatsapp_service: WhatsAppService, + sse_service: SSEService, + push_service=None # Optional push service + ): + self.email_service = email_service + self.whatsapp_service = whatsapp_service + self.sse_service = sse_service + self.push_service = push_service + + async def send_notification( + self, + tenant_id: str, + notification: Dict[str, Any], + channels: List[str] + ) -> Dict[str, Any]: + """ + Send notification through specified channels + Channels can include: email, whatsapp, push, dashboard (SSE) + """ + results = {} + + # Always send to dashboard for visibility (SSE) + if 'dashboard' in channels or notification.get('type') in ['alert', 'recommendation']: + try: + await self.sse_service.send_item_notification( + tenant_id, + notification + ) + results['dashboard'] = {'status': 'sent', 'timestamp': datetime.utcnow().isoformat()} + logger.info("Item sent to dashboard via SSE", + tenant_id=tenant_id, + item_type=notification.get('type'), + item_id=notification.get('id')) + except Exception as e: + logger.error("Failed to send to dashboard", + tenant_id=tenant_id, + error=str(e)) + results['dashboard'] = {'status': 'failed', 'error': str(e)} + + # Send to email channel + if 'email' in channels: + try: + email_result = await self.email_service.send_notification_email( + to_email=notification.get('email'), + subject=notification.get('title'), + template_data={ + 'title': notification.get('title'), + 'message': notification.get('message'), + 'severity': notification.get('severity'), + 'item_type': notification.get('type'), + 'actions': notification.get('actions', []), + 'metadata': notification.get('metadata', {}), + 'timestamp': datetime.utcnow().isoformat() + }, + notification_type=notification.get('type', 'alert') + ) + results['email'] = email_result + except Exception as e: + logger.error("Failed to send email", + tenant_id=tenant_id, + error=str(e)) + results['email'] = {'status': 'failed', 'error': str(e)} + + # Send to WhatsApp channel + if 'whatsapp' in channels: + try: + whatsapp_result = await self.whatsapp_service.send_notification_message( + to_phone=notification.get('phone'), + message=self._format_whatsapp_message(notification), + notification_type=notification.get('type', 'alert') + ) + results['whatsapp'] = whatsapp_result + except Exception as e: + logger.error("Failed to send WhatsApp", + tenant_id=tenant_id, + error=str(e)) + results['whatsapp'] = {'status': 'failed', 'error': str(e)} + + # Send to push notification channel + if 'push' in channels and self.push_service: + try: + push_result = await self.push_service.send_notification( + user_id=notification.get('user_id'), + title=notification.get('title'), + body=notification.get('message'), + data={ + 'item_type': notification.get('type'), + 'severity': notification.get('severity'), + 'item_id': notification.get('id'), + 'metadata': notification.get('metadata', {}) + } + ) + results['push'] = push_result + except Exception as e: + logger.error("Failed to send push notification", + tenant_id=tenant_id, + error=str(e)) + results['push'] = {'status': 'failed', 'error': str(e)} + + # Log summary + successful_channels = [ch for ch, result in results.items() if result.get('status') == 'sent'] + failed_channels = [ch for ch, result in results.items() if result.get('status') == 'failed'] + + logger.info("Notification delivery completed", + tenant_id=tenant_id, + item_type=notification.get('type'), + 
item_id=notification.get('id'), + successful_channels=successful_channels, + failed_channels=failed_channels, + total_channels=len(channels)) + + return { + 'status': 'completed', + 'successful_channels': successful_channels, + 'failed_channels': failed_channels, + 'results': results, + 'timestamp': datetime.utcnow().isoformat() + } + + def _format_whatsapp_message(self, notification: Dict[str, Any]) -> str: + """Format message for WhatsApp with emojis and structure""" + item_type = notification.get('type', 'alert') + severity = notification.get('severity', 'medium') + + # Get appropriate emoji + type_emoji = '🚨' if item_type == 'alert' else '💡' + severity_emoji = { + 'urgent': '🔴', + 'high': '🟡', + 'medium': '🔵', + 'low': '🟢' + }.get(severity, '🔵') + + message = f"{type_emoji} {severity_emoji} *{notification.get('title', 'Notificación')}*\n\n" + message += f"{notification.get('message', '')}\n" + + # Add actions if available + actions = notification.get('actions', []) + if actions and len(actions) > 0: + message += "\n*Acciones sugeridas:*\n" + for i, action in enumerate(actions[:3], 1): # Limit to 3 actions for WhatsApp + message += f"{i}. {action}\n" + + # Add timestamp + message += f"\n_Enviado: {datetime.now().strftime('%H:%M, %d/%m/%Y')}_" + + return message + + def get_channels_by_severity(self, severity: str, item_type: str, hour: int = None) -> List[str]: + """ + Determine notification channels based on severity and item_type + Now includes 'dashboard' as a channel + """ + if hour is None: + hour = datetime.now().hour + + # Dashboard always gets all items + channels = ['dashboard'] + + if item_type == 'alert': + if severity == 'urgent': + # Urgent alerts: All channels immediately + channels.extend(['email', 'whatsapp', 'push']) + + elif severity == 'high': + # High alerts: Email and WhatsApp during extended hours + if 6 <= hour <= 22: + channels.extend(['email', 'whatsapp']) + else: + channels.append('email') # Email only during night + + elif severity == 'medium': + # Medium alerts: Email during business hours + if 7 <= hour <= 20: + channels.append('email') + + elif item_type == 'recommendation': + # Recommendations: Generally less urgent, respect business hours + if severity in ['medium', 'high']: + if 8 <= hour <= 19: # Stricter business hours for recommendations + channels.append('email') + # Low/urgent: Dashboard only (urgent rare for recommendations) + + return channels + + async def health_check(self) -> Dict[str, Any]: + """Check health of all notification channels""" + health_status = { + 'status': 'healthy', + 'channels': {}, + 'timestamp': datetime.utcnow().isoformat() + } + + # Check email service + try: + email_health = await self.email_service.health_check() + health_status['channels']['email'] = email_health + except Exception as e: + health_status['channels']['email'] = {'status': 'unhealthy', 'error': str(e)} + + # Check WhatsApp service + try: + whatsapp_health = await self.whatsapp_service.health_check() + health_status['channels']['whatsapp'] = whatsapp_health + except Exception as e: + health_status['channels']['whatsapp'] = {'status': 'unhealthy', 'error': str(e)} + + # Check SSE service + try: + sse_metrics = self.sse_service.get_metrics() + sse_status = 'healthy' if sse_metrics['redis_connected'] else 'unhealthy' + health_status['channels']['sse'] = { + 'status': sse_status, + 'metrics': sse_metrics + } + except Exception as e: + health_status['channels']['sse'] = {'status': 'unhealthy', 'error': str(e)} + + # Check push service if available + if 
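Putting the two orchestrator methods above together, a hypothetical dispatch helper could look like the sketch below; the payload fields mirror the ones the orchestrator reads, while the ids, addresses, and phone number are placeholders.

```python
# Hypothetical usage sketch: let severity and time of day pick the channels,
# then fan the alert out through the orchestrator stored in app.state.
from datetime import datetime
from typing import Any, Dict


async def dispatch_alert(orchestrator, tenant_id: str) -> Dict[str, Any]:
    alert = {
        "id": "alert-123",                             # placeholder id
        "type": "alert",
        "severity": "high",
        "title": "⚠️ Stock Bajo: Harina",
        "message": "Quedan 7kg (mínimo 10kg)",
        "actions": ["Contactar proveedor", "Ajustar producción"],
        "email": "gerente@panaderia.example",          # placeholder recipient
        "phone": "+34600000000",                       # placeholder recipient
        "metadata": {"ingredient_id": "..."},
    }
    channels = orchestrator.get_channels_by_severity(
        severity=alert["severity"],
        item_type=alert["type"],
        hour=datetime.now().hour,
    )
    return await orchestrator.send_notification(tenant_id, alert, channels)
```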
self.push_service: + try: + push_health = await self.push_service.health_check() + health_status['channels']['push'] = push_health + except Exception as e: + health_status['channels']['push'] = {'status': 'unhealthy', 'error': str(e)} + + # Determine overall status + unhealthy_channels = [ + ch for ch, status in health_status['channels'].items() + if status.get('status') != 'healthy' + ] + + if unhealthy_channels: + health_status['status'] = 'degraded' if len(unhealthy_channels) < len(health_status['channels']) else 'unhealthy' + health_status['unhealthy_channels'] = unhealthy_channels + + return health_status + + def get_metrics(self) -> Dict[str, Any]: + """Get aggregated metrics from all services""" + metrics = { + 'timestamp': datetime.utcnow().isoformat(), + 'channels': {} + } + + # Get SSE metrics + try: + metrics['channels']['sse'] = self.sse_service.get_metrics() + except Exception as e: + logger.error("Failed to get SSE metrics", error=str(e)) + + # Additional metrics could be added here for other services + + return metrics \ No newline at end of file diff --git a/services/notification/app/services/sse_service.py b/services/notification/app/services/sse_service.py new file mode 100644 index 00000000..e543af5f --- /dev/null +++ b/services/notification/app/services/sse_service.py @@ -0,0 +1,256 @@ +# services/notification/app/services/sse_service.py +""" +Server-Sent Events service for real-time notifications +Integrated within the notification service for alerts and recommendations +""" + +import asyncio +from redis.asyncio import Redis +import json +from typing import Dict, Set, Any +from datetime import datetime +import structlog + +logger = structlog.get_logger() + +class SSEService: + """ + Server-Sent Events service for real-time notifications + Handles both alerts and recommendations through unified SSE streams + """ + + def __init__(self, redis_url: str): + self.redis_url = redis_url + self.redis = None + self.active_connections: Dict[str, Set[asyncio.Queue]] = {} + self.pubsub_tasks: Dict[str, asyncio.Task] = {} + + async def initialize(self): + """Initialize Redis connection""" + try: + self.redis = Redis.from_url(self.redis_url) + logger.info("SSE Service initialized with Redis connection") + except Exception as e: + logger.error("Failed to initialize SSE service", error=str(e)) + raise + + async def shutdown(self): + """Clean shutdown""" + try: + # Cancel all pubsub tasks + for task in self.pubsub_tasks.values(): + if not task.done(): + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + + # Close all client connections + for tenant_id, connections in self.active_connections.items(): + for queue in connections.copy(): + try: + await queue.put({"event": "shutdown", "data": json.dumps({"status": "server_shutdown"})}) + except: + pass + + # Close Redis connection + if self.redis: + await self.redis.close() + + logger.info("SSE Service shutdown completed") + + except Exception as e: + logger.error("Error during SSE shutdown", error=str(e)) + + async def add_client(self, tenant_id: str, client_queue: asyncio.Queue): + """Add a new SSE client connection""" + try: + if tenant_id not in self.active_connections: + self.active_connections[tenant_id] = set() + # Start pubsub listener for this tenant if not exists + if tenant_id not in self.pubsub_tasks: + task = asyncio.create_task(self._listen_to_tenant_channel(tenant_id)) + self.pubsub_tasks[tenant_id] = task + + self.active_connections[tenant_id].add(client_queue) + + client_count = 
len(self.active_connections[tenant_id]) + logger.info("SSE client added", + tenant_id=tenant_id, + total_clients=client_count) + + # Send connection confirmation + await client_queue.put({ + "event": "connected", + "data": json.dumps({ + "status": "connected", + "tenant_id": tenant_id, + "timestamp": datetime.utcnow().isoformat(), + "client_count": client_count + }) + }) + + # Send any active items (alerts and recommendations) + active_items = await self.get_active_items(tenant_id) + if active_items: + await client_queue.put({ + "event": "initial_items", + "data": json.dumps(active_items) + }) + + except Exception as e: + logger.error("Error adding SSE client", tenant_id=tenant_id, error=str(e)) + + async def remove_client(self, tenant_id: str, client_queue: asyncio.Queue): + """Remove SSE client connection""" + try: + if tenant_id in self.active_connections: + self.active_connections[tenant_id].discard(client_queue) + + # If no more clients for this tenant, stop the pubsub listener + if not self.active_connections[tenant_id]: + del self.active_connections[tenant_id] + if tenant_id in self.pubsub_tasks: + task = self.pubsub_tasks[tenant_id] + if not task.done(): + task.cancel() + del self.pubsub_tasks[tenant_id] + + logger.info("SSE client removed", tenant_id=tenant_id) + + except Exception as e: + logger.error("Error removing SSE client", tenant_id=tenant_id, error=str(e)) + + async def _listen_to_tenant_channel(self, tenant_id: str): + """Listen to Redis channel for tenant-specific items""" + try: + # Create a separate Redis connection for pubsub + pubsub_redis = Redis.from_url(self.redis_url) + pubsub = pubsub_redis.pubsub() + channel = f"alerts:{tenant_id}" + await pubsub.subscribe(channel) + + logger.info("Started listening to tenant channel", + tenant_id=tenant_id, + channel=channel) + + async for message in pubsub.listen(): + if message["type"] == "message": + # Broadcast to all connected clients for this tenant + await self.broadcast_to_tenant(tenant_id, message["data"]) + + except asyncio.CancelledError: + logger.info("Stopped listening to tenant channel", tenant_id=tenant_id) + except Exception as e: + logger.error("Error in pubsub listener", tenant_id=tenant_id, error=str(e)) + finally: + try: + await pubsub.unsubscribe(channel) + await pubsub_redis.close() + except: + pass + + async def broadcast_to_tenant(self, tenant_id: str, message: str): + """Broadcast message to all connected clients of a tenant""" + if tenant_id not in self.active_connections: + return + + try: + item_data = json.loads(message) + event = { + "event": item_data.get('item_type', 'item'), # 'alert' or 'recommendation' + "data": json.dumps(item_data), + "id": item_data.get("id") + } + + # Send to all connected clients + disconnected = [] + for client_queue in self.active_connections[tenant_id]: + try: + # Use put_nowait to avoid blocking + client_queue.put_nowait(event) + except asyncio.QueueFull: + logger.warning("Client queue full, dropping message", tenant_id=tenant_id) + disconnected.append(client_queue) + except Exception as e: + logger.warning("Failed to send to client", tenant_id=tenant_id, error=str(e)) + disconnected.append(client_queue) + + # Clean up disconnected clients + for queue in disconnected: + await self.remove_client(tenant_id, queue) + + if disconnected: + logger.info("Cleaned up disconnected clients", + tenant_id=tenant_id, + count=len(disconnected)) + + except Exception as e: + logger.error("Error broadcasting to tenant", tenant_id=tenant_id, error=str(e)) + + async def 
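Because the listener above subscribes to `alerts:{tenant_id}`, a quick way to exercise connected dashboards is to publish a fake item on that channel. A manual-test sketch, assuming a local Redis at the default URL; the payload shape mirrors the item message the service publishes.

```python
# Manual test sketch: publish a fake alert on the tenant channel so every
# SSE client connected for that tenant receives it.
import asyncio
import json
from datetime import datetime

from redis.asyncio import Redis


async def publish_test_alert(tenant_id: str, redis_url: str = "redis://localhost:6379/0") -> None:
    redis = Redis.from_url(redis_url)
    payload = {
        "id": "test-alert-1",
        "item_type": "alert",
        "type": "stock_low",
        "severity": "medium",
        "title": "Prueba de alerta",
        "message": "Mensaje de prueba para el stream SSE",
        "actions": [],
        "metadata": {},
        "timestamp": datetime.utcnow().isoformat(),
        "status": "active",
    }
    await redis.publish(f"alerts:{tenant_id}", json.dumps(payload))
    await redis.close()


# asyncio.run(publish_test_alert("tenant-uuid"))
```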
send_item_notification(self, tenant_id: str, item: Dict[str, Any]): + """ + Send alert or recommendation via SSE (called by notification orchestrator) + """ + try: + # Publish to Redis for SSE streaming + channel = f"alerts:{tenant_id}" + + item_message = { + 'id': item.get('id'), + 'item_type': item.get('type'), # 'alert' or 'recommendation' + 'type': item.get('alert_type', item.get('type')), + 'severity': item.get('severity'), + 'title': item.get('title'), + 'message': item.get('message'), + 'actions': item.get('actions', []), + 'metadata': item.get('metadata', {}), + 'timestamp': item.get('timestamp', datetime.utcnow().isoformat()), + 'status': 'active' + } + + await self.redis.publish(channel, json.dumps(item_message)) + + logger.info("Item published to SSE", + tenant_id=tenant_id, + item_type=item.get('type'), + item_id=item.get('id')) + + except Exception as e: + logger.error("Error sending item notification via SSE", + tenant_id=tenant_id, + error=str(e)) + + async def get_active_items(self, tenant_id: str) -> list: + """Fetch active alerts and recommendations from database""" + try: + # This would integrate with the actual database + # For now, return empty list as placeholder + # In real implementation, this would query the alerts table + + # Example query: + # query = """ + # SELECT id, item_type, alert_type, severity, title, message, + # actions, metadata, created_at, status + # FROM alerts + # WHERE tenant_id = $1 + # AND status = 'active' + # ORDER BY severity_weight DESC, created_at DESC + # LIMIT 50 + # """ + + return [] # Placeholder + + except Exception as e: + logger.error("Error fetching active items", tenant_id=tenant_id, error=str(e)) + return [] + + def get_metrics(self) -> Dict[str, Any]: + """Get SSE service metrics""" + return { + "active_tenants": len(self.active_connections), + "total_connections": sum(len(connections) for connections in self.active_connections.values()), + "active_listeners": len(self.pubsub_tasks), + "redis_connected": self.redis and not self.redis.closed + } \ No newline at end of file diff --git a/services/notification/app/services/whatsapp_service.py b/services/notification/app/services/whatsapp_service.py index 1c4b2cc6..550dd694 100644 --- a/services/notification/app/services/whatsapp_service.py +++ b/services/notification/app/services/whatsapp_service.py @@ -30,6 +30,17 @@ class WhatsAppService: self.from_number = settings.WHATSAPP_FROM_NUMBER self.enabled = settings.ENABLE_WHATSAPP_NOTIFICATIONS + def _parse_api_credentials(self): + """Parse API key into username and password for Twilio basic auth""" + if not self.api_key or ":" not in self.api_key: + raise ValueError("WhatsApp API key must be in format 'username:password'") + + api_parts = self.api_key.split(":", 1) + if len(api_parts) != 2: + raise ValueError("Invalid WhatsApp API key format") + + return api_parts[0], api_parts[1] + async def send_message( self, to_phone: str, @@ -181,10 +192,22 @@ class WhatsAppService: return False # Test API connectivity with a simple request + # Parse API key (expected format: username:password for Twilio basic auth) + if ":" not in self.api_key: + logger.error("WhatsApp API key must be in format 'username:password'") + return False + + api_parts = self.api_key.split(":", 1) # Split on first : only + if len(api_parts) != 2: + logger.error("Invalid WhatsApp API key format") + return False + + username, password = api_parts + async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get( f"{self.base_url}/v1/Account", # Twilio 
account info endpoint - auth=(self.api_key.split(":")[0], self.api_key.split(":")[1]) + auth=(username, password) ) if response.status_code == 200: @@ -206,6 +229,13 @@ class WhatsAppService: async def _send_text_message(self, to_phone: str, message: str) -> bool: """Send regular text message via Twilio""" try: + # Parse API credentials + try: + username, password = self._parse_api_credentials() + except ValueError as e: + logger.error(f"WhatsApp API key configuration error: {e}") + return False + # Prepare request data data = { "From": f"whatsapp:{self.from_number}", @@ -216,9 +246,9 @@ class WhatsAppService: # Send via Twilio API async with httpx.AsyncClient(timeout=30.0) as client: response = await client.post( - f"{self.base_url}/2010-04-01/Accounts/{self.api_key.split(':')[0]}/Messages.json", + f"{self.base_url}/2010-04-01/Accounts/{username}/Messages.json", data=data, - auth=(self.api_key.split(":")[0], self.api_key.split(":")[1]) + auth=(username, password) ) if response.status_code == 201: @@ -245,6 +275,13 @@ class WhatsAppService: ) -> bool: """Send WhatsApp template message via Twilio""" try: + # Parse API credentials + try: + username, password = self._parse_api_credentials() + except ValueError as e: + logger.error(f"WhatsApp API key configuration error: {e}") + return False + # Prepare template data content_variables = {str(i+1): param for i, param in enumerate(parameters)} @@ -258,9 +295,9 @@ class WhatsAppService: # Send via Twilio API async with httpx.AsyncClient(timeout=30.0) as client: response = await client.post( - f"{self.base_url}/2010-04-01/Accounts/{self.api_key.split(':')[0]}/Messages.json", + f"{self.base_url}/2010-04-01/Accounts/{username}/Messages.json", data=data, - auth=(self.api_key.split(":")[0], self.api_key.split(":")[1]) + auth=(username, password) ) if response.status_code == 201: @@ -315,10 +352,17 @@ class WhatsAppService: async def _get_message_status(self, message_sid: str) -> Optional[str]: """Get message delivery status from Twilio""" try: + # Parse API credentials + try: + username, password = self._parse_api_credentials() + except ValueError as e: + logger.error(f"WhatsApp API key configuration error: {e}") + return None + async with httpx.AsyncClient(timeout=10.0) as client: response = await client.get( - f"{self.base_url}/2010-04-01/Accounts/{self.api_key.split(':')[0]}/Messages/{message_sid}.json", - auth=(self.api_key.split(":")[0], self.api_key.split(":")[1]) + f"{self.base_url}/2010-04-01/Accounts/{username}/Messages/{message_sid}.json", + auth=(username, password) ) if response.status_code == 200: diff --git a/services/notification/requirements.txt b/services/notification/requirements.txt index 584431c9..6808b999 100644 --- a/services/notification/requirements.txt +++ b/services/notification/requirements.txt @@ -3,6 +3,7 @@ fastapi==0.104.1 uvicorn[standard]==0.24.0 pydantic==2.5.0 pydantic-settings==2.1.0 +sse-starlette==1.6.5 # Database sqlalchemy==2.0.23 @@ -22,8 +23,9 @@ aiofiles==23.2.1 aiosmtplib==3.0.1 email-validator==2.1.0 -# Messaging +# Messaging & Redis aio-pika==9.3.1 +redis==5.0.1 # Template Engine jinja2==3.1.2 diff --git a/services/orders/README.md b/services/orders/README.md deleted file mode 100644 index ed86487e..00000000 --- a/services/orders/README.md +++ /dev/null @@ -1,248 +0,0 @@ -# Orders Service - -Customer orders and procurement planning service for the bakery management system. 
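Closing out the WhatsApp changes above: with the credential helper in place, the Twilio call itself reduces to a basic-auth POST. A hedged, standalone sketch; `https://api.twilio.com` is assumed as the base URL, and the SID, token, and numbers are placeholders.

```python
# Hedged sketch of the Twilio WhatsApp send performed by _send_text_message:
# HTTP basic auth with the account SID and token parsed from "sid:token".
import asyncio

import httpx


async def send_whatsapp_text(api_key: str, from_number: str, to_phone: str, body: str) -> bool:
    sid, token = api_key.split(":", 1)  # same "sid:token" convention as the service
    data = {"From": f"whatsapp:{from_number}", "To": f"whatsapp:{to_phone}", "Body": body}
    async with httpx.AsyncClient(timeout=30.0) as client:
        response = await client.post(
            f"https://api.twilio.com/2010-04-01/Accounts/{sid}/Messages.json",
            data=data,
            auth=(sid, token),
        )
    return response.status_code == 201  # Twilio returns 201 when the message is accepted


# asyncio.run(send_whatsapp_text("ACxxxx:token", "+14155238886", "+34600000000", "Hola"))
```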
- -## Overview - -The Orders Service handles all order-related operations including: - -- **Customer Management**: Complete customer lifecycle and relationship management -- **Order Processing**: End-to-end order management from creation to fulfillment -- **Procurement Planning**: Automated procurement requirement calculation and planning -- **Business Intelligence**: Order pattern analysis and business model detection -- **Dashboard Analytics**: Comprehensive reporting and metrics for order operations - -## Features - -### Core Capabilities -- Customer registration and management with detailed profiles -- Order creation, tracking, and status management -- Automated demand requirements calculation for production planning -- Procurement planning with supplier coordination -- Business model detection (individual bakery vs central bakery) -- Comprehensive dashboard with real-time metrics -- Integration with production, inventory, suppliers, and sales services - -### API Endpoints - -#### Dashboard & Analytics -- `GET /api/v1/tenants/{tenant_id}/orders/dashboard-summary` - Comprehensive dashboard data -- `GET /api/v1/tenants/{tenant_id}/orders/demand-requirements` - Demand analysis for production -- `GET /api/v1/tenants/{tenant_id}/orders/business-model` - Business model detection - -#### Order Management -- `POST /api/v1/tenants/{tenant_id}/orders` - Create new customer order -- `GET /api/v1/tenants/{tenant_id}/orders` - List orders with filtering and pagination -- `GET /api/v1/tenants/{tenant_id}/orders/{order_id}` - Get order details with items -- `PUT /api/v1/tenants/{tenant_id}/orders/{order_id}/status` - Update order status - -#### Customer Management -- `POST /api/v1/tenants/{tenant_id}/customers` - Create new customer -- `GET /api/v1/tenants/{tenant_id}/customers` - List customers with filtering -- `GET /api/v1/tenants/{tenant_id}/customers/{customer_id}` - Get customer details - -#### Health & Status -- `GET /api/v1/tenants/{tenant_id}/orders/status` - Service status information - -## Service Integration - -### Shared Clients Used -- **InventoryServiceClient**: Stock levels, product availability validation -- **ProductionServiceClient**: Production notifications, capacity planning -- **SalesServiceClient**: Historical sales data for demand forecasting -- **NotificationServiceClient**: Customer notifications and alerts - -### Authentication -Uses shared authentication patterns with tenant isolation: -- JWT token validation -- Tenant access verification -- User permission checks - -## Configuration - -Key configuration options in `app/core/config.py`: - -### Order Processing -- `ORDER_PROCESSING_ENABLED`: Enable automatic order processing (default: true) -- `AUTO_APPROVE_ORDERS`: Automatically approve orders (default: false) -- `MAX_ORDER_ITEMS`: Maximum items per order (default: 50) - -### Procurement Planning -- `PROCUREMENT_PLANNING_ENABLED`: Enable procurement planning (default: true) -- `PROCUREMENT_LEAD_TIME_DAYS`: Standard procurement lead time (default: 3) -- `DEMAND_FORECAST_DAYS`: Days for demand forecasting (default: 14) -- `SAFETY_STOCK_PERCENTAGE`: Safety stock buffer (default: 20%) - -### Business Model Detection -- `ENABLE_BUSINESS_MODEL_DETECTION`: Enable automatic detection (default: true) -- `CENTRAL_BAKERY_ORDER_THRESHOLD`: Order threshold for central bakery (default: 20) -- `INDIVIDUAL_BAKERY_ORDER_THRESHOLD`: Order threshold for individual bakery (default: 5) - -### Customer Management -- `CUSTOMER_VALIDATION_ENABLED`: Enable customer validation (default: true) -- 
`MAX_CUSTOMERS_PER_TENANT`: Maximum customers per tenant (default: 10000) -- `CUSTOMER_CREDIT_CHECK_ENABLED`: Enable credit checking (default: false) - -### Order Validation -- `MIN_ORDER_VALUE`: Minimum order value (default: 0.0) -- `MAX_ORDER_VALUE`: Maximum order value (default: 100000.0) -- `VALIDATE_PRODUCT_AVAILABILITY`: Check product availability (default: true) - -### Alert Thresholds -- `HIGH_VALUE_ORDER_THRESHOLD`: High-value order alert (default: 5000.0) -- `LARGE_QUANTITY_ORDER_THRESHOLD`: Large quantity alert (default: 100) -- `RUSH_ORDER_HOURS_THRESHOLD`: Rush order time threshold (default: 24) -- `PROCUREMENT_SHORTAGE_THRESHOLD`: Procurement shortage alert (default: 90%) - -### Payment and Pricing -- `PAYMENT_VALIDATION_ENABLED`: Enable payment validation (default: true) -- `DYNAMIC_PRICING_ENABLED`: Enable dynamic pricing (default: false) -- `DISCOUNT_ENABLED`: Enable discounts (default: true) -- `MAX_DISCOUNT_PERCENTAGE`: Maximum discount allowed (default: 50%) - -### Delivery and Fulfillment -- `DELIVERY_TRACKING_ENABLED`: Enable delivery tracking (default: true) -- `DEFAULT_DELIVERY_WINDOW_HOURS`: Default delivery window (default: 48) -- `PICKUP_ENABLED`: Enable pickup orders (default: true) -- `DELIVERY_ENABLED`: Enable delivery orders (default: true) - -## Database Models - -### Customer -- Complete customer profile with contact information -- Business type classification (individual, business, central_bakery) -- Payment terms and credit management -- Order history and metrics tracking -- Delivery preferences and special requirements - -### CustomerOrder -- Comprehensive order tracking from creation to delivery -- Status management with full audit trail -- Financial calculations including discounts and taxes -- Delivery scheduling and fulfillment tracking -- Business model detection and categorization -- Customer communication preferences - -### OrderItem -- Detailed line item tracking with product specifications -- Customization and special instruction support -- Production requirement integration -- Cost tracking and margin analysis -- Quality control integration - -### OrderStatusHistory -- Complete audit trail of order status changes -- Event tracking with detailed context -- User attribution and change reasons -- Customer notification tracking - -### ProcurementPlan -- Master procurement planning with business model context -- Supplier diversification and risk assessment -- Performance tracking and cost analysis -- Integration with demand forecasting - -### ProcurementRequirement -- Detailed procurement requirements per product/ingredient -- Current inventory level integration -- Supplier preference and lead time management -- Quality specifications and special requirements - -### OrderAlert -- Comprehensive alert system for order issues -- Multiple severity levels with appropriate routing -- Business impact assessment -- Resolution tracking and performance metrics - -## Business Logic - -### Order Processing Flow -1. **Order Creation**: Validate customer, calculate totals, create order record -2. **Item Processing**: Create order items with specifications and requirements -3. **Status Tracking**: Maintain complete audit trail of status changes -4. **Customer Metrics**: Update customer statistics and relationship data -5. **Business Model Detection**: Analyze patterns to determine bakery type -6. **Alert Generation**: Check for high-value, rush, or large orders -7. **Service Integration**: Notify production and inventory services - -### Procurement Planning -1. 
**Demand Analysis**: Aggregate orders by delivery date and products -2. **Inventory Integration**: Check current stock levels and reservations -3. **Requirement Calculation**: Calculate net procurement needs with safety buffer -4. **Supplier Coordination**: Match requirements with preferred suppliers -5. **Lead Time Planning**: Account for supplier lead times and delivery windows -6. **Risk Assessment**: Evaluate supply risks and backup options - -### Business Model Detection -- **Individual Bakery**: Low order volume, direct customer sales, standard products -- **Central Bakery**: High volume, wholesale operations, bulk orders -- **Detection Factors**: Order frequency, quantity, customer types, sales channels - -## Alert System - -### Alert Types -- **High Value Orders**: Orders exceeding configured thresholds -- **Rush Orders**: Orders with tight delivery requirements -- **Large Quantity Orders**: Orders with unusually high item counts -- **Payment Issues**: Payment validation failures or credit problems -- **Procurement Shortages**: Insufficient inventory for order fulfillment -- **Customer Issues**: New customers, credit limit exceedances, special requirements - -### Severity Levels -- **Critical**: WhatsApp + Email + Dashboard + SMS -- **High**: WhatsApp + Email + Dashboard -- **Medium**: Email + Dashboard -- **Low**: Dashboard only - -## Development - -### Setup -```bash -# Install dependencies -pip install -r requirements.txt - -# Set up database -# Configure ORDERS_DATABASE_URL environment variable - -# Run migrations -alembic upgrade head - -# Start service -uvicorn app.main:app --reload -``` - -### Testing -```bash -# Run tests -pytest - -# Run with coverage -pytest --cov=app -``` - -### Docker -```bash -# Build image -docker build -t orders-service . - -# Run container -docker run -p 8000:8000 orders-service -``` - -## Deployment - -The service is designed for containerized deployment with: -- Health checks at `/health` -- Structured logging -- Metrics collection -- Database migrations -- Service discovery integration - -## Architecture - -Follows Domain-Driven Microservices Architecture: -- Clean separation of concerns -- Repository pattern for data access -- Service layer for business logic -- API layer for external interface -- Shared infrastructure for cross-cutting concerns \ No newline at end of file diff --git a/services/orders/app/core/database.py b/services/orders/app/core/database.py index 4130aa82..ca51b27d 100644 --- a/services/orders/app/core/database.py +++ b/services/orders/app/core/database.py @@ -5,7 +5,7 @@ Orders Service Database Configuration """ -from sqlalchemy import create_engine +from sqlalchemy import create_engine, text from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession from sqlalchemy.orm import sessionmaker, DeclarativeBase import structlog @@ -72,7 +72,7 @@ async def get_db_health() -> bool: """Check database health""" try: async with async_engine.begin() as conn: - await conn.execute("SELECT 1") + await conn.execute(text("SELECT 1")) return True except Exception as e: logger.error("Database health check failed", error=str(e)) diff --git a/services/pos/README.md b/services/pos/README.md deleted file mode 100644 index 7a3eb703..00000000 --- a/services/pos/README.md +++ /dev/null @@ -1,138 +0,0 @@ -# POS Integration Service - -This service handles integration with external Point of Sale (POS) systems for the Bakery IA platform. 
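The one-line orders fix above, `conn.execute(text("SELECT 1"))`, reflects SQLAlchemy 2.0 no longer accepting raw SQL strings in `execute()`. A minimal async health-check sketch using the same pattern; the connection URL is a placeholder.

```python
# Minimal async health-check sketch: SQLAlchemy 2.0 requires textual SQL to be
# wrapped in text() before Connection.execute().
import asyncio

from sqlalchemy import text
from sqlalchemy.ext.asyncio import create_async_engine


async def db_healthy(url: str) -> bool:
    engine = create_async_engine(url)
    try:
        async with engine.begin() as conn:
            await conn.execute(text("SELECT 1"))
        return True
    except Exception:
        return False
    finally:
        await engine.dispose()


# asyncio.run(db_healthy("postgresql+asyncpg://orders_user:orders_pass@localhost:5432/orders_db"))
```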
- -## Supported POS Systems - -- **Square POS** - Popular payment and POS solution with strong API support -- **Toast POS** - Restaurant-focused POS system with comprehensive features -- **Lightspeed Restaurant** - Full-featured restaurant management system - -## Features - -- **Real-time webhook handling** from POS systems -- **Bidirectional data synchronization** with sales service -- **Secure credential management** with encryption -- **Multi-tenant support** with tenant-specific configurations -- **Comprehensive transaction logging** and audit trails -- **Automatic duplicate detection** and handling -- **Rate limiting and retry mechanisms** for reliability - -## Architecture - -The POS service follows the established microservices architecture: - -``` -POS Service -├── API Layer (FastAPI) -├── Business Logic (Services) -├── Data Access (Repositories) -├── External Integrations (POS Providers) -├── Webhook Handlers -└── Background Sync Jobs -``` - -## API Endpoints - -### Configuration Management -- `GET /api/v1/tenants/{tenant_id}/pos/configurations` - List POS configurations -- `POST /api/v1/tenants/{tenant_id}/pos/configurations` - Create new configuration -- `PUT /api/v1/tenants/{tenant_id}/pos/configurations/{config_id}` - Update configuration -- `DELETE /api/v1/tenants/{tenant_id}/pos/configurations/{config_id}` - Delete configuration - -### Webhook Handling -- `POST /api/v1/webhooks/{pos_system}` - Receive webhooks from POS systems -- `GET /api/v1/webhooks/{pos_system}/status` - Get webhook status - -### Data Synchronization -- `POST /api/v1/tenants/{tenant_id}/pos/configurations/{config_id}/sync` - Trigger sync -- `GET /api/v1/tenants/{tenant_id}/pos/configurations/{config_id}/sync/status` - Get sync status -- `GET /api/v1/tenants/{tenant_id}/pos/transactions` - Get POS transactions - -## Database Schema - -### Core Tables -- `pos_configurations` - POS system configurations per tenant -- `pos_transactions` - Transaction data from POS systems -- `pos_transaction_items` - Individual items within transactions -- `pos_webhook_logs` - Webhook event logs -- `pos_sync_logs` - Synchronization operation logs - -## Environment Variables - -See `app/core/config.py` for all configuration options. 
Key variables include: - -```bash -# Database -POS_DATABASE_URL=postgresql+asyncpg://pos_user:pos_pass123@pos-db:5432/pos_db - -# POS Provider Credentials -SQUARE_APPLICATION_ID=your_square_app_id -SQUARE_ACCESS_TOKEN=your_square_token -TOAST_CLIENT_ID=your_toast_client_id -LIGHTSPEED_CLIENT_ID=your_lightspeed_client_id - -# Webhook Configuration -WEBHOOK_BASE_URL=https://your-domain.com -WEBHOOK_SECRET=your_webhook_secret -``` - -## Development - -### Running the Service - -```bash -# Using Docker Compose (recommended) -docker-compose up pos-service - -# Local development -cd services/pos -pip install -r requirements.txt -uvicorn app.main:app --reload --port 8000 -``` - -### Database Migrations - -```bash -# Create migration -alembic revision --autogenerate -m "Description" - -# Apply migrations -alembic upgrade head -``` - -### Testing - -```bash -# Run tests -pytest tests/ - -# Run with coverage -pytest --cov=app tests/ -``` - -## Security Considerations - -- POS credentials are encrypted before storage -- Webhook signatures are verified for authenticity -- All API endpoints require tenant-based authentication -- Rate limiting prevents abuse -- Sensitive data is logged with appropriate redaction - -## Monitoring - -The service includes comprehensive monitoring: - -- Health check endpoints -- Prometheus metrics -- Structured logging -- Performance tracking -- Error rate monitoring - -## Integration Flow - -1. **Configuration**: Set up POS system credentials via API -2. **Webhook Registration**: Register webhook URLs with POS providers -3. **Real-time Events**: Receive and process webhook events -4. **Data Sync**: Periodic synchronization of transaction data -5. **Sales Integration**: Forward processed data to sales service \ No newline at end of file diff --git a/services/production/README.md b/services/production/README.md deleted file mode 100644 index 3451c382..00000000 --- a/services/production/README.md +++ /dev/null @@ -1,187 +0,0 @@ -# Production Service - -Production planning and batch management service for the bakery management system. 
- -## Overview - -The Production Service handles all production-related operations including: - -- **Production Planning**: Calculate daily requirements using demand forecasts and inventory levels -- **Batch Management**: Track production batches from start to finish -- **Capacity Management**: Equipment, staff, and time scheduling -- **Quality Control**: Yield tracking, waste management, efficiency metrics -- **Alert System**: Comprehensive monitoring and notifications - -## Features - -### Core Capabilities -- Daily production requirements calculation -- Production batch lifecycle management -- Real-time capacity planning and utilization -- Quality control tracking and metrics -- Comprehensive alert system with multiple severity levels -- Integration with inventory, orders, recipes, and sales services - -### API Endpoints - -#### Dashboard & Planning -- `GET /api/v1/tenants/{tenant_id}/production/dashboard-summary` - Production dashboard data -- `GET /api/v1/tenants/{tenant_id}/production/daily-requirements` - Daily production planning -- `GET /api/v1/tenants/{tenant_id}/production/requirements` - Requirements for procurement - -#### Batch Management -- `POST /api/v1/tenants/{tenant_id}/production/batches` - Create production batch -- `GET /api/v1/tenants/{tenant_id}/production/batches/active` - Get active batches -- `GET /api/v1/tenants/{tenant_id}/production/batches/{batch_id}` - Get batch details -- `PUT /api/v1/tenants/{tenant_id}/production/batches/{batch_id}/status` - Update batch status - -#### Scheduling & Capacity -- `GET /api/v1/tenants/{tenant_id}/production/schedule` - Production schedule -- `GET /api/v1/tenants/{tenant_id}/production/capacity/status` - Capacity status - -#### Alerts & Monitoring -- `GET /api/v1/tenants/{tenant_id}/production/alerts` - Production alerts -- `POST /api/v1/tenants/{tenant_id}/production/alerts/{alert_id}/acknowledge` - Acknowledge alerts - -#### Analytics -- `GET /api/v1/tenants/{tenant_id}/production/metrics/yield` - Yield metrics - -## Service Integration - -### Shared Clients Used -- **InventoryServiceClient**: Stock levels, ingredient availability -- **OrdersServiceClient**: Demand requirements, customer orders -- **RecipesServiceClient**: Recipe requirements, ingredient calculations -- **SalesServiceClient**: Historical sales data -- **NotificationServiceClient**: Alert notifications - -### Authentication -Uses shared authentication patterns with tenant isolation: -- JWT token validation -- Tenant access verification -- User permission checks - -## Configuration - -Key configuration options in `app/core/config.py`: - -### Production Planning -- `PLANNING_HORIZON_DAYS`: Days ahead for planning (default: 7) -- `PRODUCTION_BUFFER_PERCENTAGE`: Safety buffer for production (default: 10%) -- `MINIMUM_BATCH_SIZE`: Minimum batch size (default: 1.0) -- `MAXIMUM_BATCH_SIZE`: Maximum batch size (default: 100.0) - -### Capacity Management -- `DEFAULT_WORKING_HOURS_PER_DAY`: Standard working hours (default: 12) -- `MAX_OVERTIME_HOURS`: Maximum overtime allowed (default: 4) -- `CAPACITY_UTILIZATION_TARGET`: Target utilization (default: 85%) - -### Quality Control -- `MINIMUM_YIELD_PERCENTAGE`: Minimum acceptable yield (default: 85%) -- `QUALITY_SCORE_THRESHOLD`: Minimum quality score (default: 8.0) - -### Alert Thresholds -- `CAPACITY_EXCEEDED_THRESHOLD`: Capacity alert threshold (default: 100%) -- `PRODUCTION_DELAY_THRESHOLD_MINUTES`: Delay alert threshold (default: 60) -- `LOW_YIELD_ALERT_THRESHOLD`: Low yield alert (default: 80%) - -## Database Models 
- -### ProductionBatch -- Complete batch tracking from planning to completion -- Status management (pending, in_progress, completed, etc.) -- Cost tracking and yield calculations -- Quality metrics integration - -### ProductionSchedule -- Daily production scheduling -- Capacity planning and tracking -- Staff and equipment assignments -- Performance metrics - -### ProductionCapacity -- Resource availability tracking -- Equipment and staff capacity -- Maintenance scheduling -- Utilization monitoring - -### QualityCheck -- Quality control measurements -- Pass/fail tracking -- Defect recording -- Corrective action management - -### ProductionAlert -- Comprehensive alert system -- Multiple severity levels -- Action recommendations -- Resolution tracking - -## Alert System - -### Alert Types -- **Capacity Exceeded**: When production requirements exceed available capacity -- **Production Delay**: When batches are delayed beyond thresholds -- **Cost Spike**: When production costs exceed normal ranges -- **Low Yield**: When yield percentages fall below targets -- **Quality Issues**: When quality scores consistently decline -- **Equipment Maintenance**: When equipment needs maintenance - -### Severity Levels -- **Critical**: WhatsApp + Email + Dashboard + SMS -- **High**: WhatsApp + Email + Dashboard -- **Medium**: Email + Dashboard -- **Low**: Dashboard only - -## Development - -### Setup -```bash -# Install dependencies -pip install -r requirements.txt - -# Set up database -# Configure DATABASE_URL environment variable - -# Run migrations -alembic upgrade head - -# Start service -uvicorn app.main:app --reload -``` - -### Testing -```bash -# Run tests -pytest - -# Run with coverage -pytest --cov=app -``` - -### Docker -```bash -# Build image -docker build -t production-service . 
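# assumes runtime config (e.g. DATABASE_URL from the Setup section) is supplied via environment variables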
- -# Run container -docker run -p 8000:8000 production-service -``` - -## Deployment - -The service is designed for containerized deployment with: -- Health checks at `/health` -- Structured logging -- Metrics collection -- Database migrations -- Service discovery integration - -## Architecture - -Follows Domain-Driven Microservices Architecture: -- Clean separation of concerns -- Repository pattern for data access -- Service layer for business logic -- API layer for external interface -- Shared infrastructure for cross-cutting concerns \ No newline at end of file diff --git a/services/production/app/main.py b/services/production/app/main.py index 04c2fea6..53537791 100644 --- a/services/production/app/main.py +++ b/services/production/app/main.py @@ -14,6 +14,7 @@ import structlog from app.core.config import settings from app.core.database import init_database, get_db_health from app.api.production import router as production_router +from app.services.production_alert_service import ProductionAlertService # Configure logging logger = structlog.get_logger() @@ -25,6 +26,16 @@ async def lifespan(app: FastAPI): # Startup try: await init_database() + logger.info("Database initialized") + + # Initialize alert service + alert_service = ProductionAlertService(settings) + await alert_service.start() + logger.info("Production alert service started") + + # Store alert service in app state + app.state.alert_service = alert_service + logger.info("Production service started successfully") except Exception as e: logger.error("Failed to initialize production service", error=str(e)) @@ -34,6 +45,13 @@ async def lifespan(app: FastAPI): # Shutdown logger.info("Production service shutting down") + try: + # Stop alert service + if hasattr(app.state, 'alert_service'): + await app.state.alert_service.stop() + logger.info("Alert service stopped") + except Exception as e: + logger.error("Error during shutdown", error=str(e)) # Create FastAPI application diff --git a/services/production/app/services/production_alert_service.py b/services/production/app/services/production_alert_service.py new file mode 100644 index 00000000..0f257f72 --- /dev/null +++ b/services/production/app/services/production_alert_service.py @@ -0,0 +1,795 @@ +# services/production/app/services/production_alert_service.py +""" +Production-specific alert and recommendation detection service +Monitors production capacity, delays, quality issues, and optimization opportunities +""" + +import json +from typing import List, Dict, Any, Optional +from uuid import UUID +from datetime import datetime, timedelta +import structlog +from apscheduler.triggers.cron import CronTrigger + +from shared.alerts.base_service import BaseAlertService, AlertServiceMixin +from shared.alerts.templates import format_item_message + +logger = structlog.get_logger() + +class ProductionAlertService(BaseAlertService, AlertServiceMixin): + """Production service alert and recommendation detection""" + + def setup_scheduled_checks(self): + """Production-specific scheduled checks for alerts and recommendations""" + + # Production capacity checks - every 10 minutes during business hours (alerts) + self.scheduler.add_job( + self.check_production_capacity, + CronTrigger(minute='*/10', hour='6-20'), + id='capacity_check', + misfire_grace_time=60, + max_instances=1 + ) + + # Production delays - every 5 minutes during production hours (alerts) + self.scheduler.add_job( + self.check_production_delays, + CronTrigger(minute='*/5', hour='4-22'), + id='delay_check', + 
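            # misfire_grace_time: still run if the scheduler wakes up to 30 s late; max_instances=1 prevents overlapping runs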
misfire_grace_time=30, + max_instances=1 + ) + + # Quality issues check - every 15 minutes (alerts) + self.scheduler.add_job( + self.check_quality_issues, + CronTrigger(minute='*/15'), + id='quality_check', + misfire_grace_time=60, + max_instances=1 + ) + + # Equipment monitoring - every 3 minutes (alerts) + self.scheduler.add_job( + self.check_equipment_status, + CronTrigger(minute='*/3'), + id='equipment_check', + misfire_grace_time=30, + max_instances=1 + ) + + # Efficiency recommendations - every 30 minutes (recommendations) + self.scheduler.add_job( + self.generate_efficiency_recommendations, + CronTrigger(minute='*/30'), + id='efficiency_recs', + misfire_grace_time=120, + max_instances=1 + ) + + # Energy optimization - every hour (recommendations) + self.scheduler.add_job( + self.generate_energy_recommendations, + CronTrigger(minute='0'), + id='energy_recs', + misfire_grace_time=300, + max_instances=1 + ) + + logger.info("Production alert schedules configured", + service=self.config.SERVICE_NAME) + + async def check_production_capacity(self): + """Check if production plan exceeds capacity (alerts)""" + try: + self._checks_performed += 1 + + query = """ + WITH capacity_analysis AS ( + SELECT + p.tenant_id, + p.planned_date, + SUM(p.planned_quantity) as total_planned, + MAX(pc.daily_capacity) as max_daily_capacity, + COUNT(DISTINCT p.equipment_id) as equipment_count, + AVG(pc.efficiency_percent) as avg_efficiency, + CASE + WHEN SUM(p.planned_quantity) > MAX(pc.daily_capacity) * 1.2 THEN 'severe_overload' + WHEN SUM(p.planned_quantity) > MAX(pc.daily_capacity) THEN 'overload' + WHEN SUM(p.planned_quantity) > MAX(pc.daily_capacity) * 0.9 THEN 'near_capacity' + ELSE 'normal' + END as capacity_status, + (SUM(p.planned_quantity) / MAX(pc.daily_capacity)) * 100 as capacity_percentage + FROM production_schedule p + JOIN production_capacity pc ON pc.equipment_id = p.equipment_id + WHERE p.planned_date >= CURRENT_DATE + AND p.planned_date <= CURRENT_DATE + INTERVAL '3 days' + AND p.status IN ('planned', 'in_progress') + AND p.tenant_id = $1 + GROUP BY p.tenant_id, p.planned_date + ) + SELECT * FROM capacity_analysis + WHERE capacity_status != 'normal' + ORDER BY capacity_percentage DESC + """ + + # Check production capacity without tenant dependencies + try: + from sqlalchemy import text + # Simplified query using only existing production tables + simplified_query = text(""" + SELECT + pb.tenant_id, + DATE(pb.planned_start_time) as planned_date, + COUNT(*) as batch_count, + SUM(pb.planned_quantity) as total_planned, + 'capacity_check' as capacity_status + FROM production_batches pb + WHERE pb.planned_start_time >= CURRENT_DATE + AND pb.planned_start_time <= CURRENT_DATE + INTERVAL '3 days' + AND pb.status IN ('planned', 'pending', 'in_progress') + GROUP BY pb.tenant_id, DATE(pb.planned_start_time) + HAVING COUNT(*) > 10 -- Alert if more than 10 batches per day + ORDER BY total_planned DESC + """) + + async with self.db_manager.get_session() as session: + result = await session.execute(simplified_query) + capacity_issues = result.fetchall() + + for issue in capacity_issues: + await self._process_capacity_issue(issue.tenant_id, issue) + + except Exception as e: + logger.debug("Simplified capacity check failed", error=str(e)) + + except Exception as e: + # Skip capacity checks if tables don't exist (graceful degradation) + if "does not exist" in str(e): + logger.debug("Capacity check skipped - missing tables", error=str(e)) + else: + logger.error("Capacity check failed", error=str(e)) + 
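                # missing-table errors are downgraded to a debug-level skip above; only unexpected failures are counted here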
self._errors_count += 1 + + async def _process_capacity_issue(self, tenant_id: UUID, issue: Dict[str, Any]): + """Process capacity overload issue""" + try: + status = issue['capacity_status'] + percentage = issue['capacity_percentage'] + + if status == 'severe_overload': + template_data = self.format_spanish_message( + 'order_overload', + percentage=int(percentage - 100) + ) + + await self.publish_item(tenant_id, { + 'type': 'severe_capacity_overload', + 'severity': 'urgent', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'planned_date': issue['planned_date'].isoformat(), + 'capacity_percentage': float(percentage), + 'overload_percentage': float(percentage - 100), + 'equipment_count': issue['equipment_count'] + } + }, item_type='alert') + + elif status == 'overload': + severity = self.get_business_hours_severity('high') + + await self.publish_item(tenant_id, { + 'type': 'capacity_overload', + 'severity': severity, + 'title': f'⚠️ Capacidad Excedida: {percentage:.0f}%', + 'message': f'Producción planificada para {issue["planned_date"]} excede capacidad en {percentage-100:.0f}%.', + 'actions': ['Redistribuir cargas', 'Ampliar turnos', 'Subcontratar', 'Posponer pedidos'], + 'metadata': { + 'planned_date': issue['planned_date'].isoformat(), + 'capacity_percentage': float(percentage), + 'equipment_count': issue['equipment_count'] + } + }, item_type='alert') + + elif status == 'near_capacity': + severity = self.get_business_hours_severity('medium') + + await self.publish_item(tenant_id, { + 'type': 'near_capacity', + 'severity': severity, + 'title': f'📊 Cerca de Capacidad Máxima: {percentage:.0f}%', + 'message': f'Producción del {issue["planned_date"]} está al {percentage:.0f}% de capacidad. 
Monitorear de cerca.', + 'actions': ['Revisar planificación', 'Preparar contingencias', 'Optimizar eficiencia'], + 'metadata': { + 'planned_date': issue['planned_date'].isoformat(), + 'capacity_percentage': float(percentage) + } + }, item_type='alert') + + except Exception as e: + logger.error("Error processing capacity issue", error=str(e)) + + async def check_production_delays(self): + """Check for production delays (alerts)""" + try: + self._checks_performed += 1 + + # Simplified query without customer_orders dependency + query = """ + SELECT + pb.id, pb.tenant_id, pb.product_name, pb.batch_number, + pb.planned_end_time as planned_completion_time, pb.actual_start_time, + pb.actual_end_time as estimated_completion_time, pb.status, + EXTRACT(minutes FROM (NOW() - pb.planned_end_time)) as delay_minutes, + COALESCE(pb.priority::text, 'medium') as priority_level, + 1 as affected_orders -- Default to 1 since we can't count orders + FROM production_batches pb + WHERE pb.status IN ('in_progress', 'delayed') + AND ( + (pb.planned_end_time < NOW() AND pb.status = 'in_progress') + OR pb.status = 'delayed' + ) + AND pb.planned_end_time > NOW() - INTERVAL '24 hours' + ORDER BY + CASE COALESCE(pb.priority::text, 'medium') + WHEN 'urgent' THEN 1 WHEN 'high' THEN 2 ELSE 3 + END, + delay_minutes DESC + """ + + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query)) + delays = result.fetchall() + + for delay in delays: + await self._process_production_delay(delay) + + except Exception as e: + # Skip delay checks if tables don't exist (graceful degradation) + if "does not exist" in str(e): + logger.debug("Production delay check skipped - missing tables", error=str(e)) + else: + logger.error("Production delay check failed", error=str(e)) + self._errors_count += 1 + + async def _process_production_delay(self, delay: Dict[str, Any]): + """Process production delay""" + try: + delay_minutes = delay['delay_minutes'] + priority = delay['priority_level'] + affected_orders = delay['affected_orders'] + + # Determine severity based on delay time and priority + if delay_minutes > 120 or priority == 'urgent': + severity = 'urgent' + elif delay_minutes > 60 or priority == 'high': + severity = 'high' + elif delay_minutes > 30: + severity = 'medium' + else: + severity = 'low' + + template_data = self.format_spanish_message( + 'production_delay', + batch_name=f"{delay['product_name']} #{delay['batch_number']}", + delay_minutes=int(delay_minutes) + ) + + await self.publish_item(delay['tenant_id'], { + 'type': 'production_delay', + 'severity': severity, + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'batch_id': str(delay['id']), + 'product_name': delay['product_name'], + 'batch_number': delay['batch_number'], + 'delay_minutes': delay_minutes, + 'priority_level': priority, + 'affected_orders': affected_orders, + 'planned_completion': delay['planned_completion_time'].isoformat() + } + }, item_type='alert') + + except Exception as e: + logger.error("Error processing production delay", + batch_id=str(delay.get('id')), + error=str(e)) + + async def check_quality_issues(self): + """Check for quality control issues (alerts)""" + try: + self._checks_performed += 1 + + # Fixed query using actual quality_checks table structure + query = """ + SELECT + qc.id, qc.tenant_id, qc.batch_id, qc.check_type as test_type, + qc.quality_score as result_value, + qc.target_weight as 
min_acceptable, + (qc.target_weight * (1 + qc.tolerance_percentage/100)) as max_acceptable, + CASE + WHEN qc.pass_fail = false AND qc.defect_count > 5 THEN 'critical' + WHEN qc.pass_fail = false THEN 'major' + ELSE 'minor' + END as qc_severity, + qc.created_at, + pb.product_name, pb.batch_number, + COUNT(*) OVER (PARTITION BY qc.batch_id) as total_failures + FROM quality_checks qc + JOIN production_batches pb ON pb.id = qc.batch_id + WHERE qc.pass_fail = false -- Use pass_fail instead of status + AND qc.created_at > NOW() - INTERVAL '4 hours' + AND qc.corrective_action_needed = true -- Use this instead of acknowledged + ORDER BY + CASE + WHEN qc.pass_fail = false AND qc.defect_count > 5 THEN 1 + WHEN qc.pass_fail = false THEN 2 + ELSE 3 + END, + qc.created_at DESC + """ + + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query)) + quality_issues = result.fetchall() + + for issue in quality_issues: + await self._process_quality_issue(issue) + + except Exception as e: + # Skip quality checks if tables don't exist (graceful degradation) + if "does not exist" in str(e) or "column" in str(e).lower() and "does not exist" in str(e).lower(): + logger.debug("Quality check skipped - missing tables or columns", error=str(e)) + else: + logger.error("Quality check failed", error=str(e)) + self._errors_count += 1 + + async def _process_quality_issue(self, issue: Dict[str, Any]): + """Process quality control failure""" + try: + qc_severity = issue['qc_severity'] + total_failures = issue['total_failures'] + + # Map QC severity to alert severity + if qc_severity == 'critical' or total_failures > 2: + severity = 'urgent' + elif qc_severity == 'major': + severity = 'high' + else: + severity = 'medium' + + await self.publish_item(issue['tenant_id'], { + 'type': 'quality_control_failure', + 'severity': severity, + 'title': f'❌ Fallo Control Calidad: {issue["product_name"]}', + 'message': f'Lote {issue["batch_number"]} falló en {issue["test_type"]}. 
Valor: {issue["result_value"]} (rango: {issue["min_acceptable"]}-{issue["max_acceptable"]})', + 'actions': ['Revisar lote', 'Repetir prueba', 'Ajustar proceso', 'Documentar causa'], + 'metadata': { + 'quality_check_id': str(issue['id']), + 'batch_id': str(issue['batch_id']), + 'test_type': issue['test_type'], + 'result_value': float(issue['result_value']), + 'min_acceptable': float(issue['min_acceptable']), + 'max_acceptable': float(issue['max_acceptable']), + 'qc_severity': qc_severity, + 'total_failures': total_failures + } + }, item_type='alert') + + # Mark as acknowledged to avoid duplicates + await self.db_manager.execute( + "UPDATE quality_checks SET acknowledged = true WHERE id = $1", + issue['id'] + ) + + except Exception as e: + logger.error("Error processing quality issue", + quality_check_id=str(issue.get('id')), + error=str(e)) + + async def check_equipment_status(self): + """Check equipment status and failures (alerts)""" + # Equipment tables don't exist in production database - skip this check + logger.debug("Equipment check skipped - equipment tables not available in production database") + return + + async def _process_equipment_issue(self, equipment: Dict[str, Any]): + """Process equipment issue""" + try: + status = equipment['status'] + efficiency = equipment.get('efficiency_percent', 100) + days_to_maintenance = equipment.get('days_to_maintenance', 30) + + if status == 'error': + template_data = self.format_spanish_message( + 'equipment_failure', + equipment_name=equipment['name'] + ) + + await self.publish_item(equipment['tenant_id'], { + 'type': 'equipment_failure', + 'severity': 'urgent', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'equipment_id': str(equipment['id']), + 'equipment_name': equipment['name'], + 'equipment_type': equipment['type'], + 'error_count': equipment.get('error_count', 0), + 'last_reading': equipment.get('last_reading').isoformat() if equipment.get('last_reading') else None + } + }, item_type='alert') + + elif status == 'maintenance_required' or days_to_maintenance <= 1: + severity = 'high' if days_to_maintenance <= 1 else 'medium' + + await self.publish_item(equipment['tenant_id'], { + 'type': 'maintenance_required', + 'severity': severity, + 'title': f'🔧 Mantenimiento Requerido: {equipment["name"]}', + 'message': f'Equipo {equipment["name"]} requiere mantenimiento en {days_to_maintenance} días.', + 'actions': ['Programar mantenimiento', 'Revisar historial', 'Preparar repuestos', 'Planificar parada'], + 'metadata': { + 'equipment_id': str(equipment['id']), + 'days_to_maintenance': days_to_maintenance, + 'last_maintenance': equipment.get('last_maintenance').isoformat() if equipment.get('last_maintenance') else None + } + }, item_type='alert') + + elif efficiency < 80: + severity = 'medium' if efficiency < 70 else 'low' + + await self.publish_item(equipment['tenant_id'], { + 'type': 'low_equipment_efficiency', + 'severity': severity, + 'title': f'📉 Baja Eficiencia: {equipment["name"]}', + 'message': f'Eficiencia del {equipment["name"]} bajó a {efficiency:.1f}%. 
Revisar funcionamiento.', + 'actions': ['Revisar configuración', 'Limpiar equipo', 'Calibrar sensores', 'Revisar mantenimiento'], + 'metadata': { + 'equipment_id': str(equipment['id']), + 'efficiency_percent': float(efficiency), + 'temperature': equipment.get('temperature'), + 'vibration_level': equipment.get('vibration_level') + } + }, item_type='alert') + + except Exception as e: + logger.error("Error processing equipment issue", + equipment_id=str(equipment.get('id')), + error=str(e)) + + async def generate_efficiency_recommendations(self): + """Generate production efficiency recommendations""" + try: + self._checks_performed += 1 + + # Analyze production patterns for efficiency opportunities + query = """ + WITH efficiency_analysis AS ( + SELECT + pb.tenant_id, pb.product_name, + AVG(EXTRACT(minutes FROM (pb.actual_completion_time - pb.actual_start_time))) as avg_production_time, + AVG(pb.planned_duration_minutes) as avg_planned_duration, + COUNT(*) as batch_count, + AVG(pb.yield_percentage) as avg_yield, + EXTRACT(hour FROM pb.actual_start_time) as start_hour + FROM production_batches pb + WHERE pb.status = 'completed' + AND pb.actual_completion_time > CURRENT_DATE - INTERVAL '30 days' + AND pb.tenant_id = $1 + GROUP BY pb.tenant_id, pb.product_name, EXTRACT(hour FROM pb.actual_start_time) + HAVING COUNT(*) >= 3 + ), + recommendations AS ( + SELECT *, + CASE + WHEN avg_production_time > avg_planned_duration * 1.2 THEN 'reduce_production_time' + WHEN avg_yield < 85 THEN 'improve_yield' + WHEN start_hour BETWEEN 14 AND 16 AND avg_production_time > avg_planned_duration * 1.1 THEN 'avoid_afternoon_production' + ELSE null + END as recommendation_type, + (avg_production_time - avg_planned_duration) / avg_planned_duration * 100 as efficiency_loss_percent + FROM efficiency_analysis + ) + SELECT * FROM recommendations + WHERE recommendation_type IS NOT NULL + AND efficiency_loss_percent > 10 + ORDER BY efficiency_loss_percent DESC + """ + + tenants = await self.get_active_tenants() + + for tenant_id in tenants: + try: + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query), {"tenant_id": tenant_id}) + recommendations = result.fetchall() + + for rec in recommendations: + await self._generate_efficiency_recommendation(tenant_id, rec) + + except Exception as e: + logger.error("Error generating efficiency recommendations", + tenant_id=str(tenant_id), + error=str(e)) + + except Exception as e: + logger.error("Efficiency recommendations failed", error=str(e)) + self._errors_count += 1 + + async def _generate_efficiency_recommendation(self, tenant_id: UUID, rec: Dict[str, Any]): + """Generate specific efficiency recommendation""" + try: + if not self.should_send_recommendation(tenant_id, rec['recommendation_type']): + return + + rec_type = rec['recommendation_type'] + efficiency_loss = rec['efficiency_loss_percent'] + + if rec_type == 'reduce_production_time': + template_data = self.format_spanish_message( + 'production_efficiency', + suggested_time=f"{rec['start_hour']:02d}:00", + savings_percent=efficiency_loss + ) + + await self.publish_item(tenant_id, { + 'type': 'production_efficiency', + 'severity': 'medium', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'product_name': rec['product_name'], + 'avg_production_time': float(rec['avg_production_time']), + 'avg_planned_duration': float(rec['avg_planned_duration']), + 'efficiency_loss_percent': 
float(efficiency_loss), + 'batch_count': rec['batch_count'], + 'recommendation_type': rec_type + } + }, item_type='recommendation') + + elif rec_type == 'improve_yield': + await self.publish_item(tenant_id, { + 'type': 'yield_improvement', + 'severity': 'medium', + 'title': f'📈 Mejorar Rendimiento: {rec["product_name"]}', + 'message': f'Rendimiento promedio del {rec["product_name"]} es {rec["avg_yield"]:.1f}%. Oportunidad de mejora.', + 'actions': ['Revisar receta', 'Optimizar proceso', 'Entrenar personal', 'Verificar ingredientes'], + 'metadata': { + 'product_name': rec['product_name'], + 'avg_yield': float(rec['avg_yield']), + 'batch_count': rec['batch_count'], + 'recommendation_type': rec_type + } + }, item_type='recommendation') + + elif rec_type == 'avoid_afternoon_production': + await self.publish_item(tenant_id, { + 'type': 'schedule_optimization', + 'severity': 'low', + 'title': f'⏰ Optimizar Horario: {rec["product_name"]}', + 'message': f'Producción de {rec["product_name"]} en horario {rec["start_hour"]}:00 muestra menor eficiencia.', + 'actions': ['Cambiar horario', 'Analizar causas', 'Revisar personal', 'Optimizar ambiente'], + 'metadata': { + 'product_name': rec['product_name'], + 'start_hour': rec['start_hour'], + 'efficiency_loss_percent': float(efficiency_loss), + 'recommendation_type': rec_type + } + }, item_type='recommendation') + + except Exception as e: + logger.error("Error generating efficiency recommendation", + product_name=rec.get('product_name'), + error=str(e)) + + async def generate_energy_recommendations(self): + """Generate energy optimization recommendations""" + try: + # Analyze energy consumption patterns + query = """ + SELECT + e.tenant_id, e.name as equipment_name, e.type, + AVG(ec.energy_consumption_kwh) as avg_energy, + EXTRACT(hour FROM ec.recorded_at) as hour_of_day, + COUNT(*) as readings_count + FROM equipment e + JOIN energy_consumption ec ON ec.equipment_id = e.id + WHERE ec.recorded_at > CURRENT_DATE - INTERVAL '30 days' + AND e.tenant_id = $1 + GROUP BY e.tenant_id, e.id, EXTRACT(hour FROM ec.recorded_at) + HAVING COUNT(*) >= 10 + ORDER BY avg_energy DESC + """ + + tenants = await self.get_active_tenants() + + for tenant_id in tenants: + try: + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result = await session.execute(text(query), {"tenant_id": tenant_id}) + energy_data = result.fetchall() + + # Analyze for peak hours and optimization opportunities + await self._analyze_energy_patterns(tenant_id, energy_data) + + except Exception as e: + logger.error("Error generating energy recommendations", + tenant_id=str(tenant_id), + error=str(e)) + + except Exception as e: + logger.error("Energy recommendations failed", error=str(e)) + self._errors_count += 1 + + async def _analyze_energy_patterns(self, tenant_id: UUID, energy_data: List[Dict[str, Any]]): + """Analyze energy consumption patterns for optimization""" + try: + if not energy_data: + return + + # Group by equipment and find peak hours + equipment_data = {} + for record in energy_data: + equipment = record['equipment_name'] + if equipment not in equipment_data: + equipment_data[equipment] = [] + equipment_data[equipment].append(record) + + for equipment, records in equipment_data.items(): + # Find peak consumption hours + peak_hour_record = max(records, key=lambda x: x['avg_energy']) + off_peak_records = [r for r in records if r['hour_of_day'] < 7 or r['hour_of_day'] > 22] + + if off_peak_records and peak_hour_record['avg_energy'] > 0: + min_off_peak = 
min(off_peak_records, key=lambda x: x['avg_energy']) + potential_savings = ((peak_hour_record['avg_energy'] - min_off_peak['avg_energy']) / + peak_hour_record['avg_energy']) * 100 + + if potential_savings > 15: # More than 15% potential savings + template_data = self.format_spanish_message( + 'energy_optimization', + start_time=f"{min_off_peak['hour_of_day']:02d}:00", + end_time=f"{min_off_peak['hour_of_day']+2:02d}:00", + savings_euros=potential_savings * 0.15 # Rough estimate + ) + + await self.publish_item(tenant_id, { + 'type': 'energy_optimization', + 'severity': 'low', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'equipment_name': equipment, + 'peak_hour': peak_hour_record['hour_of_day'], + 'optimal_hour': min_off_peak['hour_of_day'], + 'potential_savings_percent': float(potential_savings), + 'peak_consumption': float(peak_hour_record['avg_energy']), + 'optimal_consumption': float(min_off_peak['avg_energy']) + } + }, item_type='recommendation') + + except Exception as e: + logger.error("Error analyzing energy patterns", error=str(e)) + + async def register_db_listeners(self, conn): + """Register production-specific database listeners""" + try: + await conn.add_listener('production_alerts', self.handle_production_db_alert) + + logger.info("Database listeners registered", + service=self.config.SERVICE_NAME) + except Exception as e: + logger.error("Failed to register database listeners", + service=self.config.SERVICE_NAME, + error=str(e)) + + async def handle_production_db_alert(self, connection, pid, channel, payload): + """Handle production alert from database trigger""" + try: + data = json.loads(payload) + tenant_id = UUID(data['tenant_id']) + + template_data = self.format_spanish_message( + 'production_delay', + batch_name=f"{data['product_name']} #{data.get('batch_number', 'N/A')}", + delay_minutes=data['delay_minutes'] + ) + + await self.publish_item(tenant_id, { + 'type': 'production_delay', + 'severity': 'high', + 'title': template_data['title'], + 'message': template_data['message'], + 'actions': template_data['actions'], + 'metadata': { + 'batch_id': data['batch_id'], + 'delay_minutes': data['delay_minutes'], + 'trigger_source': 'database' + } + }, item_type='alert') + + except Exception as e: + logger.error("Error handling production DB alert", error=str(e)) + + async def start_event_listener(self): + """Listen for production-affecting events""" + try: + # Subscribe to inventory events that might affect production + await self.rabbitmq_client.consume_events( + "bakery_events", + f"production.inventory.{self.config.SERVICE_NAME}", + "inventory.critical_shortage", + self.handle_inventory_shortage + ) + + logger.info("Event listeners started", + service=self.config.SERVICE_NAME) + except Exception as e: + logger.error("Failed to start event listeners", + service=self.config.SERVICE_NAME, + error=str(e)) + + async def handle_inventory_shortage(self, message): + """Handle critical inventory shortage affecting production""" + try: + shortage = json.loads(message.body) + tenant_id = UUID(shortage['tenant_id']) + + # Check if this ingredient affects any current production + affected_batches = await self.get_affected_production_batches( + shortage['ingredient_id'] + ) + + if affected_batches: + await self.publish_item(tenant_id, { + 'type': 'production_ingredient_shortage', + 'severity': 'high', + 'title': f'🚨 Falta Ingrediente para Producción', + 'message': f'Escasez de 
{shortage["ingredient_name"]} afecta {len(affected_batches)} lotes en producción.', + 'actions': ['Buscar ingrediente alternativo', 'Pausar producción', 'Contactar proveedor urgente', 'Reorganizar plan'], + 'metadata': { + 'ingredient_id': shortage['ingredient_id'], + 'ingredient_name': shortage['ingredient_name'], + 'affected_batches': [str(b) for b in affected_batches], + 'shortage_amount': shortage.get('shortage_amount', 0) + } + }, item_type='alert') + + except Exception as e: + logger.error("Error handling inventory shortage event", error=str(e)) + + async def get_affected_production_batches(self, ingredient_id: str) -> List[str]: + """Get production batches affected by ingredient shortage""" + try: + query = """ + SELECT DISTINCT pb.id + FROM production_batches pb + JOIN recipe_ingredients ri ON ri.recipe_id = pb.recipe_id + WHERE ri.ingredient_id = $1 + AND pb.status IN ('planned', 'in_progress') + AND pb.planned_completion_time > NOW() + """ + + from sqlalchemy import text + async with self.db_manager.get_session() as session: + result_rows = await session.execute(text(query), {"ingredient_id": ingredient_id}) + result = result_rows.fetchall() + return [str(row['id']) for row in result] + + except Exception as e: + logger.error("Error getting affected production batches", + ingredient_id=ingredient_id, + error=str(e)) + return [] \ No newline at end of file diff --git a/services/production/requirements.txt b/services/production/requirements.txt index 6c5044b2..5363529e 100644 --- a/services/production/requirements.txt +++ b/services/production/requirements.txt @@ -15,6 +15,14 @@ httpx==0.25.2 # Logging and monitoring structlog==23.2.0 +prometheus-client==0.19.0 + +# Message queues and Redis +aio-pika==9.3.1 +redis>=4.0.0 + +# Scheduling +APScheduler==3.10.4 # Date and time utilities python-dateutil==2.8.2 diff --git a/services/training/README.md b/services/training/README.md deleted file mode 100644 index aa6f552e..00000000 --- a/services/training/README.md +++ /dev/null @@ -1,220 +0,0 @@ -## 🎯 **Migration Summary: Prophet Models to Training Service** (Continued) - -### **✅ What Was Migrated** - -Perfect! I've created a complete migration of the Prophet ML training functionality from the monolithic backend to a dedicated training microservice. Here's a comprehensive summary of what has been implemented: - -## 🎯 **Migration Summary: Prophet Models to Training Service** - -### **✅ What Was Migrated** - -1. **Prophet Manager** (`prophet_manager.py`): - - Enhanced model training with bakery-specific configurations - - Spanish holidays integration - - Advanced model persistence and metadata storage - - Training metrics calculation - -2. **ML Trainer** (`trainer.py`): - - Complete training orchestration for multiple products - - Single product training capability - - Model performance evaluation - - Async-first design replacing Celery complexity - -3. **Data Processor** (`data_processor.py`): - - Advanced feature engineering for bakery forecasting - - Weather and traffic data integration - - Spanish holiday and school calendar detection - - Temporal feature extraction - -4. **API Layer** (`training.py`): - - RESTful endpoints for training job management - - Real-time progress tracking - - Job cancellation and status monitoring - - Data validation before training - -5. 
**Database Models** (`training.py`): - - `ModelTrainingLog`: Job execution tracking - - `TrainedModel`: Model registry and versioning - - `ModelPerformanceMetric`: Performance monitoring - - `TrainingJobQueue`: Job scheduling system - -6. **Service Layer** (`training_service.py`): - - Business logic orchestration - - External service integration (data service) - - Job lifecycle management - - Error handling and recovery - -7. **Messaging Integration** (`messaging.py`): - - Event-driven architecture with RabbitMQ - - Inter-service communication - - Real-time notifications - - Event publishing for other services - -### **🔧 Key Improvements Over Old System** - -#### **1. Eliminated Celery Complexity** -- **Before**: Complex Celery worker setup with sync/async mixing -- **After**: Pure async implementation with FastAPI background tasks - -#### **2. Better Error Handling** -- **Before**: Celery task failures were hard to debug -- **After**: Detailed error tracking and recovery mechanisms - -#### **3. Real-Time Progress Tracking** -- **Before**: Limited visibility into training progress -- **After**: Real-time updates with detailed step-by-step progress - -#### **4. Service Isolation** -- **Before**: Training tightly coupled with main application -- **After**: Independent service that can scale separately - -#### **5. Enhanced Model Management** -- **Before**: Basic model storage in filesystem -- **After**: Complete model lifecycle with versioning and metadata - -### **🚀 New Capabilities** - -#### **1. Advanced Training Features** -```python -# Support for different training modes -await trainer.train_tenant_models(...) # All products -await trainer.train_single_product(...) # Single product -await trainer.evaluate_model_performance(...) # Performance evaluation -``` - -#### **2. Real-Time Job Management** -```python -# Job lifecycle management -POST /training/jobs # Start training -GET /training/jobs/{id}/status # Get progress -POST /training/jobs/{id}/cancel # Cancel job -GET /training/jobs/{id}/logs # View detailed logs -``` - -#### **3. Data Validation** -```python -# Pre-training validation -POST /training/validate # Check data quality before training -``` - -#### **4. Event-Driven Architecture** -```python -# Automatic event publishing -await publish_job_started(job_id, tenant_id, config) -await publish_job_completed(job_id, tenant_id, results) -await publish_model_trained(model_id, tenant_id, product_name, metrics) -``` - -### **📊 Performance Improvements** - -#### **1. Faster Training Startup** -- **Before**: 30-60 seconds Celery worker initialization -- **After**: <5 seconds direct async execution - -#### **2. Better Resource Utilization** -- **Before**: Fixed Celery worker pools -- **After**: Dynamic scaling based on demand - -#### **3. Improved Memory Management** -- **Before**: Memory leaks in long-running Celery workers -- **After**: Clean memory usage with proper cleanup - -### **🔒 Enhanced Security & Monitoring** - -#### **1. Authentication Integration** -```python -# Secure endpoints with tenant isolation -@router.post("/jobs") -async def start_training_job( - request: TrainingJobRequest, - tenant_id: str = Depends(get_current_tenant_id) # Automatic tenant isolation -): -``` - -#### **2. Comprehensive Monitoring** -```python -# Built-in metrics collection -metrics.increment_counter("training_jobs_started") -metrics.increment_counter("training_jobs_completed") -metrics.increment_counter("training_jobs_failed") -``` - -#### **3. 
Detailed Logging** -```python -# Structured logging with context -logger.info(f"Training job {job_id} completed successfully", - extra={"tenant_id": tenant_id, "models_trained": count}) -``` - -### **🔄 Integration with Existing Architecture** - -#### **1. Seamless API Integration** -The new training service integrates perfectly with the existing gateway: - -```yaml -# API Gateway routes to training service -/api/training/* → http://training-service:8000/ -``` - -#### **2. Event-Driven Communication** -```python -# Other services can listen to training events -"training.job.completed" → forecasting-service (update models) -"training.job.completed" → notification-service (send alerts) -"training.model.updated" → tenant-service (update quotas) -``` - -#### **3. Database Independence** -- Training service has its own PostgreSQL database -- Clean separation from other service data -- Easy to scale and backup independently - -### **📦 Deployment Ready** - -#### **1. Docker Configuration** -- Optimized Dockerfile with proper security -- Non-root user execution -- Health checks included - -#### **2. Requirements Management** -- Pinned dependency versions -- Separated development/production requirements -- Prophet and ML libraries properly configured - -#### **3. Environment Configuration** -```python -# Flexible configuration management -MODEL_STORAGE_PATH=/app/models -MAX_TRAINING_TIME_MINUTES=30 -MIN_TRAINING_DATA_DAYS=30 -PROPHET_SEASONALITY_MODE=additive -``` - -### **🎯 Migration Benefits Summary** - -| Aspect | Before (Celery) | After (Microservice) | -|--------|----------------|----------------------| -| **Startup Time** | 30-60 seconds | <5 seconds | -| **Error Handling** | Basic | Comprehensive | -| **Progress Tracking** | Limited | Real-time | -| **Scalability** | Fixed workers | Dynamic scaling | -| **Debugging** | Difficult | Easy with logs | -| **Testing** | Complex | Simple unit tests | -| **Deployment** | Monolithic | Independent | -| **Monitoring** | Basic | Full observability | - -### **🔧 Ready for Production** - -This training service is **production-ready** and provides: - -1. **Robust Error Handling**: Graceful failure recovery -2. **Horizontal Scaling**: Can run multiple instances -3. **Performance Monitoring**: Built-in metrics and health checks -4. **Security**: Proper authentication and tenant isolation -5. **Maintainability**: Clean code structure and comprehensive tests - -### **🚀 Next Steps** - -The training service is now ready to be integrated into your microservices architecture. It completely replaces the old Celery-based training system while providing significant improvements in reliability, performance, and maintainability. - -The implementation follows all the microservices best practices and integrates seamlessly with the broader platform architecture you're building for the Madrid bakery forecasting system. 
\ No newline at end of file diff --git a/shared/alerts/__init__.py b/shared/alerts/__init__.py new file mode 100644 index 00000000..bbb7a0fd --- /dev/null +++ b/shared/alerts/__init__.py @@ -0,0 +1 @@ +# shared/alerts/__init__.py \ No newline at end of file diff --git a/shared/alerts/base_service.py b/shared/alerts/base_service.py new file mode 100644 index 00000000..62b50a4d --- /dev/null +++ b/shared/alerts/base_service.py @@ -0,0 +1,353 @@ +# shared/alerts/base_service.py +""" +Base alert service pattern for all microservices +Supports both alerts and recommendations through unified detection patterns +""" + +import asyncio +import json +import uuid +from typing import List, Dict, Any, Optional +from uuid import UUID +from datetime import datetime, timedelta +import structlog +from redis.asyncio import Redis +from apscheduler.schedulers.asyncio import AsyncIOScheduler +from apscheduler.triggers.cron import CronTrigger + +from shared.messaging.rabbitmq import RabbitMQClient +from shared.database.base import DatabaseManager +from shared.config.rabbitmq_config import get_routing_key + +logger = structlog.get_logger() + +class BaseAlertService: + """ + Base class for service-specific alert and recommendation detection + Implements hybrid detection patterns: scheduled jobs, event-driven, and database triggers + """ + + def __init__(self, config): + self.config = config + self.db_manager = DatabaseManager(config.DATABASE_URL) + self.rabbitmq_client = RabbitMQClient(config.RABBITMQ_URL, config.SERVICE_NAME) + self.redis = None + self.scheduler = AsyncIOScheduler() + self.is_leader = False + self.exchange = "alerts.exchange" + + # Metrics + self._items_published = 0 + self._checks_performed = 0 + self._errors_count = 0 + + async def start(self): + """Initialize all detection mechanisms""" + try: + # Connect to Redis for leader election and deduplication + self.redis = await Redis.from_url(self.config.REDIS_URL) + logger.info("Connected to Redis", service=self.config.SERVICE_NAME) + + # Connect to RabbitMQ + await self.rabbitmq_client.connect() + logger.info("Connected to RabbitMQ", service=self.config.SERVICE_NAME) + + # Start leader election for scheduled jobs + asyncio.create_task(self.maintain_leadership()) + + # Setup scheduled checks (runs only on leader) + self.setup_scheduled_checks() + + # Start database listener (runs on all instances) + await self.start_database_listener() + + # Start event listener (runs on all instances) + await self.start_event_listener() + + logger.info("Alert service started", service=self.config.SERVICE_NAME) + + except Exception as e: + logger.error("Failed to start alert service", service=self.config.SERVICE_NAME, error=str(e)) + raise + + async def stop(self): + """Clean shutdown""" + try: + # Stop scheduler + if self.scheduler.running: + self.scheduler.shutdown() + + # Close connections + if self.redis: + await self.redis.aclose() # Use aclose() for modern Redis client + + await self.rabbitmq_client.disconnect() + + logger.info("Alert service stopped", service=self.config.SERVICE_NAME) + + except Exception as e: + logger.error("Error stopping alert service", service=self.config.SERVICE_NAME, error=str(e)) + + # PATTERN 1: Scheduled Background Jobs + def setup_scheduled_checks(self): + """Configure scheduled alert checks - Override in service""" + raise NotImplementedError("Subclasses must implement setup_scheduled_checks") + + async def maintain_leadership(self): + """Leader election for scheduled jobs""" + lock_key = 
f"scheduler_lock:{self.config.SERVICE_NAME}" + lock_ttl = 60 + + while True: + try: + instance_id = getattr(self.config, 'INSTANCE_ID', 'default') + was_leader = self.is_leader + + # Try to acquire new leadership if not currently leader + if not self.is_leader: + result = await self.redis.set( + lock_key, + instance_id, + ex=lock_ttl, + nx=True + ) + self.is_leader = result is not None + else: + # Already leader - try to extend the lock + current_value = await self.redis.get(lock_key) + if current_value and current_value.decode() == instance_id: + # Still our lock, extend it + await self.redis.expire(lock_key, lock_ttl) + self.is_leader = True + else: + # Lock expired or taken by someone else + self.is_leader = False + + # Handle leadership changes + if self.is_leader and not was_leader: + self.scheduler.start() + logger.info("Acquired scheduler leadership", service=self.config.SERVICE_NAME) + elif not self.is_leader and was_leader: + self.scheduler.shutdown() + logger.info("Lost scheduler leadership", service=self.config.SERVICE_NAME) + + await asyncio.sleep(lock_ttl // 2) + + except Exception as e: + logger.error("Leadership error", service=self.config.SERVICE_NAME, error=str(e)) + self.is_leader = False + await asyncio.sleep(5) + + # PATTERN 2: Event-Driven Detection + async def start_event_listener(self): + """Listen for business events - Override in service""" + pass + + # PATTERN 3: Database Triggers + async def start_database_listener(self): + """Listen for database notifications""" + try: + import asyncpg + # Convert SQLAlchemy URL format to plain PostgreSQL for asyncpg + database_url = self.config.DATABASE_URL + if database_url.startswith('postgresql+asyncpg://'): + database_url = database_url.replace('postgresql+asyncpg://', 'postgresql://') + + conn = await asyncpg.connect(database_url) + + # Register listeners based on service + await self.register_db_listeners(conn) + + logger.info("Database listeners registered", service=self.config.SERVICE_NAME) + + except Exception as e: + logger.error("Failed to setup database listeners", service=self.config.SERVICE_NAME, error=str(e)) + + async def register_db_listeners(self, conn): + """Register database listeners - Override in service""" + pass + + # Publishing (Updated for type) + async def publish_item(self, tenant_id: UUID, item: Dict[str, Any], item_type: str = 'alert'): + """Publish alert or recommendation to RabbitMQ with deduplication""" + + try: + # Check for duplicate + item_key = f"{tenant_id}:{item_type}:{item['type']}:{item.get('metadata', {}).get('id', '')}" + if await self.is_duplicate_item(item_key): + logger.debug("Duplicate item skipped", + service=self.config.SERVICE_NAME, + item_type=item_type, + alert_type=item['type']) + return False + + # Add metadata + item['id'] = str(uuid.uuid4()) + item['tenant_id'] = str(tenant_id) + item['service'] = self.config.SERVICE_NAME + item['timestamp'] = datetime.utcnow().isoformat() + item['item_type'] = item_type # 'alert' or 'recommendation' + + # Determine routing key based on severity and type + routing_key = get_routing_key(item_type, item['severity'], self.config.SERVICE_NAME) + + # Publish to RabbitMQ + success = await self.rabbitmq_client.publish_event( + exchange_name=self.exchange, + routing_key=routing_key, + event_data=item + ) + + if success: + self._items_published += 1 + logger.info("Item published successfully", + service=self.config.SERVICE_NAME, + item_type=item_type, + alert_type=item['type'], + severity=item['severity'], + routing_key=routing_key) + else: + 
self._errors_count += 1 + logger.error("Failed to publish item", + service=self.config.SERVICE_NAME, + item_type=item_type, + alert_type=item['type']) + + return success + + except Exception as e: + self._errors_count += 1 + logger.error("Error publishing item", + service=self.config.SERVICE_NAME, + error=str(e), + item_type=item_type) + return False + + async def is_duplicate_item(self, item_key: str, window_minutes: int = 15) -> bool: + """Prevent duplicate items within time window""" + key = f"item_sent:{item_key}" + try: + result = await self.redis.set( + key, "1", + ex=window_minutes * 60, + nx=True + ) + return result is None # None means duplicate + except Exception as e: + logger.error("Error checking duplicate", error=str(e)) + return False # Allow publishing if check fails + + # Helper methods + async def get_active_tenants(self) -> List[UUID]: + """Get list of active tenant IDs""" + try: + from sqlalchemy import text + query = text("SELECT DISTINCT tenant_id FROM tenants WHERE status = 'active'") + async with self.db_manager.get_session() as session: + result = await session.execute(query) + return [row.tenant_id for row in result.fetchall()] + except Exception as e: + # If tenants table doesn't exist, skip tenant-based processing + if "does not exist" in str(e): + logger.debug("Tenants table not found, skipping tenant-based alert processing") + return [] + else: + logger.error("Error fetching active tenants", error=str(e)) + return [] + + async def get_tenant_config(self, tenant_id: UUID) -> Dict[str, Any]: + """Get tenant-specific configuration""" + try: + from sqlalchemy import text + query = text("SELECT config FROM tenants WHERE tenant_id = :tenant_id") + async with self.db_manager.get_session() as session: + result = await session.execute(query, {"tenant_id": tenant_id}) + row = result.fetchone() + return json.loads(row.config) if row and row.config else {} + except Exception as e: + logger.error("Error fetching tenant config", tenant_id=str(tenant_id), error=str(e)) + return {} + + # Health and metrics + def get_metrics(self) -> Dict[str, Any]: + """Get service metrics""" + return { + "items_published": self._items_published, + "checks_performed": self._checks_performed, + "errors_count": self._errors_count, + "is_leader": self.is_leader, + "scheduler_running": self.scheduler.running, + "redis_connected": self.redis and not self.redis.closed, + "rabbitmq_connected": self.rabbitmq_client.connected if self.rabbitmq_client else False + } + + async def health_check(self) -> Dict[str, Any]: + """Comprehensive health check""" + try: + # Check Redis + redis_healthy = False + if self.redis and not self.redis.closed: + await self.redis.ping() + redis_healthy = True + + # Check RabbitMQ + rabbitmq_healthy = self.rabbitmq_client.connected if self.rabbitmq_client else False + + # Check database + db_healthy = False + try: + from sqlalchemy import text + async with self.db_manager.get_session() as session: + await session.execute(text("SELECT 1")) + db_healthy = True + except: + pass + + status = "healthy" if all([redis_healthy, rabbitmq_healthy, db_healthy]) else "unhealthy" + + return { + "status": status, + "service": self.config.SERVICE_NAME, + "components": { + "redis": "healthy" if redis_healthy else "unhealthy", + "rabbitmq": "healthy" if rabbitmq_healthy else "unhealthy", + "database": "healthy" if db_healthy else "unhealthy", + "scheduler": "running" if self.scheduler.running else "stopped" + }, + "metrics": self.get_metrics() + } + + except Exception as e: + return { + 
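                # degraded response: the health check itself raised, so report the exception instead of component detail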
"status": "error", + "service": self.config.SERVICE_NAME, + "error": str(e) + } + + +class AlertServiceMixin: + """Mixin providing common alert helper methods""" + + def format_spanish_message(self, template_key: str, **kwargs) -> Dict[str, Any]: + """Format Spanish alert message""" + from shared.alerts.templates import format_item_message + return format_item_message(template_key, 'es', **kwargs) + + def get_business_hours_severity(self, base_severity: str) -> str: + """Adjust severity based on business hours""" + current_hour = datetime.now().hour + + # Reduce non-critical severity outside business hours (7-20) + if not (7 <= current_hour <= 20): + if base_severity == 'medium': + return 'low' + elif base_severity == 'high' and current_hour < 6 or current_hour > 22: + return 'medium' + + return base_severity + + def should_send_recommendation(self, tenant_id: UUID, rec_type: str) -> bool: + """Check if recommendation should be sent based on tenant preferences""" + # Implement tenant-specific recommendation frequency limits + # This is a simplified version + return True \ No newline at end of file diff --git a/shared/alerts/templates.py b/shared/alerts/templates.py new file mode 100644 index 00000000..1483a386 --- /dev/null +++ b/shared/alerts/templates.py @@ -0,0 +1,218 @@ +# shared/alerts/templates.py +""" +Alert and recommendation templates in Spanish for the bakery platform +""" + +from typing import Dict, Any + +ITEM_TEMPLATES = { + # ALERTS - Critical Issues Requiring Immediate Action + 'critical_stock_shortage': { + 'es': { + 'title': '🚨 Stock Crítico: {ingredient_name}', + 'message': 'Solo {current_stock}kg disponibles, necesarios {required_stock}kg para producción de mañana. Acción inmediata requerida.', + 'actions': ['Realizar pedido de emergencia', 'Contactar proveedor', 'Ajustar plan de producción'] + }, + 'en': { + 'title': '🚨 Critical Stock: {ingredient_name}', + 'message': 'Only {current_stock}kg available, {required_stock}kg needed for tomorrow\'s production. Immediate action required.', + 'actions': ['Place emergency order', 'Contact supplier', 'Adjust production plan'] + } + }, + 'temperature_breach': { + 'es': { + 'title': '🌡️ ALERTA TEMPERATURA', + 'message': '{location}: {temperature}°C durante {duration} minutos. Revisar productos inmediatamente para evitar deterioro.', + 'actions': ['Verificar productos', 'Llamar técnico refrigeración', 'Documentar incidencia', 'Mover productos'] + }, + 'en': { + 'title': '🌡️ TEMPERATURE ALERT', + 'message': '{location}: {temperature}°C for {duration} minutes. Check products immediately to prevent spoilage.', + 'actions': ['Check products', 'Call refrigeration technician', 'Document incident', 'Move products'] + } + }, + 'production_delay': { + 'es': { + 'title': '⏰ Retraso en Producción', + 'message': 'Lote {batch_name} con {delay_minutes} minutos de retraso. Impacto en entregas del día.', + 'actions': ['Acelerar producción', 'Notificar clientes', 'Reorganizar horarios', 'Buscar capacidad adicional'] + } + }, + 'expired_products': { + 'es': { + 'title': '📅 Productos Caducados', + 'message': '{product_count} productos han caducado hoy. Retirar inmediatamente por seguridad alimentaria.', + 'actions': ['Retirar productos', 'Revisar inventario', 'Ajustar pedidos', 'Documentar pérdidas'] + } + }, + 'equipment_failure': { + 'es': { + 'title': '⚙️ Fallo de Equipo', + 'message': '{equipment_name} no está funcionando correctamente. 
Producción afectada.', + 'actions': ['Parar producción', 'Llamar mantenimiento', 'Usar equipo alternativo', 'Documentar fallo'] + } + }, + 'order_overload': { + 'es': { + 'title': '📋 Sobrecarga de Pedidos', + 'message': 'Capacidad excedida en {percentage}%. Riesgo de no cumplir entregas.', + 'actions': ['Priorizar pedidos', 'Aumentar turnos', 'Rechazar nuevos pedidos', 'Buscar ayuda externa'] + } + }, + 'supplier_delay': { + 'es': { + 'title': '🚚 Retraso de Proveedor', + 'message': 'Entrega de {supplier_name} retrasada {hours} horas. Impacto en producción de {products}.', + 'actions': ['Contactar proveedor', 'Buscar alternativas', 'Ajustar producción', 'Usar stock reserva'] + } + }, + + # RECOMMENDATIONS - Proactive Suggestions for Optimization + 'inventory_optimization': { + 'es': { + 'title': '📈 Optimización de Stock: {ingredient_name}', + 'message': 'Basado en tendencias de {period} días, sugerimos aumentar stock mínimo en {suggested_increase}kg para reducir costos.', + 'actions': ['Revisar niveles mínimos', 'Analizar proveedores', 'Actualizar configuración', 'Programar pedido mayor'] + }, + 'en': { + 'title': '📈 Stock Optimization: {ingredient_name}', + 'message': 'Based on {period} day trends, suggest increasing minimum stock by {suggested_increase}kg to reduce costs.', + 'actions': ['Review minimum levels', 'Analyze suppliers', 'Update configuration', 'Schedule larger order'] + } + }, + 'production_efficiency': { + 'es': { + 'title': '⚙️ Mejora de Eficiencia', + 'message': 'Cambiar horarios de horneado a {suggested_time} puede reducir costos energéticos en {savings_percent}%.', + 'actions': ['Revisar horarios', 'Consultar personal', 'Probar nuevo horario', 'Medir resultados'] + } + }, + 'sales_opportunity': { + 'es': { + 'title': '💰 Oportunidad de Venta', + 'message': '{product_name} tiene alta demanda los {days}. Incrementar producción puede aumentar ventas {increase_percent}%.', + 'actions': ['Aumentar producción', 'Promocionar producto', 'Revisar precios', 'Planificar ingredientes'] + } + }, + 'seasonal_adjustment': { + 'es': { + 'title': '🍂 Ajuste Estacional', + 'message': 'Época de {season}: ajustar producción de {products} según patrones históricos.', + 'actions': ['Revisar recetas estacionales', 'Ajustar inventario', 'Planificar promociones', 'Entrenar personal'] + } + }, + 'cost_reduction': { + 'es': { + 'title': '💡 Reducción de Costos', + 'message': 'Cambiar a proveedor {supplier_name} para {ingredient} puede ahorrar {savings_euros}€/mes.', + 'actions': ['Evaluar calidad', 'Negociar precios', 'Probar muestras', 'Cambiar proveedor gradualmente'] + } + }, + 'waste_reduction': { + 'es': { + 'title': '♻️ Reducción de Desperdicio', + 'message': 'Ajustar tamaños de lote de {product} puede reducir desperdicio en {waste_reduction_percent}%.', + 'actions': ['Analizar ventas', 'Ajustar recetas', 'Cambiar lotes', 'Monitorear resultados'] + } + }, + 'quality_improvement': { + 'es': { + 'title': '⭐ Mejora de Calidad', + 'message': 'Temperatura de horneado de {product} puede optimizarse para mejor textura y sabor.', + 'actions': ['Probar temperaturas', 'Documentar cambios', 'Entrenar panaderos', 'Obtener feedback'] + } + }, + 'customer_satisfaction': { + 'es': { + 'title': '😊 Satisfacción del Cliente', + 'message': 'Clientes solicitan más {product} los {days}. 
Considerar aumentar disponibilidad.', + 'actions': ['Revisar comentarios', 'Aumentar producción', 'Crear promociones', 'Mejorar exhibición'] + } + }, + 'energy_optimization': { + 'es': { + 'title': '⚡ Optimización Energética', + 'message': 'Consolidar horneado entre {start_time} y {end_time} puede reducir costos energéticos {savings_euros}€/día.', + 'actions': ['Revisar horarios energía', 'Reorganizar producción', 'Optimizar hornos', 'Medir consumo'] + } + }, + 'staff_optimization': { + 'es': { + 'title': '👥 Optimización de Personal', + 'message': 'Picos de trabajo los {days} a las {hours}. Considerar ajustar turnos para mejor eficiencia.', + 'actions': ['Analizar cargas trabajo', 'Reorganizar turnos', 'Entrenar polivalencia', 'Contratar temporal'] + } + } +} + +def format_item_message(template_key: str, language: str, **kwargs) -> Dict[str, Any]: + """Format item message using template with validation""" + template = ITEM_TEMPLATES.get(template_key, {}).get(language, {}) + + if not template: + # Fallback for missing templates + return { + 'title': f'Notificación: {template_key}', + 'message': f'Información: {", ".join([f"{k}: {v}" for k, v in kwargs.items()])}', + 'actions': ['Revisar', 'Documentar'] + } + + try: + # Format with provided kwargs, handling missing values gracefully + formatted_title = template['title'].format(**kwargs) + formatted_message = template['message'].format(**kwargs) + + return { + 'title': formatted_title, + 'message': formatted_message, + 'actions': template.get('actions', []) + } + except KeyError as e: + # Handle missing format parameters + return { + 'title': template.get('title', f'Notificación: {template_key}'), + 'message': f"Error en plantilla - parámetro faltante: {e}. Datos: {kwargs}", + 'actions': template.get('actions', ['Revisar configuración']) + } + +def get_severity_emoji(severity: str) -> str: + """Get emoji for severity level""" + emoji_map = { + 'urgent': '🚨', + 'high': '⚠️', + 'medium': '💡', + 'low': 'ℹ️' + } + return emoji_map.get(severity, '📋') + +def get_item_type_emoji(item_type: str) -> str: + """Get emoji for item type""" + emoji_map = { + 'alert': '🚨', + 'recommendation': '💡' + } + return emoji_map.get(item_type, '📋') + +def format_business_time(hour: int) -> str: + """Format hour in Spanish business context""" + if hour == 0: + return "medianoche" + elif hour < 12: + return f"{hour}:00 AM" + elif hour == 12: + return "12:00 PM (mediodía)" + else: + return f"{hour-12}:00 PM" + +def get_spanish_day_name(day_number: int) -> str: + """Get Spanish day name (0=Monday)""" + days = ["lunes", "martes", "miércoles", "jueves", "viernes", "sábado", "domingo"] + return days[day_number] if 0 <= day_number <= 6 else "día desconocido" + +def format_currency(amount: float) -> str: + """Format currency in Spanish Euro format""" + return f"{amount:.2f}€" + +def format_percentage(value: float) -> str: + """Format percentage in Spanish format""" + return f"{value:.1f}%" \ No newline at end of file diff --git a/shared/config/rabbitmq_config.py b/shared/config/rabbitmq_config.py new file mode 100644 index 00000000..cd901fe6 --- /dev/null +++ b/shared/config/rabbitmq_config.py @@ -0,0 +1,82 @@ +# shared/config/rabbitmq_config.py +""" +RabbitMQ configuration for the alert and recommendation system +Supports both alerts and recommendations through a unified topic exchange +""" + +RABBITMQ_CONFIG = { + "exchanges": { + "alerts": { + "name": "alerts.exchange", + "type": "topic", + "durable": True, + "auto_delete": False + }, + "dead_letter": { + "name": 
"dlx.exchange", + "type": "direct", + "durable": True, + "auto_delete": False + } + }, + "queues": { + "alert_processing": { + "name": "alert.processing.queue", + "durable": True, + "arguments": { + "x-message-ttl": 3600000, # 1 hour TTL + "x-max-length": 10000, # Max 10k messages + "x-overflow": "reject-publish", + "x-dead-letter-exchange": "dlx.exchange", + "x-dead-letter-routing-key": "failed.items" + } + }, + "dead_letter": { + "name": "alert.dead_letter.queue", + "durable": True, + "arguments": { + "x-message-ttl": 86400000 # 24 hours for dead letters + } + } + }, + "bindings": [ + { + "queue": "alert.processing.queue", + "exchange": "alerts.exchange", + "routing_key": "*.*.*" # alert/recommendation.severity.service + }, + { + "queue": "alert.dead_letter.queue", + "exchange": "dlx.exchange", + "routing_key": "failed.items" + } + ], + "routing_patterns": { + # alert/recommendation.severity.service_name + "alert": "alert.{severity}.{service}", + "recommendation": "recommendation.{severity}.{service}", + "all_alerts": "alert.*.*", + "all_recommendations": "recommendation.*.*", + "urgent_items": "*.urgent.*", + "high_items": "*.high.*" + } +} + +def get_routing_key(item_type: str, severity: str, service: str) -> str: + """Generate routing key for item publishing""" + return f"{item_type}.{severity}.{service}" + +def get_binding_patterns(item_types: list = None, severities: list = None, services: list = None) -> list: + """Generate binding patterns for selective consumption""" + patterns = [] + + item_types = item_types or ["alert", "recommendation"] + severities = severities or ["urgent", "high", "medium", "low"] + services = services or ["*"] + + for item_type in item_types: + for severity in severities: + for service in services: + patterns.append(f"{item_type}.{severity}.{service}") + + return patterns \ No newline at end of file diff --git a/shared/database/utils.py b/shared/database/utils.py index 38da2e04..04a11c1e 100644 --- a/shared/database/utils.py +++ b/shared/database/utils.py @@ -112,7 +112,7 @@ class DatabaseUtils: "checked_in": pool.checkedin(), "checked_out": pool.checkedout(), "overflow": pool.overflow(), - "invalid": pool.invalid() + "status": pool.status() } else: return {"status": "no_pool"} diff --git a/shared/monitoring/alert_metrics.py b/shared/monitoring/alert_metrics.py new file mode 100644 index 00000000..a8b385fb --- /dev/null +++ b/shared/monitoring/alert_metrics.py @@ -0,0 +1,420 @@ +# shared/monitoring/alert_metrics.py +""" +Metrics and monitoring for the alert and recommendation system +Provides comprehensive metrics for tracking system performance and effectiveness +""" + +from prometheus_client import Counter, Histogram, Gauge, Summary, Info +from typing import Dict, Any +import time +from functools import wraps +import structlog + +logger = structlog.get_logger() + +# ================================================================= +# DETECTION METRICS +# ================================================================= + +# Alert and recommendation generation +items_published = Counter( + 'alert_items_published_total', + 'Total number of alerts and recommendations published', + ['service', 'item_type', 'severity', 'type'] +) + +item_checks_performed = Counter( + 'alert_checks_performed_total', + 'Total number of alert checks performed', + ['service', 'check_type', 'pattern'] +) + +item_check_duration = Histogram( + 'alert_check_duration_seconds', + 'Time taken to perform alert checks', + ['service', 'check_type'], + buckets=[0.1, 0.5, 1, 2, 5, 10, 30, 
60] +) + +alert_detection_errors = Counter( + 'alert_detection_errors_total', + 'Total number of errors during alert detection', + ['service', 'error_type', 'check_type'] +) + +# Deduplication metrics +duplicate_items_prevented = Counter( + 'duplicate_items_prevented_total', + 'Number of duplicate alerts/recommendations prevented', + ['service', 'item_type', 'type'] +) + +# ================================================================= +# PROCESSING METRICS +# ================================================================= + +# Alert processor metrics +items_processed = Counter( + 'alert_items_processed_total', + 'Total number of items processed by alert processor', + ['item_type', 'severity', 'type', 'status'] +) + +item_processing_duration = Histogram( + 'alert_processing_duration_seconds', + 'Time taken to process alerts/recommendations', + ['item_type', 'severity'], + buckets=[0.01, 0.05, 0.1, 0.5, 1, 2, 5] +) + +database_storage_duration = Histogram( + 'alert_database_storage_duration_seconds', + 'Time taken to store items in database', + buckets=[0.01, 0.05, 0.1, 0.5, 1] +) + +processing_errors = Counter( + 'alert_processing_errors_total', + 'Total number of processing errors', + ['error_type', 'item_type'] +) + +# ================================================================= +# DELIVERY METRICS +# ================================================================= + +# Notification delivery +notifications_sent = Counter( + 'alert_notifications_sent_total', + 'Total notifications sent through all channels', + ['channel', 'item_type', 'severity', 'status'] +) + +notification_delivery_duration = Histogram( + 'alert_notification_delivery_duration_seconds', + 'Time from item generation to delivery', + ['item_type', 'severity', 'channel'], + buckets=[0.1, 0.5, 1, 5, 10, 30, 60] +) + +delivery_failures = Counter( + 'alert_delivery_failures_total', + 'Failed notification deliveries', + ['channel', 'item_type', 'error_type'] +) + +# Channel-specific metrics +email_notifications = Counter( + 'alert_email_notifications_total', + 'Email notifications sent', + ['status', 'item_type'] +) + +whatsapp_notifications = Counter( + 'alert_whatsapp_notifications_total', + 'WhatsApp notifications sent', + ['status', 'item_type'] +) + +sse_events_sent = Counter( + 'alert_sse_events_sent_total', + 'SSE events sent to dashboard', + ['tenant', 'event_type', 'item_type'] +) + +# ================================================================= +# SSE METRICS +# ================================================================= + +# SSE connection metrics +sse_active_connections = Gauge( + 'alert_sse_active_connections', + 'Number of active SSE connections', + ['tenant_id'] +) + +sse_connection_duration = Histogram( + 'alert_sse_connection_duration_seconds', + 'Duration of SSE connections', + buckets=[10, 30, 60, 300, 600, 1800, 3600] +) + +sse_message_queue_size = Gauge( + 'alert_sse_message_queue_size', + 'Current size of SSE message queues', + ['tenant_id'] +) + +sse_connection_errors = Counter( + 'alert_sse_connection_errors_total', + 'SSE connection errors', + ['error_type', 'tenant_id'] +) + +# ================================================================= +# SYSTEM HEALTH METRICS +# ================================================================= + +# Active items gauge +active_items_gauge = Gauge( + 'alert_active_items_current', + 'Current number of active alerts and recommendations', + ['tenant_id', 'item_type', 'severity'] +) + +# System component health +system_component_health = Gauge( + 
'alert_system_component_health', + 'Health status of alert system components (1=healthy, 0=unhealthy)', + ['component', 'service'] +) + +# Leader election status +scheduler_leader_status = Gauge( + 'alert_scheduler_leader_status', + 'Leader election status for schedulers (1=leader, 0=follower)', + ['service'] +) + +# Message queue health +rabbitmq_connection_status = Gauge( + 'alert_rabbitmq_connection_status', + 'RabbitMQ connection status (1=connected, 0=disconnected)', + ['service'] +) + +redis_connection_status = Gauge( + 'alert_redis_connection_status', + 'Redis connection status (1=connected, 0=disconnected)', + ['service'] +) + +# ================================================================= +# BUSINESS METRICS +# ================================================================= + +# Alert response metrics +items_acknowledged = Counter( + 'alert_items_acknowledged_total', + 'Number of items acknowledged by users', + ['item_type', 'severity', 'service'] +) + +items_resolved = Counter( + 'alert_items_resolved_total', + 'Number of items resolved by users', + ['item_type', 'severity', 'service'] +) + +item_response_time = Histogram( + 'alert_item_response_time_seconds', + 'Time from item creation to acknowledgment', + ['item_type', 'severity'], + buckets=[60, 300, 600, 1800, 3600, 7200, 14400] +) + +# Recommendation adoption +recommendations_implemented = Counter( + 'alert_recommendations_implemented_total', + 'Number of recommendations marked as implemented', + ['type', 'service'] +) + +# Effectiveness metrics +false_positive_rate = Gauge( + 'alert_false_positive_rate', + 'Rate of false positive alerts', + ['service', 'alert_type'] +) + +# ================================================================= +# PERFORMANCE DECORATORS +# ================================================================= + +def track_duration(metric: Histogram, **labels): + """Decorator to track function execution time""" + def decorator(func): + @wraps(func) + async def async_wrapper(*args, **kwargs): + start_time = time.time() + try: + result = await func(*args, **kwargs) + metric.labels(**labels).observe(time.time() - start_time) + return result + except Exception as e: + # Track error duration too + metric.labels(**labels).observe(time.time() - start_time) + raise + + @wraps(func) + def sync_wrapper(*args, **kwargs): + start_time = time.time() + try: + result = func(*args, **kwargs) + metric.labels(**labels).observe(time.time() - start_time) + return result + except Exception as e: + metric.labels(**labels).observe(time.time() - start_time) + raise + + return async_wrapper if hasattr(func, '__code__') and func.__code__.co_flags & 0x80 else sync_wrapper + return decorator + +def track_errors(error_counter: Counter, **labels): + """Decorator to track errors in functions""" + def decorator(func): + @wraps(func) + async def async_wrapper(*args, **kwargs): + try: + return await func(*args, **kwargs) + except Exception as e: + error_counter.labels(error_type=type(e).__name__, **labels).inc() + raise + + @wraps(func) + def sync_wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + error_counter.labels(error_type=type(e).__name__, **labels).inc() + raise + + return async_wrapper if hasattr(func, '__code__') and func.__code__.co_flags & 0x80 else sync_wrapper + return decorator + +# ================================================================= +# UTILITY FUNCTIONS +# ================================================================= + +def record_item_published(service: 
str, item_type: str, severity: str, alert_type: str): + """Record that an item was published""" + items_published.labels( + service=service, + item_type=item_type, + severity=severity, + type=alert_type + ).inc() + +def record_item_processed(item_type: str, severity: str, alert_type: str, status: str): + """Record that an item was processed""" + items_processed.labels( + item_type=item_type, + severity=severity, + type=alert_type, + status=status + ).inc() + +def record_notification_sent(channel: str, item_type: str, severity: str, status: str): + """Record notification delivery""" + notifications_sent.labels( + channel=channel, + item_type=item_type, + severity=severity, + status=status + ).inc() + +def update_active_items(tenant_id: str, item_type: str, severity: str, count: int): + """Update active items gauge""" + active_items_gauge.labels( + tenant_id=tenant_id, + item_type=item_type, + severity=severity + ).set(count) + +def update_component_health(component: str, service: str, is_healthy: bool): + """Update component health status""" + system_component_health.labels( + component=component, + service=service + ).set(1 if is_healthy else 0) + +def update_connection_status(connection_type: str, service: str, is_connected: bool): + """Update connection status""" + if connection_type == 'rabbitmq': + rabbitmq_connection_status.labels(service=service).set(1 if is_connected else 0) + elif connection_type == 'redis': + redis_connection_status.labels(service=service).set(1 if is_connected else 0) + +# ================================================================= +# METRICS AGGREGATOR +# ================================================================= + +def _sum_samples(metric, name_suffix: str = "", label_filter: dict = None) -> float: + """Sum sample values across all label combinations using the public collect() API""" + total = 0.0 + for family in metric.collect(): + for sample in family.samples: + if name_suffix and not sample.name.endswith(name_suffix): + continue + if label_filter and any(sample.labels.get(k) != v for k, v in label_filter.items()): + continue + total += sample.value + return total + +class AlertMetricsCollector: + """Centralized metrics collector for alert system""" + + def __init__(self, service_name: str): + self.service_name = service_name + + def record_check_performed(self, check_type: str, pattern: str): + """Record that a check was performed""" + item_checks_performed.labels( + service=self.service_name, + check_type=check_type, + pattern=pattern + ).inc() + + def record_detection_error(self, error_type: str, check_type: str): + """Record detection error""" + alert_detection_errors.labels( + service=self.service_name, + error_type=error_type, + check_type=check_type + ).inc() + + def record_duplicate_prevented(self, item_type: str, alert_type: str): + """Record prevented duplicate""" + duplicate_items_prevented.labels( + service=self.service_name, + item_type=item_type, + type=alert_type + ).inc() + + def update_leader_status(self, is_leader: bool): + """Update leader election status""" + scheduler_leader_status.labels(service=self.service_name).set(1 if is_leader else 0) + + def get_service_metrics(self) -> Dict[str, Any]: + """Get aggregate metrics for this service""" + service_filter = {'service': self.service_name} + return { + 'service': self.service_name, + 'items_published': _sum_samples(items_published, '_total', service_filter), + 'checks_performed': _sum_samples(item_checks_performed, '_total', service_filter), + 'detection_errors': _sum_samples(alert_detection_errors, '_total', service_filter), + 'duplicates_prevented': _sum_samples(duplicate_items_prevented, '_total', service_filter) + } + +# ================================================================= +# DASHBOARD METRICS +# ================================================================= + +def get_system_overview_metrics() -> Dict[str, Any]: + """Get overview metrics for monitoring dashboard""" + try: + return { + 'total_items_published': _sum_samples(items_published, '_total'), + 'total_checks_performed': _sum_samples(item_checks_performed, '_total'), + 'total_notifications_sent': _sum_samples(notifications_sent, '_total'), + 'active_sse_connections': _sum_samples(sse_active_connections), + 'processing_errors': _sum_samples(processing_errors, '_total'), + 'delivery_failures': _sum_samples(delivery_failures, '_total'), + 'timestamp': time.time() + } + except Exception as e: + logger.error("Error collecting overview metrics", error=str(e)) + return {'error': str(e), 'timestamp': time.time()} + +def get_tenant_metrics(tenant_id: str) -> Dict[str, Any]: + """Get metrics for a specific tenant""" + try: + return { + 'tenant_id': tenant_id, + 'active_connections': _sum_samples(sse_active_connections, label_filter={'tenant_id': tenant_id}), + 'events_sent': _sum_samples(sse_events_sent, '_total', {'tenant': tenant_id}), + 'timestamp': time.time() + } + except Exception as e: + logger.error("Error collecting tenant metrics", tenant_id=tenant_id, error=str(e)) + return {'tenant_id': tenant_id, 'error': str(e), 'timestamp': time.time()} \ No newline at end of file diff --git a/shared/notifications/__init__.py b/shared/notifications/__init__.py deleted file mode 100644 index 3509e3b4..00000000 --- a/shared/notifications/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# ================================================================ -# shared/notifications/__init__.py -# ================================================================ -""" -Shared Notifications Module - Alert integration using existing notification service -""" - -__all__ = [] \ No newline at end of file
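
The shared pieces added above are meant to be combined by the detecting services: render a Spanish template, build an `alert.<severity>.<service>` routing key, publish to the `alerts.exchange` topic exchange, and record the publication metric. The sketch below shows that flow end to end; the raw `pika` channel, the `publish_stock_alert` helper name, and the example ingredient/stock values are illustrative assumptions rather than part of this patch (the actual services likely publish through `shared/alerts/base_service.py`), while the imported helpers and exchange configuration come from the files introduced in this diff.

```python
# Illustrative sketch only: how the shared template, routing and metrics helpers fit together.
import json
import pika

from shared.alerts.templates import format_item_message, get_severity_emoji
from shared.config.rabbitmq_config import RABBITMQ_CONFIG, get_routing_key
from shared.monitoring.alert_metrics import record_item_published


def publish_stock_alert(channel):
    """Publish a critical stock alert on an open pika blocking-connection channel."""
    # 1. Render the Spanish template with the values found by the inventory check
    payload = format_item_message(
        'critical_stock_shortage', 'es',
        ingredient_name='Harina de trigo', current_stock=12, required_stock=40
    )

    # 2. Build the routing key alert.<severity>.<service>, which matches the "*.*.*" binding
    routing_key = get_routing_key('alert', 'urgent', 'inventory')

    # 3. Publish to the topic exchange declared in RABBITMQ_CONFIG as a persistent message
    channel.basic_publish(
        exchange=RABBITMQ_CONFIG['exchanges']['alerts']['name'],
        routing_key=routing_key,
        body=json.dumps({'severity_emoji': get_severity_emoji('urgent'), **payload}),
        properties=pika.BasicProperties(delivery_mode=2),
    )

    # 4. Record the publication so the Prometheus/Grafana dashboards pick it up
    record_item_published('inventory', 'alert', 'urgent', 'critical_stock_shortage')
```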