New alert service

Urtzi Alfaro
2025-12-05 20:07:01 +01:00
parent 1fe3a73549
commit 667e6e0404
393 changed files with 26002 additions and 61033 deletions

View File

@@ -0,0 +1,165 @@
# services/production/app/api/batch.py
"""
Production Batch API - Batch operations for enterprise dashboards
Phase 2 optimization: Eliminate N+1 query patterns by fetching production data
for multiple tenants in a single request.
"""
from fastapi import APIRouter, Depends, HTTPException, Body
from typing import List, Dict, Any
from uuid import UUID
from pydantic import BaseModel, Field
import structlog
import asyncio
from app.services.production_service import ProductionService
from app.core.config import settings
from shared.auth.decorators import get_current_user_dep
router = APIRouter(tags=["production-batch"])
logger = structlog.get_logger()
def get_production_service() -> ProductionService:
"""Dependency injection for production service"""
from app.core.database import database_manager
return ProductionService(database_manager, settings)
class ProductionSummaryBatchRequest(BaseModel):
"""Request model for batch production summary"""
tenant_ids: List[str] = Field(..., description="List of tenant IDs", max_length=100)
class ProductionSummary(BaseModel):
"""Production summary for a single tenant"""
tenant_id: str
total_batches: int
pending_batches: int
in_progress_batches: int
completed_batches: int
on_hold_batches: int
cancelled_batches: int
total_planned_quantity: float
total_actual_quantity: float
efficiency_rate: float
@router.post("/batch/production-summary", response_model=Dict[str, ProductionSummary])
async def get_production_summary_batch(
request: ProductionSummaryBatchRequest = Body(...),
current_user: Dict[str, Any] = Depends(get_current_user_dep),
production_service: ProductionService = Depends(get_production_service)
):
"""
Get production summary for multiple tenants in a single request.
Optimized for enterprise dashboards to eliminate N+1 query patterns.
Fetches production data for all tenants in parallel.
Args:
request: Batch request with tenant IDs
Returns:
Dictionary mapping tenant_id -> production summary
Example:
POST /api/v1/production/batch/production-summary
{
"tenant_ids": ["tenant-1", "tenant-2", "tenant-3"]
}
Response:
{
"tenant-1": {"tenant_id": "tenant-1", "total_batches": 25, ...},
"tenant-2": {"tenant_id": "tenant-2", "total_batches": 18, ...},
"tenant-3": {"tenant_id": "tenant-3", "total_batches": 32, ...}
}
"""
try:
if len(request.tenant_ids) > 100:
raise HTTPException(
status_code=400,
detail="Maximum 100 tenant IDs allowed per batch request"
)
if not request.tenant_ids:
return {}
logger.info(
"Batch fetching production summaries",
tenant_count=len(request.tenant_ids)
)
async def fetch_tenant_production(tenant_id: str) -> tuple[str, ProductionSummary]:
"""Fetch production summary for a single tenant"""
try:
tenant_uuid = UUID(tenant_id)
summary = await production_service.get_dashboard_summary(tenant_uuid)
# Calculate efficiency rate
efficiency_rate = 0.0
if summary.total_planned_quantity and summary.total_actual_quantity is not None:
efficiency_rate = (summary.total_actual_quantity / summary.total_planned_quantity) * 100
return tenant_id, ProductionSummary(
tenant_id=tenant_id,
total_batches=int(summary.total_batches or 0),
pending_batches=int(summary.pending_batches or 0),
in_progress_batches=int(summary.in_progress_batches or 0),
completed_batches=int(summary.completed_batches or 0),
on_hold_batches=int(summary.on_hold_batches or 0),
cancelled_batches=int(summary.cancelled_batches or 0),
total_planned_quantity=float(summary.total_planned_quantity or 0),
total_actual_quantity=float(summary.total_actual_quantity or 0),
efficiency_rate=efficiency_rate
)
except Exception as e:
logger.warning(
"Failed to fetch production for tenant in batch",
tenant_id=tenant_id,
error=str(e)
)
return tenant_id, ProductionSummary(
tenant_id=tenant_id,
total_batches=0,
pending_batches=0,
in_progress_batches=0,
completed_batches=0,
on_hold_batches=0,
cancelled_batches=0,
total_planned_quantity=0.0,
total_actual_quantity=0.0,
efficiency_rate=0.0
)
# Fetch all tenant production data in parallel
tasks = [fetch_tenant_production(tid) for tid in request.tenant_ids]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Build result dictionary
result_dict = {}
for result in results:
if isinstance(result, Exception):
logger.error("Exception in batch production fetch", error=str(result))
continue
tenant_id, summary = result
result_dict[tenant_id] = summary
logger.info(
"Batch production summaries retrieved",
requested_count=len(request.tenant_ids),
successful_count=len(result_dict)
)
return result_dict
except HTTPException:
raise
except Exception as e:
logger.error("Error in batch production summary", error=str(e), exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Failed to fetch batch production summaries: {str(e)}"
)
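For illustration, a dashboard client replaces N per-tenant summary calls with one batched request. A minimal, hedged sketch using httpx; the service URL and bearer token are placeholders, and the path follows the docstring above:

import asyncio
import httpx

async def fetch_summaries(tenant_ids: list[str]) -> dict:
    # One POST replaces len(tenant_ids) individual dashboard-summary requests
    async with httpx.AsyncClient(base_url="http://production-service:8000") as client:
        resp = await client.post(
            "/api/v1/production/batch/production-summary",
            json={"tenant_ids": tenant_ids},
            headers={"Authorization": "Bearer <token>"},  # placeholder credentials
            timeout=30.0,
        )
        resp.raise_for_status()
        return resp.json()  # {tenant_id: {...summary...}, ...}

summaries = asyncio.run(fetch_summaries(["tenant-1", "tenant-2", "tenant-3"]))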

View File

@@ -0,0 +1,85 @@
# services/production/app/api/internal_alert_trigger.py
"""
Internal API for triggering production alerts.
Used by demo session cloning to generate realistic production delay alerts.
"""
from fastapi import APIRouter, HTTPException, Request, Path
from uuid import UUID
import structlog
logger = structlog.get_logger()
router = APIRouter()
@router.post("/api/internal/production-alerts/trigger/{tenant_id}")
async def trigger_production_alerts(
request: Request,
tenant_id: UUID = Path(..., description="Tenant ID to check production for")
) -> dict:
"""
Trigger production alert checks for a specific tenant (internal use only).
This endpoint is called by the demo session cloning process after production
batches are seeded to generate realistic production delay alerts.
Security: Protected by X-Internal-Service header check.
"""
try:
# Verify internal service header
if request.headers.get("X-Internal-Service") not in ("demo-session", "internal"):
logger.warning("Unauthorized internal API call", tenant_id=str(tenant_id))
raise HTTPException(
status_code=403,
detail="This endpoint is for internal service use only"
)
# Get production alert service from app state
production_alert_service = getattr(request.app.state, 'production_alert_service', None)
if not production_alert_service:
logger.error("Production alert service not initialized")
raise HTTPException(
status_code=500,
detail="Production alert service not available"
)
# Trigger production alert checks (checks all tenants, including this one)
logger.info("Triggering production alert checks", tenant_id=str(tenant_id))
await production_alert_service.check_production_delays()
# Return success (service checks all tenants, we can't get specific count)
result = {"total_alerts": 0, "message": "Production alert checks triggered"}
logger.info(
"Production alert checks completed",
tenant_id=str(tenant_id),
alerts_generated=result.get("total_alerts", 0)
)
return {
"success": True,
"tenant_id": str(tenant_id),
"alerts_generated": result.get("total_alerts", 0),
"breakdown": {
"critical": result.get("critical", 0),
"high": result.get("high", 0),
"medium": result.get("medium", 0),
"low": result.get("low", 0)
}
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Error triggering production alerts",
tenant_id=str(tenant_id),
error=str(e),
exc_info=True
)
raise HTTPException(
status_code=500,
detail=f"Failed to trigger production alerts: {str(e)}"
)
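For reference, the demo-session caller only needs the internal header to pass the check above; a hedged sketch (hostname and port are assumptions):

import httpx

async def trigger_demo_alerts(tenant_id: str) -> dict:
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            f"http://production-service:8000/api/internal/production-alerts/trigger/{tenant_id}",
            headers={"X-Internal-Service": "demo-session"},  # must match the allowed values
        )
        resp.raise_for_status()
        return resp.json()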

View File

@@ -25,6 +25,7 @@ from app.schemas.production import (
ProductionStatusEnum
)
from app.core.config import settings
from app.utils.cache import get_cached, set_cached, make_cache_key
logger = structlog.get_logger()
route_builder = RouteBuilder('production')
@@ -56,8 +57,23 @@ async def list_production_batches(
current_user: dict = Depends(get_current_user_dep),
production_service: ProductionService = Depends(get_production_service)
):
"""List batches with filters: date, status, product, order_id"""
"""List batches with filters: date, status, product, order_id (with Redis caching - 20s TTL)"""
try:
# PERFORMANCE OPTIMIZATION: Cache frequently accessed queries (status filter, first page)
cache_key = None
if page == 1 and product_id is None and order_id is None and start_date is None and end_date is None:
# Cache simple status-filtered queries (common for dashboards)
cache_key = make_cache_key(
"production_batches",
str(tenant_id),
status=status.value if status else None,
page_size=page_size
)
cached_result = await get_cached(cache_key)
if cached_result is not None:
logger.debug("Cache hit for production batches", cache_key=cache_key, tenant_id=str(tenant_id), status=status)
return ProductionBatchListResponse(**cached_result)
filters = {
"status": status,
"product_id": str(product_id) if product_id else None,
@@ -68,6 +84,11 @@ async def list_production_batches(
batch_list = await production_service.get_production_batches_list(tenant_id, filters, page, page_size)
# Cache the result if applicable (20s TTL for production batches)
if cache_key:
await set_cached(cache_key, batch_list.model_dump(), ttl=20)
logger.debug("Cached production batches", cache_key=cache_key, ttl=20, tenant_id=str(tenant_id), status=status)
logger.info("Retrieved production batches list",
tenant_id=str(tenant_id), filters=filters)
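For illustration (assuming ProductionStatusEnum values such as "IN_PROGRESS"), the make_cache_key helper added later in this commit produces keys like:

# Parameters are sorted alphabetically; None-valued parameters are dropped
make_cache_key("production_batches", "tenant-1", status="IN_PROGRESS", page_size=25)
# -> "production_batches:tenant-1:page_size:25:status:IN_PROGRESS"
make_cache_key("production_batches", "tenant-1", status=None, page_size=25)
# -> "production_batches:tenant-1:page_size:25"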

View File

@@ -14,6 +14,7 @@ from shared.routing import RouteBuilder
from app.services.production_service import ProductionService
from app.schemas.production import ProductionDashboardSummary
from app.core.config import settings
from app.utils.cache import get_cached, set_cached, make_cache_key
logger = structlog.get_logger()
route_builder = RouteBuilder('production')
@@ -35,10 +36,22 @@ async def get_dashboard_summary(
current_user: dict = Depends(get_current_user_dep),
production_service: ProductionService = Depends(get_production_service)
):
"""Get production dashboard summary"""
"""Get production dashboard summary with caching (60s TTL)"""
try:
# PHASE 2: Check cache first
cache_key = make_cache_key("production_dashboard", str(tenant_id))
cached_result = await get_cached(cache_key)
if cached_result is not None:
logger.debug("Cache hit for production dashboard", cache_key=cache_key, tenant_id=str(tenant_id))
return ProductionDashboardSummary(**cached_result)
# Cache miss - fetch from database
summary = await production_service.get_dashboard_summary(tenant_id)
# PHASE 2: Cache the result (60s TTL for production batches)
await set_cached(cache_key, summary.model_dump(), ttl=60)
logger.debug("Cached production dashboard", cache_key=cache_key, ttl=60, tenant_id=str(tenant_id))
logger.info("Retrieved production dashboard summary",
tenant_id=str(tenant_id))

View File

@@ -27,14 +27,16 @@ from app.api import (
orchestrator, # NEW: Orchestrator integration endpoint
production_orders_operations, # Tenant deletion endpoints
audit,
ml_insights, # ML insights endpoint
batch
)
from app.api.internal_alert_trigger import router as internal_alert_trigger_router
class ProductionService(StandardFastAPIService):
"""Production Service with standardized setup"""
expected_migration_version = "001_initial_schema"
async def on_startup(self, app):
"""Custom startup logic including migration verification"""
@@ -63,6 +65,8 @@ class ProductionService(StandardFastAPIService):
]
self.alert_service = None
self.rabbitmq_client = None
self.event_publisher = None
# REMOVED: scheduler_service (replaced by Orchestrator Service)
# Create custom checks for services
@@ -84,22 +88,53 @@ class ProductionService(StandardFastAPIService):
expected_tables=production_expected_tables,
custom_health_checks={
"alert_service": check_alert_service
},
enable_messaging=True # Enable messaging support
)
async def _setup_messaging(self):
"""Setup messaging for production service using unified messaging"""
from shared.messaging import UnifiedEventPublisher, RabbitMQClient
try:
self.rabbitmq_client = RabbitMQClient(settings.RABBITMQ_URL, service_name="production-service")
await self.rabbitmq_client.connect()
# Create unified event publisher
self.event_publisher = UnifiedEventPublisher(self.rabbitmq_client, "production-service")
self.logger.info("Production service unified messaging setup completed")
except Exception as e:
self.logger.error("Failed to setup production unified messaging", error=str(e))
raise
async def _cleanup_messaging(self):
"""Cleanup messaging for production service"""
try:
if self.rabbitmq_client:
await self.rabbitmq_client.disconnect()
self.logger.info("Production service messaging cleanup completed")
except Exception as e:
self.logger.error("Error during production messaging cleanup", error=str(e))
async def on_startup(self, app: FastAPI):
"""Custom startup logic for production service"""
# Initialize messaging
await self._setup_messaging()
# Initialize alert service with EventPublisher and database manager
self.alert_service = ProductionAlertService(self.event_publisher, self.database_manager)
await self.alert_service.start()
self.logger.info("Production alert service started")
# Store services in app state
app.state.alert_service = self.alert_service
app.state.production_alert_service = self.alert_service # Also store with this name for internal trigger
# REMOVED: Production scheduler service initialization
# Scheduling is now handled by the Orchestrator Service
# which calls our /generate-schedule endpoint
async def on_shutdown(self, app: FastAPI):
"""Custom shutdown logic for production service"""
@@ -108,6 +143,9 @@ class ProductionService(StandardFastAPIService):
await self.alert_service.stop()
self.logger.info("Alert service stopped")
# Cleanup messaging
await self._cleanup_messaging()
def get_service_features(self):
"""Return production-specific features"""
return [
@@ -155,6 +193,7 @@ service.setup_custom_middleware()
# NOTE: Register more specific routes before generic parameterized routes
# IMPORTANT: Register audit router FIRST to avoid route matching conflicts
service.add_router(audit.router)
service.add_router(batch.router)
service.add_router(orchestrator.router) # NEW: Orchestrator integration endpoint
service.add_router(production_orders_operations.router) # Tenant deletion endpoints
service.add_router(quality_templates.router) # Register first to avoid route conflicts
@@ -166,6 +205,7 @@ service.add_router(production_dashboard.router)
service.add_router(analytics.router)
service.add_router(internal_demo.router)
service.add_router(ml_insights.router) # ML insights endpoint
service.add_router(internal_alert_trigger_router) # Internal alert trigger for demo cloning
# REMOVED: test_production_scheduler endpoint
# Production scheduling is now triggered by the Orchestrator Service

File diff suppressed because it is too large

View File

@@ -1,38 +1,33 @@
"""
Production Notification Service - Simplified
Emits minimal events using EventPublisher.
All enrichment handled by alert_processor.
These are NOTIFICATIONS (not alerts) - informational state changes that don't require user action.
"""
from datetime import datetime, timezone
from typing import Optional, Dict, Any
from sqlalchemy.orm import Session
from uuid import UUID
import structlog
from shared.messaging import UnifiedEventPublisher
# structlog is retained (rather than stdlib logging) because the emitters below
# log with keyword arguments, which logging.Logger.info() does not accept
logger = structlog.get_logger()
class ProductionNotificationService:
"""
Service for emitting production notifications using EventPublisher.
"""
def __init__(self, event_publisher: UnifiedEventPublisher):
self.publisher = event_publisher
async def emit_batch_state_changed_notification(
self,
db: Session,
tenant_id: UUID,
batch_id: str,
product_sku: str,
product_name: str,
@@ -44,76 +39,50 @@ class ProductionNotificationService(BaseAlertService):
) -> None:
"""
Emit notification when a production batch changes state.
"""
# Build message based on state transition
transition_messages = {
("PENDING", "IN_PROGRESS"): f"Production started for {product_name}",
("IN_PROGRESS", "COMPLETED"): f"Production completed for {product_name}",
("IN_PROGRESS", "PAUSED"): f"Production paused for {product_name}",
("PAUSED", "IN_PROGRESS"): f"Production resumed for {product_name}",
("IN_PROGRESS", "FAILED"): f"Production failed for {product_name}",
}
message = transition_messages.get(
(old_status, new_status),
f"{product_name} status changed from {old_status} to {new_status}"
)
metadata = {
"batch_id": batch_id,
"product_sku": product_sku,
"product_name": product_name,
"old_status": old_status,
"new_status": new_status,
"quantity": float(quantity),
"unit": unit,
"assigned_to": assigned_to,
"state_changed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="production.batch_state_changed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"batch_state_changed_notification_emitted",
tenant_id=str(tenant_id),
batch_id=batch_id,
old_status=old_status,
new_status=new_status
)
async def emit_batch_completed_notification(
self,
db: Session,
tenant_id: UUID,
batch_id: str,
product_sku: str,
product_name: str,
@@ -124,64 +93,42 @@ class ProductionNotificationService(BaseAlertService):
) -> None:
"""
Emit notification when a production batch is completed.
"""
message_parts = [f"Produced {quantity_produced} {unit} of {product_name}"]
if production_duration_minutes:
message_parts.append(f"in {production_duration_minutes} minutes")
if quality_score:
message_parts.append(f"(Quality: {quality_score:.1f}%)")
message = " ".join(message_parts)
metadata = {
"batch_id": batch_id,
"product_sku": product_sku,
"product_name": product_name,
"quantity_produced": float(quantity_produced),
"unit": unit,
"production_duration_minutes": production_duration_minutes,
"quality_score": quality_score,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="production.batch_completed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"batch_completed_notification_emitted",
tenant_id=str(tenant_id),
batch_id=batch_id,
quantity_produced=quantity_produced
)
async def emit_batch_started_notification(
self,
db: Session,
tenant_id: UUID,
batch_id: str,
product_sku: str,
product_name: str,
@@ -192,64 +139,41 @@ class ProductionNotificationService(BaseAlertService):
) -> None:
"""
Emit notification when a production batch is started.
"""
message_parts = [f"Started production of {quantity_planned} {unit} of {product_name}"]
if estimated_duration_minutes:
message_parts.append(f"(Est. {estimated_duration_minutes} min)")
if assigned_to:
message_parts.append(f"- Assigned to {assigned_to}")
message = " ".join(message_parts)
metadata = {
"batch_id": batch_id,
"product_sku": product_sku,
"product_name": product_name,
"quantity_planned": float(quantity_planned),
"unit": unit,
"estimated_duration_minutes": estimated_duration_minutes,
"assigned_to": assigned_to,
"started_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="production.batch_started",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"batch_started_notification_emitted",
tenant_id=str(tenant_id),
batch_id=batch_id
)
async def emit_equipment_status_notification(
self,
db: Session,
tenant_id: UUID,
equipment_id: str,
equipment_name: str,
old_status: str,
@@ -258,50 +182,29 @@ class ProductionNotificationService(BaseAlertService):
) -> None:
"""
Emit notification when equipment status changes.
"""
message = f"{equipment_name} status: {old_status} → {new_status}"
if reason:
message += f" - {reason}"
metadata = {
"equipment_id": equipment_id,
"equipment_name": equipment_name,
"old_status": old_status,
"new_status": new_status,
"reason": reason,
"status_changed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="production.equipment_status_changed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"equipment_status_notification_emitted",
tenant_id=str(tenant_id),
equipment_id=equipment_id,
new_status=new_status
)

View File

@@ -24,6 +24,7 @@ from app.schemas.production import (
ProductionScheduleCreate, ProductionScheduleUpdate, ProductionScheduleResponse,
DailyProductionRequirements, ProductionDashboardSummary, ProductionMetrics
)
from app.utils.cache import delete_cached, make_cache_key
logger = structlog.get_logger()
@@ -324,12 +325,17 @@ class ProductionService:
await self._update_inventory_on_completion(
tenant_id, batch, status_update.actual_quantity
)
logger.info("Updated batch status",
batch_id=str(batch_id),
# PHASE 2: Invalidate production dashboard cache
cache_key = make_cache_key("production_dashboard", str(tenant_id))
await delete_cached(cache_key)
logger.debug("Invalidated production dashboard cache", cache_key=cache_key, tenant_id=str(tenant_id))
logger.info("Updated batch status",
batch_id=str(batch_id),
new_status=status_update.status.value,
tenant_id=str(tenant_id))
return batch
except Exception as e:
@@ -658,7 +664,26 @@ class ProductionService:
logger.info("Started production batch",
batch_id=str(batch_id), tenant_id=str(tenant_id))
# Acknowledge production delay alerts (non-blocking)
try:
from shared.clients.alert_processor_client import get_alert_processor_client
alert_client = get_alert_processor_client(self.config, "production")
await alert_client.acknowledge_alerts_by_metadata(
tenant_id=tenant_id,
alert_type="production_delay",
metadata_filter={"batch_id": str(batch_id)}
)
await alert_client.acknowledge_alerts_by_metadata(
tenant_id=tenant_id,
alert_type="batch_at_risk",
metadata_filter={"batch_id": str(batch_id)}
)
logger.debug("Acknowledged production delay alerts", batch_id=str(batch_id))
except Exception as e:
# Log but don't fail the batch start
logger.warning("Failed to acknowledge production alerts", batch_id=str(batch_id), error=str(e))
return batch
except Exception as e:
logger.error("Error starting production batch",

View File

@@ -0,0 +1,26 @@
# services/alert_processor/app/utils/__init__.py
"""
Utility modules for alert processor service
"""
from .cache import (
get_redis_client,
close_redis,
get_cached,
set_cached,
delete_cached,
delete_pattern,
cache_response,
make_cache_key,
)
__all__ = [
'get_redis_client',
'close_redis',
'get_cached',
'set_cached',
'delete_cached',
'delete_pattern',
'cache_response',
'make_cache_key',
]

View File

@@ -0,0 +1,265 @@
# services/orchestrator/app/utils/cache.py
"""
Redis caching utilities for dashboard endpoints
"""
import json
import redis.asyncio as redis
from typing import Optional, Any, Callable
from functools import wraps
import structlog
from app.core.config import settings
from pydantic import BaseModel
logger = structlog.get_logger()
# Redis client instance
_redis_client: Optional[redis.Redis] = None
async def get_redis_client() -> Optional[redis.Redis]:
"""Get or create Redis client; returns None when Redis is unavailable"""
global _redis_client
if _redis_client is None:
try:
# Check if TLS is enabled - convert string to boolean properly
redis_tls_str = str(getattr(settings, 'REDIS_TLS_ENABLED', 'false')).lower()
redis_tls_enabled = redis_tls_str in ('true', '1', 'yes', 'on')
connection_kwargs = {
'host': str(getattr(settings, 'REDIS_HOST', 'localhost')),
'port': int(getattr(settings, 'REDIS_PORT', 6379)),
'db': int(getattr(settings, 'REDIS_DB', 0)),
'decode_responses': True,
'socket_connect_timeout': 5,
'socket_timeout': 5
}
# Add password if configured
redis_password = getattr(settings, 'REDIS_PASSWORD', None)
if redis_password:
connection_kwargs['password'] = redis_password
# Add SSL/TLS support if enabled
if redis_tls_enabled:
import ssl
connection_kwargs['ssl'] = True
connection_kwargs['ssl_cert_reqs'] = ssl.CERT_NONE
logger.debug(f"Redis TLS enabled - connecting with SSL to {connection_kwargs['host']}:{connection_kwargs['port']}")
_redis_client = redis.Redis(**connection_kwargs)
# Test connection
await _redis_client.ping()
logger.info(f"Redis client connected successfully (TLS: {redis_tls_enabled})")
except Exception as e:
logger.warning(f"Failed to connect to Redis: {e}. Caching will be disabled.")
_redis_client = None
return _redis_client
async def close_redis():
"""Close Redis connection"""
global _redis_client
if _redis_client:
await _redis_client.close()
_redis_client = None
logger.info("Redis connection closed")
async def get_cached(key: str) -> Optional[Any]:
"""
Get cached value by key
Args:
key: Cache key
Returns:
Cached value (deserialized from JSON) or None if not found or error
"""
try:
client = await get_redis_client()
if not client:
return None
cached = await client.get(key)
if cached:
logger.debug(f"Cache hit: {key}")
return json.loads(cached)
else:
logger.debug(f"Cache miss: {key}")
return None
except Exception as e:
logger.warning(f"Cache get error for key {key}: {e}")
return None
def _serialize_value(value: Any) -> Any:
"""
Recursively serialize values for JSON storage, handling Pydantic models properly.
Args:
value: Value to serialize
Returns:
JSON-serializable value
"""
if isinstance(value, BaseModel):
# Convert Pydantic model to dictionary
return value.model_dump()
elif isinstance(value, (list, tuple)):
# Recursively serialize list/tuple elements
return [_serialize_value(item) for item in value]
elif isinstance(value, dict):
# Recursively serialize dictionary values
return {key: _serialize_value(val) for key, val in value.items()}
else:
# For other types, use default serialization
return value
async def set_cached(key: str, value: Any, ttl: int = 60) -> bool:
"""
Set cached value with TTL
Args:
key: Cache key
value: Value to cache (will be JSON serialized)
ttl: Time to live in seconds
Returns:
True if successful, False otherwise
"""
try:
client = await get_redis_client()
if not client:
return False
# Serialize value properly before JSON encoding
serialized_value = _serialize_value(value)
serialized = json.dumps(serialized_value)
await client.setex(key, ttl, serialized)
logger.debug(f"Cache set: {key} (TTL: {ttl}s)")
return True
except Exception as e:
logger.warning(f"Cache set error for key {key}: {e}")
return False
async def delete_cached(key: str) -> bool:
"""
Delete cached value
Args:
key: Cache key
Returns:
True if successful, False otherwise
"""
try:
client = await get_redis_client()
if not client:
return False
await client.delete(key)
logger.debug(f"Cache deleted: {key}")
return True
except Exception as e:
logger.warning(f"Cache delete error for key {key}: {e}")
return False
async def delete_pattern(pattern: str) -> int:
"""
Delete all keys matching pattern
Args:
pattern: Redis key pattern (e.g., "dashboard:*")
Returns:
Number of keys deleted
"""
try:
client = await get_redis_client()
if not client:
return 0
keys = []
async for key in client.scan_iter(match=pattern):
keys.append(key)
if keys:
deleted = await client.delete(*keys)
logger.info(f"Deleted {deleted} keys matching pattern: {pattern}")
return deleted
return 0
except Exception as e:
logger.warning(f"Cache delete pattern error for {pattern}: {e}")
return 0
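# Hedged usage sketch (not part of the original module): delete_pattern lets the
# parameterized keys produced by make_cache_key below be invalidated as a group.
async def invalidate_tenant_batches(tenant_id: str) -> int:
    """Drop every cached production-batch list variant for one tenant."""
    # Matches e.g. "production_batches:<tenant>:page_size:25:status:IN_PROGRESS"
    return await delete_pattern(f"production_batches:{tenant_id}:*")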
def cache_response(key_prefix: str, ttl: int = 60):
"""
Decorator to cache endpoint responses
Args:
key_prefix: Prefix for cache key (will be combined with tenant_id)
ttl: Time to live in seconds
Usage:
@cache_response("dashboard:health", ttl=30)
async def get_health(tenant_id: str):
...
"""
def decorator(func: Callable):
@wraps(func)
async def wrapper(*args, **kwargs):
# Extract tenant_id from kwargs or args
tenant_id = kwargs.get('tenant_id')
if not tenant_id and args:
# Try to find tenant_id in args (assuming it's the first argument)
tenant_id = args[0] if len(args) > 0 else None
if not tenant_id:
# No tenant_id, skip caching
return await func(*args, **kwargs)
# Build cache key
cache_key = f"{key_prefix}:{tenant_id}"
# Try to get from cache
cached_value = await get_cached(cache_key)
if cached_value is not None:
return cached_value
# Execute function
result = await func(*args, **kwargs)
# Cache result
await set_cached(cache_key, result, ttl)
return result
return wrapper
return decorator
def make_cache_key(prefix: str, tenant_id: str, **params) -> str:
"""
Create a cache key with optional parameters
Args:
prefix: Key prefix
tenant_id: Tenant ID
**params: Additional parameters to include in key
Returns:
Cache key string
"""
key_parts = [prefix, tenant_id]
for k, v in sorted(params.items()):
if v is not None:
key_parts.append(f"{k}:{v}")
return ":".join(key_parts)