Improve AI logic

This commit is contained in:
Urtzi Alfaro
2025-11-05 13:34:56 +01:00
parent 5c87fbcf48
commit 394ad3aea4
218 changed files with 30627 additions and 7658 deletions

View File

@@ -2,6 +2,7 @@
Orchestration Saga Service
Implements saga pattern for orchestrator workflow with compensation logic.
Integrates AI-enhanced orchestration when enabled.
"""
import asyncio
@@ -18,6 +19,8 @@ from shared.clients.notification_client import NotificationServiceClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.clients.ai_insights_client import AIInsightsClient
from shared.clients.training_client import TrainingServiceClient
logger = logging.getLogger(__name__)
@@ -27,7 +30,8 @@ class OrchestrationSaga:
Saga coordinator for orchestration workflow.
Workflow Steps:
0. Fetch shared data snapshot (inventory, suppliers, recipes) - NEW
0. Fetch shared data snapshot (inventory, suppliers, recipes)
0.5. Generate AI insights from ML orchestrators
1. Generate forecasts
2. Generate production schedule
3. Generate procurement plan
@@ -44,7 +48,12 @@ class OrchestrationSaga:
notification_client: NotificationServiceClient,
inventory_client: InventoryServiceClient,
suppliers_client: SuppliersServiceClient,
recipes_client: RecipesServiceClient
recipes_client: RecipesServiceClient,
ai_insights_client: Optional[AIInsightsClient] = None,
training_client: Optional[TrainingServiceClient] = None,
use_ai_enhancement: bool = False,
ai_insights_base_url: str = "http://ai-insights-service:8000",
ai_insights_min_confidence: int = 70
):
"""
Initialize orchestration saga.
@@ -54,9 +63,14 @@ class OrchestrationSaga:
production_client: Production service client
procurement_client: Procurement service client
notification_client: Notification service client
inventory_client: Inventory service client (NEW)
suppliers_client: Suppliers service client (NEW)
recipes_client: Recipes service client (NEW)
inventory_client: Inventory service client
suppliers_client: Suppliers service client
recipes_client: Recipes service client
ai_insights_client: AI Insights service client
training_client: Training service client
use_ai_enhancement: Enable AI-enhanced orchestration
ai_insights_base_url: Base URL for AI Insights Service
ai_insights_min_confidence: Minimum confidence threshold for applying insights
"""
self.forecast_client = forecast_client
self.production_client = production_client
@@ -65,6 +79,25 @@ class OrchestrationSaga:
self.inventory_client = inventory_client
self.suppliers_client = suppliers_client
self.recipes_client = recipes_client
self.ai_insights_client = ai_insights_client or AIInsightsClient(
base_url=ai_insights_base_url
)
self.training_client = training_client
self.use_ai_enhancement = use_ai_enhancement
# Initialize AI enhancer if enabled
self.ai_enhancer = None
if use_ai_enhancement:
try:
from app.ml.ai_enhanced_orchestrator import AIEnhancedOrchestrator
self.ai_enhancer = AIEnhancedOrchestrator(
ai_insights_base_url=ai_insights_base_url,
min_confidence_threshold=ai_insights_min_confidence
)
logger.info("AI-enhanced orchestration enabled")
except ImportError as e:
logger.warning(f"AI enhancement requested but could not be loaded: {e}")
self.use_ai_enhancement = False
async def execute_orchestration(
self,
@@ -108,6 +141,14 @@ class OrchestrationSaga:
action_args=(tenant_id, context)
)
# Step 0.5: Generate AI insights (NEW)
saga.add_step(
name="generate_ai_insights",
action=self._generate_ai_insights,
compensation=None, # No compensation needed for read-only insight generation
action_args=(tenant_id, context)
)
# Step 1: Generate forecasts
saga.add_step(
name="generate_forecasts",
@@ -140,6 +181,14 @@ class OrchestrationSaga:
action_args=(tenant_id, context)
)
# Step 5: Validate previous day's forecasts
saga.add_step(
name="validate_previous_forecasts",
action=self._validate_previous_forecasts,
compensation=None, # No compensation needed for validation
action_args=(tenant_id, context)
)
# Execute saga
success, final_result, error = await saga.execute()
@@ -233,24 +282,249 @@ class OrchestrationSaga:
'count': len(recipes_data) if recipes_data else 0
}
# NEW: Fetch upcoming events for next 7 days
try:
from datetime import timedelta
# Note: Implement when event calendar service is ready
# For now, initialize as empty
context['event_calendar'] = []
logger.info("Event calendar: not yet implemented, using empty list")
except Exception as e:
logger.warning(f"Could not fetch events: {e}")
context['event_calendar'] = []
# NEW: Placeholder for traffic predictions (Phase 5)
try:
# Note: Implement traffic forecasting in Phase 5
# For now, initialize as empty DataFrame
import pandas as pd
context['traffic_predictions'] = pd.DataFrame()
logger.info("Traffic predictions: not yet implemented, using empty DataFrame")
except Exception as e:
logger.warning(f"Could not fetch traffic predictions: {e}")
import pandas as pd
context['traffic_predictions'] = pd.DataFrame()
logger.info(
f"Shared data snapshot fetched successfully: "
f"{len(inventory_data)} ingredients, "
f"{len(suppliers_data)} suppliers, "
f"{len(recipes_data)} recipes"
f"{len(recipes_data)} recipes, "
f"{len(context.get('event_calendar', []))} events"
)
return {
'success': True,
'inventory_count': len(inventory_data) if inventory_data else 0,
'suppliers_count': len(suppliers_data) if suppliers_data else 0,
'recipes_count': len(recipes_data) if recipes_data else 0
'recipes_count': len(recipes_data) if recipes_data else 0,
'events_count': len(context.get('event_calendar', []))
}
except Exception as e:
logger.error(f"Failed to fetch shared data snapshot for tenant {tenant_id}: {e}")
raise
# ========================================================================
# Step 0.5: Generate AI Insights (NEW)
# ========================================================================
async def _generate_ai_insights(
self,
tenant_id: str,
context: Dict[str, Any]
) -> Dict[str, Any]:
"""
Generate AI insights using HTTP calls to ML insights endpoints.
This step runs multiple ML insight generators in parallel via HTTP:
- Dynamic forecasting rules learning (forecasting service)
- Safety stock optimization (inventory service)
- Production yield predictions (production service)
- Supplier performance analysis (procurement service)
- Price forecasting (procurement service)
All insights are posted to the AI Insights Service by the respective services
and can be consumed by downstream orchestration steps.
Args:
tenant_id: Tenant ID
context: Execution context with cached data snapshots
Returns:
Dictionary with insights generation results
"""
logger.info(f"Generating AI insights for tenant {tenant_id} via HTTP endpoints")
insights_results = {
'total_insights_generated': 0,
'total_insights_posted': 0,
'insights_by_source': {},
'errors': []
}
try:
# Prepare async tasks for parallel HTTP calls
ml_tasks = []
# Task 1: Safety Stock Optimization (inventory service)
async def trigger_safety_stock_optimization():
try:
result = await self.inventory_client.trigger_safety_stock_optimization(
tenant_id=tenant_id,
product_ids=None, # Analyze all products
lookback_days=90,
min_history_days=30
)
if result and result.get('success'):
return ('safety_stock', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'products_optimized': result.get('products_optimized', 0)
})
else:
return ('safety_stock', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Safety stock optimization failed: {e}")
return ('safety_stock', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_safety_stock_optimization())
# Task 2: Production Yield Analysis (production service)
async def trigger_yield_prediction():
try:
result = await self.production_client.trigger_yield_prediction(
tenant_id=tenant_id,
recipe_ids=None, # Analyze all recipes
lookback_days=90,
min_history_runs=30
)
if result and result.get('success'):
return ('yield_analysis', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'recipes_analyzed': result.get('recipes_analyzed', 0)
})
else:
return ('yield_analysis', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Yield prediction failed: {e}")
return ('yield_analysis', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_yield_prediction())
# Task 3: Supplier Performance Analysis (procurement service)
async def trigger_supplier_analysis():
try:
result = await self.procurement_client.trigger_supplier_analysis(
tenant_id=tenant_id,
supplier_ids=None, # Analyze all suppliers
lookback_days=180,
min_orders=10
)
if result and result.get('success'):
return ('supplier_analysis', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'suppliers_analyzed': result.get('suppliers_analyzed', 0)
})
else:
return ('supplier_analysis', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Supplier analysis failed: {e}")
return ('supplier_analysis', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_supplier_analysis())
# Task 4: Price Forecasting (procurement service)
async def trigger_price_forecasting():
try:
result = await self.procurement_client.trigger_price_forecasting(
tenant_id=tenant_id,
ingredient_ids=None, # Forecast all ingredients
lookback_days=180,
forecast_horizon_days=30
)
if result and result.get('success'):
return ('price_forecast', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'ingredients_forecasted': result.get('ingredients_forecasted', 0),
'buy_now_recommendations': result.get('buy_now_recommendations', 0)
})
else:
return ('price_forecast', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Price forecasting failed: {e}")
return ('price_forecast', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_price_forecasting())
# Task 5: Dynamic Rules Learning (forecasting service)
async def trigger_rules_generation():
try:
result = await self.forecast_client.trigger_rules_generation(
tenant_id=tenant_id,
product_ids=None, # Analyze all products
lookback_days=90,
min_samples=10
)
if result and result.get('success'):
return ('rules_learning', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'products_analyzed': result.get('products_analyzed', 0)
})
else:
return ('rules_learning', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Rules generation failed: {e}")
return ('rules_learning', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_rules_generation())
# Run all ML insight generation tasks in parallel
logger.info(f"Triggering {len(ml_tasks)} ML insight endpoints in parallel")
results = await asyncio.gather(*ml_tasks, return_exceptions=True)
# Process results
for result in results:
if isinstance(result, Exception):
logger.error(f"ML insight task failed with exception: {result}")
insights_results['errors'].append(str(result))
elif isinstance(result, tuple) and len(result) == 2:
source, data = result
if 'error' in data:
insights_results['errors'].append(f"{source}: {data['error']}")
else:
posted = data.get('insights_posted', 0)
generated = data.get('insights_generated', posted)
insights_results['total_insights_posted'] += posted
insights_results['total_insights_generated'] += generated
insights_results['insights_by_source'][source] = posted
logger.info(f"{source}: {posted} insights posted")
# Store insights count in context
context['ai_insights_generated'] = insights_results['total_insights_generated']
context['ai_insights_posted'] = insights_results['total_insights_posted']
logger.info(
f"AI insights generation complete: "
f"{insights_results['total_insights_posted']} insights posted from "
f"{len(insights_results['insights_by_source'])} sources"
)
return insights_results
except Exception as e:
logger.error(f"Failed to generate AI insights for tenant {tenant_id}: {e}", exc_info=True)
# Don't fail the orchestration if insights generation fails
# Log error and continue
insights_results['errors'].append(str(e))
context['ai_insights_generated'] = 0
context['ai_insights_posted'] = 0
return insights_results
# ========================================================================
# Step 1: Generate Forecasts
# ========================================================================
@@ -276,6 +550,10 @@ class OrchestrationSaga:
# Call forecast service
result = await self.forecast_client.generate_forecasts(tenant_id)
if not result:
logger.error(f"Forecast service returned None for tenant {tenant_id}")
raise Exception("Forecast service returned None")
# Store forecast ID in context
forecast_id = result.get('forecast_id') or result.get('id')
context['forecast_id'] = forecast_id
@@ -349,6 +627,10 @@ class OrchestrationSaga:
recipes_data=recipes_snapshot # NEW: Pass cached recipes
)
if not result:
logger.error(f"Production service returned None for tenant {tenant_id}")
raise Exception("Production service returned None")
# Store schedule ID in context
schedule_id = result.get('schedule_id') or result.get('id')
context['production_schedule_id'] = schedule_id
@@ -435,6 +717,10 @@ class OrchestrationSaga:
recipes_data=recipes_snapshot # NEW: Pass cached recipes
)
if not result:
logger.error(f"Procurement service returned None for tenant {tenant_id}")
raise Exception("Procurement service returned None")
# Store plan ID in context
plan_id = result.get('plan_id') or result.get('id')
context['procurement_plan_id'] = plan_id
@@ -523,12 +809,16 @@ class OrchestrationSaga:
notification_data=notification_data
)
notifications_sent = result.get('notifications_sent', 0)
context['notifications_sent'] = notifications_sent
if result:
notifications_sent = result.get('notifications_sent', 0)
context['notifications_sent'] = notifications_sent
logger.info(f"Notifications sent successfully: {notifications_sent}")
logger.info(f"Notifications sent successfully: {notifications_sent}")
return result
return result
else:
logger.warning(f"Notification service returned None for tenant {tenant_id}")
return {'notifications_sent': 0, 'error': 'Notification service returned None'}
except Exception as e:
# Log error but don't fail the saga for notification failures
@@ -536,6 +826,140 @@ class OrchestrationSaga:
# Return empty result instead of raising
return {'notifications_sent': 0, 'error': str(e)}
# ========================================================================
# Step 5: Validate Previous Day's Forecasts
# ========================================================================
async def _validate_previous_forecasts(
self,
tenant_id: str,
context: Dict[str, Any]
) -> Dict[str, Any]:
"""
Validate yesterday's forecasts against actual sales.
Calculate accuracy metrics (MAPE, RMSE, MAE) and trigger retraining if needed.
Args:
tenant_id: Tenant ID
context: Execution context
Returns:
Validation result with metrics
"""
from datetime import date, timedelta
logger.info(f"Validating previous day's forecasts for tenant {tenant_id}")
try:
yesterday = date.today() - timedelta(days=1)
# Call forecasting service validation endpoint
validation_result = await self.forecast_client.validate_forecasts(
tenant_id=tenant_id,
date=yesterday
)
if not validation_result:
logger.warning(f"No validation results returned for tenant {tenant_id}")
return {'validated': False, 'reason': 'no_data'}
# Extract metrics
overall_mape = validation_result.get('overall_mape', 0)
overall_rmse = validation_result.get('overall_rmse', 0)
overall_mae = validation_result.get('overall_mae', 0)
products_validated = validation_result.get('products_validated', 0)
poor_accuracy_products = validation_result.get('poor_accuracy_products', [])
context['validation_metrics'] = {
'mape': overall_mape,
'rmse': overall_rmse,
'mae': overall_mae,
'products_validated': products_validated,
'validation_date': yesterday.isoformat()
}
logger.info(
f"Validation complete for tenant {tenant_id}: "
f"MAPE={overall_mape:.2f}%, RMSE={overall_rmse:.2f}, MAE={overall_mae:.2f}, "
f"Products={products_validated}"
)
# Post accuracy insights to AI Insights Service
try:
from uuid import UUID
from datetime import datetime
await self.ai_insights_client.post_accuracy_metrics(
tenant_id=UUID(tenant_id),
validation_date=datetime.combine(yesterday, datetime.min.time()),
metrics={
'overall_mape': overall_mape,
'overall_rmse': overall_rmse,
'overall_mae': overall_mae,
'products_validated': products_validated,
'poor_accuracy_products': poor_accuracy_products
}
)
logger.info(f"Posted accuracy metrics to AI Insights Service")
except Exception as e:
logger.warning(f"Could not post accuracy metrics to AI Insights: {e}")
# Trigger retraining for products with poor accuracy
if poor_accuracy_products and len(poor_accuracy_products) > 0:
logger.warning(
f"Found {len(poor_accuracy_products)} products with MAPE > 30%, "
f"triggering retraining"
)
retraining_triggered = 0
for product_data in poor_accuracy_products:
product_id = product_data.get('product_id')
product_mape = product_data.get('mape', 0)
if not product_id:
continue
try:
await self.training_client.trigger_retrain(
tenant_id=tenant_id,
inventory_product_id=product_id,
reason='accuracy_degradation',
metadata={
'previous_mape': product_mape,
'validation_date': yesterday.isoformat(),
'triggered_by': 'orchestration_validation'
}
)
retraining_triggered += 1
logger.info(
f"Triggered retraining for product {product_id} "
f"(MAPE={product_mape:.2f}%)"
)
except Exception as e:
logger.error(
f"Failed to trigger retraining for product {product_id}: {e}"
)
context['retraining_triggered'] = retraining_triggered
logger.info(f"Triggered retraining for {retraining_triggered} products")
else:
logger.info("All products have acceptable accuracy (MAPE <= 30%)")
context['retraining_triggered'] = 0
return {
'validated': True,
'metrics': context['validation_metrics'],
'retraining_triggered': context.get('retraining_triggered', 0)
}
except Exception as e:
# Don't fail the saga if validation fails
logger.warning(f"Forecast validation failed for tenant {tenant_id}: {e}")
return {
'validated': False,
'error': str(e),
'retraining_triggered': 0
}
# ========================================================================
# Utility Methods
# ========================================================================

View File

@@ -26,7 +26,11 @@ from shared.clients.forecast_client import ForecastServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.clients.notification_client import NotificationServiceClient
from shared.utils.tenant_settings_client import TenantSettingsClient
from shared.clients.tenant_client import TenantServiceClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.clients.training_client import TrainingServiceClient
from shared.utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpenError
from app.core.config import settings
from app.repositories.orchestration_run_repository import OrchestrationRunRepository
@@ -46,11 +50,16 @@ class OrchestratorSchedulerService(BaseAlertService):
super().__init__(config)
# Service clients
self.forecast_client = ForecastServiceClient(config)
self.production_client = ProductionServiceClient(config)
self.procurement_client = ProcurementServiceClient(config)
self.notification_client = NotificationServiceClient(config)
self.tenant_settings_client = TenantSettingsClient(tenant_service_url=config.TENANT_SERVICE_URL)
self.forecast_client = ForecastServiceClient(config, "orchestrator-service")
self.production_client = ProductionServiceClient(config, "orchestrator-service")
self.procurement_client = ProcurementServiceClient(config, "orchestrator-service")
self.notification_client = NotificationServiceClient(config, "orchestrator-service")
self.tenant_client = TenantServiceClient(config)
self.training_client = TrainingServiceClient(config, "orchestrator-service")
# Clients for centralized data fetching
self.inventory_client = InventoryServiceClient(config, "orchestrator-service")
self.suppliers_client = SuppliersServiceClient(config, "orchestrator-service")
self.recipes_client = RecipesServiceClient(config, "orchestrator-service")
# Circuit breakers for each service
self.forecast_breaker = CircuitBreaker(
@@ -183,11 +192,19 @@ class OrchestratorSchedulerService(BaseAlertService):
# Set timeout for entire tenant orchestration
async with asyncio.timeout(settings.TENANT_TIMEOUT_SECONDS):
# Execute orchestration using Saga pattern
# AI enhancement is enabled via ORCHESTRATION_USE_AI_INSIGHTS config
saga = OrchestrationSaga(
forecast_client=self.forecast_client,
production_client=self.production_client,
procurement_client=self.procurement_client,
notification_client=self.notification_client
notification_client=self.notification_client,
inventory_client=self.inventory_client,
suppliers_client=self.suppliers_client,
recipes_client=self.recipes_client,
training_client=self.training_client,
use_ai_enhancement=settings.ORCHESTRATION_USE_AI_INSIGHTS,
ai_insights_base_url=settings.AI_INSIGHTS_SERVICE_URL,
ai_insights_min_confidence=settings.AI_INSIGHTS_MIN_CONFIDENCE
)
result = await saga.execute_orchestration(
@@ -238,7 +255,7 @@ class OrchestratorSchedulerService(BaseAlertService):
# Call Tenant Service with circuit breaker
tenants_data = await self.tenant_breaker.call(
self.tenant_settings_client.get_active_tenants
self.tenant_client.get_active_tenants
)
if not tenants_data:

View File

@@ -1,392 +0,0 @@
"""
Orchestrator Scheduler Service - REFACTORED
Coordinates daily auto-generation workflow: Forecasting → Production → Procurement → Notifications
CHANGES FROM ORIGINAL:
- Removed all TODO/stub code
- Integrated OrchestrationSaga for error handling and compensation
- Added circuit breakers for all service calls
- Implemented real Forecasting Service integration
- Implemented real Production Service integration
- Implemented real Tenant Service integration
- Implemented real Notification Service integration
- NO backwards compatibility, NO feature flags - complete rewrite
"""
import asyncio
import uuid
from datetime import datetime, date, timezone
from decimal import Decimal
from typing import List, Dict, Any, Optional
import structlog
from apscheduler.triggers.cron import CronTrigger
from shared.alerts.base_service import BaseAlertService
from shared.clients.forecast_client import ForecastServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.clients.notification_client import NotificationServiceClient
from shared.clients.tenant_settings_client import TenantSettingsClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpenError
from app.core.config import settings
from app.repositories.orchestration_run_repository import OrchestrationRunRepository
from app.models.orchestration_run import OrchestrationStatus
from app.services.orchestration_saga import OrchestrationSaga
logger = structlog.get_logger()
class OrchestratorSchedulerService(BaseAlertService):
"""
Orchestrator Service extending BaseAlertService
Handles automated daily orchestration of forecasting, production, and procurement
"""
def __init__(self, config):
    """
    Initialize the scheduler service.

    Wires up one HTTP client per downstream service plus a circuit
    breaker per service so repeated failures short-circuit quickly.

    Args:
        config: Shared service configuration object, passed through to
            BaseAlertService and to every service client.
    """
    super().__init__(config)
    # Service clients
    self.forecast_client = ForecastServiceClient(config)
    self.production_client = ProductionServiceClient(config)
    self.procurement_client = ProcurementServiceClient(config)
    self.notification_client = NotificationServiceClient(config)
    self.tenant_settings_client = TenantSettingsClient(config)
    # NEW: Clients for centralized data fetching
    self.inventory_client = InventoryServiceClient(config)
    self.suppliers_client = SuppliersServiceClient(config)
    self.recipes_client = RecipesServiceClient(config)
    # Circuit breakers for each service: open after `failure_threshold`
    # consecutive failures, probe again after `timeout_duration` seconds,
    # close after `success_threshold` consecutive successes.
    self.forecast_breaker = CircuitBreaker(
        failure_threshold=5,
        timeout_duration=60,
        success_threshold=2
    )
    self.production_breaker = CircuitBreaker(
        failure_threshold=5,
        timeout_duration=60,
        success_threshold=2
    )
    self.procurement_breaker = CircuitBreaker(
        failure_threshold=5,
        timeout_duration=60,
        success_threshold=2
    )
    # Tenant lookups trip faster and recover sooner than the heavy services
    self.tenant_breaker = CircuitBreaker(
        failure_threshold=3,
        timeout_duration=30,
        success_threshold=2
    )
def setup_scheduled_checks(self):
    """
    Register the daily orchestration cron job.

    The schedule comes from ORCHESTRATION_SCHEDULE (standard 5-field
    cron syntax); a malformed value falls back to 5:30 AM daily.
    """
    fields = settings.ORCHESTRATION_SCHEDULE.split()
    if len(fields) != 5:
        # Malformed cron expression -> use the default schedule
        fields = ["30", "5", "*", "*", "*"]
    minute, hour, day, month, day_of_week = fields

    trigger = CronTrigger(
        minute=minute,
        hour=hour,
        day=day,
        month=month,
        day_of_week=day_of_week
    )
    self.scheduler.add_job(
        func=self.run_daily_orchestration,
        trigger=trigger,
        id="daily_orchestration",
        name="Daily Orchestration (Forecasting → Production → Procurement)",
        misfire_grace_time=300,  # tolerate up to 5 minutes of scheduler delay
        max_instances=1  # never overlap two orchestration runs
    )
    logger.info("Orchestrator scheduler configured",
                schedule=settings.ORCHESTRATION_SCHEDULE)
async def run_daily_orchestration(self):
    """
    Run the daily orchestration workflow for every active tenant.

    Only executes on the leader instance and when orchestration is
    enabled. Tenants are processed concurrently, bounded by
    MAX_CONCURRENT_TENANTS; individual tenant failures are counted
    but never abort the batch.
    """
    if not self.is_leader:
        logger.debug("Not leader, skipping orchestration")
        return
    if not settings.ORCHESTRATION_ENABLED:
        logger.info("Orchestration disabled via config")
        return

    logger.info("Starting daily orchestration workflow")
    try:
        active_tenants = await self._get_active_tenants()
        if not active_tenants:
            logger.warning("No active tenants found for orchestration")
            return
        logger.info("Processing tenants",
                    total_tenants=len(active_tenants))

        # Bound tenant-level parallelism
        semaphore = asyncio.Semaphore(settings.MAX_CONCURRENT_TENANTS)

        async def bounded_run(tenant_id):
            async with semaphore:
                return await self._orchestrate_tenant(tenant_id)

        outcomes = await asyncio.gather(
            *(bounded_run(tenant_id) for tenant_id in active_tenants),
            return_exceptions=True
        )

        # Summarize: an exception or falsy result counts as a failure
        ok = sum(1 for r in outcomes if r and not isinstance(r, Exception))
        logger.info("Daily orchestration completed",
                    total_tenants=len(active_tenants),
                    successful=ok,
                    failed=len(outcomes) - ok)
    except Exception as e:
        logger.error("Error in daily orchestration",
                     error=str(e), exc_info=True)
async def _orchestrate_tenant(self, tenant_id: uuid.UUID) -> bool:
    """
    Orchestrate workflow for a single tenant using Saga pattern.

    Creates a persistent orchestration-run record first, then executes
    the saga under a hard timeout; the run record is updated with the
    outcome (completed or failed) on every path.

    Returns True if successful, False otherwise
    """
    logger.info("Starting orchestration for tenant", tenant_id=str(tenant_id))
    # Create orchestration run record up front so the run is visible
    # (status=running) even if the saga crashes later.
    async with self.db_manager.get_session() as session:
        repo = OrchestrationRunRepository(session)
        run_number = await repo.generate_run_number()
        run = await repo.create_run({
            'run_number': run_number,
            'tenant_id': tenant_id,
            'status': OrchestrationStatus.running,
            'run_type': 'scheduled',
            'started_at': datetime.now(timezone.utc),
            'triggered_by': 'scheduler'
        })
        await session.commit()
        # Capture the ID while the session is still open
        run_id = run.id
    try:
        # Set timeout for entire tenant orchestration
        async with asyncio.timeout(settings.TENANT_TIMEOUT_SECONDS):
            # Execute orchestration using Saga pattern
            saga = OrchestrationSaga(
                forecast_client=self.forecast_client,
                production_client=self.production_client,
                procurement_client=self.procurement_client,
                notification_client=self.notification_client,
                inventory_client=self.inventory_client,  # NEW
                suppliers_client=self.suppliers_client,  # NEW
                recipes_client=self.recipes_client  # NEW
            )
            result = await saga.execute_orchestration(
                tenant_id=str(tenant_id),
                orchestration_run_id=str(run_id)
            )
            if result['success']:
                # Update orchestration run with saga results
                await self._complete_orchestration_run_with_saga(
                    run_id,
                    result
                )
                logger.info("Tenant orchestration completed successfully",
                            tenant_id=str(tenant_id), run_id=str(run_id))
                return True
            else:
                # Saga failed (with compensation already applied by the saga)
                await self._mark_orchestration_failed(
                    run_id,
                    result.get('error', 'Saga execution failed')
                )
                return False
    except asyncio.TimeoutError:
        logger.error("Tenant orchestration timeout",
                     tenant_id=str(tenant_id),
                     timeout_seconds=settings.TENANT_TIMEOUT_SECONDS)
        await self._mark_orchestration_failed(run_id, "Timeout exceeded")
        return False
    except Exception as e:
        logger.error("Tenant orchestration failed",
                     tenant_id=str(tenant_id),
                     error=str(e), exc_info=True)
        await self._mark_orchestration_failed(run_id, str(e))
        return False
async def _get_active_tenants(self) -> List[uuid.UUID]:
"""
Get list of active tenants for orchestration
REAL IMPLEMENTATION (no stubs)
"""
try:
logger.info("Fetching active tenants from Tenant Service")
# Call Tenant Service with circuit breaker
tenants_data = await self.tenant_breaker.call(
self.tenant_settings_client.get_active_tenants
)
if not tenants_data:
logger.warning("Tenant Service returned no active tenants")
return []
# Extract tenant IDs
tenant_ids = []
for tenant in tenants_data:
tenant_id = tenant.get('id') or tenant.get('tenant_id')
if tenant_id:
# Convert string to UUID if needed
if isinstance(tenant_id, str):
tenant_id = uuid.UUID(tenant_id)
tenant_ids.append(tenant_id)
logger.info(f"Found {len(tenant_ids)} active tenants for orchestration")
return tenant_ids
except CircuitBreakerOpenError:
logger.error("Circuit breaker open for Tenant Service, skipping orchestration")
return []
except Exception as e:
logger.error("Error getting active tenants", error=str(e), exc_info=True)
return []
async def _complete_orchestration_run_with_saga(
    self,
    run_id: uuid.UUID,
    saga_result: Dict[str, Any]
):
    """
    Complete orchestration run with saga results.

    Persists the entity IDs produced by each saga step, per-step status
    timestamps, and the saga step counters. A missing run record is
    silently ignored.

    Args:
        run_id: Orchestration run ID
        saga_result: Result from saga execution
    """
    async with self.db_manager.get_session() as session:
        repo = OrchestrationRunRepository(session)
        run = await repo.get_run_by_id(run_id)
        if run:
            started_at = run.started_at
            completed_at = datetime.now(timezone.utc)
            duration = (completed_at - started_at).total_seconds()
            # Extract results from saga
            forecast_id = saga_result.get('forecast_id')
            production_schedule_id = saga_result.get('production_schedule_id')
            procurement_plan_id = saga_result.get('procurement_plan_id')
            notifications_sent = saga_result.get('notifications_sent', 0)
            # Get saga summary
            saga_summary = saga_result.get('saga_summary', {})
            total_steps = saga_summary.get('total_steps', 0)
            completed_steps = saga_summary.get('completed_steps', 0)
            # All per-step statuses are 'success' here because the saga
            # only reports success when every step completed.
            await repo.update_run(run_id, {
                'status': OrchestrationStatus.completed,
                'completed_at': completed_at,
                'duration_seconds': int(duration),
                'forecast_id': forecast_id,
                'forecasting_status': 'success',
                'forecasting_completed_at': completed_at,
                'forecasts_generated': 1,  # Placeholder
                'production_schedule_id': production_schedule_id,
                'production_status': 'success',
                'production_completed_at': completed_at,
                'production_batches_created': 0,  # Placeholder
                'procurement_plan_id': procurement_plan_id,
                'procurement_status': 'success',
                'procurement_completed_at': completed_at,
                'procurement_plans_created': 1,
                'purchase_orders_created': 0,  # Placeholder
                'notification_status': 'success',
                'notification_completed_at': completed_at,
                'notifications_sent': notifications_sent,
                'saga_steps_total': total_steps,
                'saga_steps_completed': completed_steps
            })
            await session.commit()
async def _mark_orchestration_failed(self, run_id: uuid.UUID, error_message: str):
    """Persist a failed status (with duration and error) on an orchestration run."""
    async with self.db_manager.get_session() as session:
        repo = OrchestrationRunRepository(session)
        run = await repo.get_run_by_id(run_id)
        if run:
            finished_at = datetime.now(timezone.utc)
            elapsed = (finished_at - run.started_at).total_seconds()
            await repo.update_run(run_id, {
                'status': OrchestrationStatus.failed,
                'completed_at': finished_at,
                'duration_seconds': int(elapsed),
                'error_message': error_message
            })
            await session.commit()
# Manual trigger for testing
async def trigger_orchestration_for_tenant(
    self,
    tenant_id: uuid.UUID,
    test_scenario: Optional[str] = None
) -> Dict[str, Any]:
    """
    Manually trigger orchestration for a tenant (for testing).

    Args:
        tenant_id: Tenant ID to orchestrate
        test_scenario: Optional test scenario (full, production_only,
            procurement_only); echoed back in the result

    Returns:
        Dict with orchestration results
    """
    logger.info("Manual orchestration trigger",
                tenant_id=str(tenant_id),
                test_scenario=test_scenario)
    succeeded = await self._orchestrate_tenant(tenant_id)
    outcome = 'Orchestration completed' if succeeded else 'Orchestration failed'
    return {
        'success': succeeded,
        'tenant_id': str(tenant_id),
        'test_scenario': test_scenario,
        'message': outcome
    }
def get_circuit_breaker_stats(self) -> Dict[str, Any]:
    """Snapshot circuit-breaker state for every downstream service (monitoring)."""
    breakers = {
        'forecast_service': self.forecast_breaker,
        'production_service': self.production_breaker,
        'procurement_service': self.procurement_breaker,
        'tenant_service': self.tenant_breaker,
    }
    return {name: breaker.get_stats() for name, breaker in breakers.items()}