""" Orchestration Saga Service Implements saga pattern for orchestrator workflow with compensation logic. Integrates AI-enhanced orchestration when enabled. """ import asyncio import uuid from datetime import datetime, timezone from typing import Dict, Any, Optional import logging from shared.utils.saga_pattern import SagaCoordinator from shared.clients.forecast_client import ForecastServiceClient from shared.clients.production_client import ProductionServiceClient from shared.clients.procurement_client import ProcurementServiceClient from shared.clients.notification_client import NotificationServiceClient from shared.clients.inventory_client import InventoryServiceClient from shared.clients.suppliers_client import SuppliersServiceClient from shared.clients.recipes_client import RecipesServiceClient from shared.clients.ai_insights_client import AIInsightsClient from shared.clients.training_client import TrainingServiceClient logger = logging.getLogger(__name__) class OrchestrationSaga: """ Saga coordinator for orchestration workflow. Workflow Steps: 0. Fetch shared data snapshot (inventory, suppliers, recipes) 0.5. Generate AI insights from ML orchestrators 1. Generate forecasts 2. Generate production schedule 3. Generate procurement plan 4. Send notifications Each step has compensation logic to rollback on failure. """ def __init__( self, forecast_client: ForecastServiceClient, production_client: ProductionServiceClient, procurement_client: ProcurementServiceClient, notification_client: NotificationServiceClient, inventory_client: InventoryServiceClient, suppliers_client: SuppliersServiceClient, recipes_client: RecipesServiceClient, ai_insights_client: Optional[AIInsightsClient] = None, training_client: Optional[TrainingServiceClient] = None, use_ai_enhancement: bool = False, ai_insights_base_url: str = "http://ai-insights-service:8000", ai_insights_min_confidence: int = 70, # Circuit breakers for fault tolerance forecast_breaker: Optional['CircuitBreaker'] = None, production_breaker: Optional['CircuitBreaker'] = None, procurement_breaker: Optional['CircuitBreaker'] = None, inventory_breaker: Optional['CircuitBreaker'] = None, suppliers_breaker: Optional['CircuitBreaker'] = None, recipes_breaker: Optional['CircuitBreaker'] = None ): """ Initialize orchestration saga. 


class OrchestrationSaga:
    """
    Saga coordinator for orchestration workflow.

    Workflow Steps:
        0.   Fetch shared data snapshot (inventory, suppliers, recipes)
        0.5. Generate AI insights from ML orchestrators
        1.   Generate forecasts
        2.   Generate production schedule
        3.   Generate procurement plan
        4.   Send notifications
        5.   Validate previous day's forecasts

    Steps with side effects have compensation logic to roll back on failure.
    """

    def __init__(
        self,
        forecast_client: ForecastServiceClient,
        production_client: ProductionServiceClient,
        procurement_client: ProcurementServiceClient,
        notification_client: NotificationServiceClient,
        inventory_client: InventoryServiceClient,
        suppliers_client: SuppliersServiceClient,
        recipes_client: RecipesServiceClient,
        ai_insights_client: Optional[AIInsightsClient] = None,
        training_client: Optional[TrainingServiceClient] = None,
        use_ai_enhancement: bool = False,
        ai_insights_base_url: str = "http://ai-insights-service:8000",
        ai_insights_min_confidence: int = 70,
        # Circuit breakers for fault tolerance
        forecast_breaker: Optional['CircuitBreaker'] = None,
        production_breaker: Optional['CircuitBreaker'] = None,
        procurement_breaker: Optional['CircuitBreaker'] = None,
        inventory_breaker: Optional['CircuitBreaker'] = None,
        suppliers_breaker: Optional['CircuitBreaker'] = None,
        recipes_breaker: Optional['CircuitBreaker'] = None
    ):
        """
        Initialize orchestration saga.

        Args:
            forecast_client: Forecast service client
            production_client: Production service client
            procurement_client: Procurement service client
            notification_client: Notification service client
            inventory_client: Inventory service client
            suppliers_client: Suppliers service client
            recipes_client: Recipes service client
            ai_insights_client: AI Insights service client
            training_client: Training service client
            use_ai_enhancement: Enable AI-enhanced orchestration
            ai_insights_base_url: Base URL for AI Insights Service
            ai_insights_min_confidence: Minimum confidence threshold for applying insights
        """
        self.forecast_client = forecast_client
        self.production_client = production_client
        self.procurement_client = procurement_client
        self.notification_client = notification_client
        self.inventory_client = inventory_client
        self.suppliers_client = suppliers_client
        self.recipes_client = recipes_client
        self.ai_insights_client = ai_insights_client or AIInsightsClient(
            base_url=ai_insights_base_url
        )
        self.training_client = training_client
        self.use_ai_enhancement = use_ai_enhancement

        # Circuit breakers
        self.forecast_breaker = forecast_breaker
        self.production_breaker = production_breaker
        self.procurement_breaker = procurement_breaker
        self.inventory_breaker = inventory_breaker
        self.suppliers_breaker = suppliers_breaker
        self.recipes_breaker = recipes_breaker

        # Initialize AI enhancer if enabled
        self.ai_enhancer = None
        if use_ai_enhancement:
            try:
                from app.ml.ai_enhanced_orchestrator import AIEnhancedOrchestrator
                self.ai_enhancer = AIEnhancedOrchestrator(
                    ai_insights_base_url=ai_insights_base_url,
                    min_confidence_threshold=ai_insights_min_confidence
                )
                logger.info("AI-enhanced orchestration enabled")
            except ImportError as e:
                logger.warning(f"AI enhancement requested but could not be loaded: {e}")
                self.use_ai_enhancement = False

    async def execute_orchestration(
        self,
        tenant_id: str,
        orchestration_run_id: str
    ) -> Dict[str, Any]:
        """
        Execute full orchestration workflow with saga pattern.

        Args:
            tenant_id: Tenant ID
            orchestration_run_id: Orchestration run ID

        Returns:
            Dictionary with execution results
        """
        saga = SagaCoordinator(saga_id=f"orchestration_{orchestration_run_id}")

        # Store execution context
        context = {
            'tenant_id': tenant_id,
            'orchestration_run_id': orchestration_run_id,
            'forecast_id': None,
            'production_schedule_id': None,
            'procurement_plan_id': None,
            'notifications_sent': 0,
            # NEW: Cached data snapshots to avoid duplicate fetching
            'inventory_snapshot': None,
            'suppliers_snapshot': None,
            'recipes_snapshot': None,
            'forecast_data': None,
            'production_data': None,
            'procurement_data': None
        }

        # Step 0: Fetch shared data snapshot (NEW)
        saga.add_step(
            name="fetch_shared_data_snapshot",
            action=self._fetch_shared_data_snapshot,
            compensation=None,  # No compensation needed for read-only operations
            action_args=(tenant_id, context)
        )

        # Step 0.5: Generate AI insights (NEW)
        saga.add_step(
            name="generate_ai_insights",
            action=self._generate_ai_insights,
            compensation=None,  # No compensation needed for read-only insight generation
            action_args=(tenant_id, context)
        )

        # Step 1: Generate forecasts
        saga.add_step(
            name="generate_forecasts",
            action=self._generate_forecasts,
            compensation=self._compensate_forecasts,
            action_args=(tenant_id, context)
        )

        # Step 2: Generate production schedule
        saga.add_step(
            name="generate_production_schedule",
            action=self._generate_production_schedule,
            compensation=self._compensate_production_schedule,
            action_args=(tenant_id, context)
        )

        # Step 3: Generate procurement plan
        saga.add_step(
            name="generate_procurement_plan",
            action=self._generate_procurement_plan,
            compensation=self._compensate_procurement_plan,
            action_args=(tenant_id, context)
        )

        # Step 4: Send notifications
        saga.add_step(
            name="send_notifications",
            action=self._send_notifications,
            compensation=None,  # No compensation needed for notifications
            action_args=(tenant_id, context)
        )

        # Step 5: Validate previous day's forecasts
        saga.add_step(
            name="validate_previous_forecasts",
            action=self._validate_previous_forecasts,
            compensation=None,  # No compensation needed for validation
            action_args=(tenant_id, context)
        )

        # Execute saga
        success, final_result, error = await saga.execute()

        if success:
            logger.info(
                f"Orchestration saga completed successfully for tenant {tenant_id}"
            )
            return {
                'success': True,
                'forecast_id': context.get('forecast_id'),
                'production_schedule_id': context.get('production_schedule_id'),
                'procurement_plan_id': context.get('procurement_plan_id'),
                'notifications_sent': context.get('notifications_sent', 0),
                'forecast_data': context.get('forecast_data', {}),
                'production_data': context.get('production_data', {}),
                'procurement_data': context.get('procurement_data', {}),
                'ai_insights_generated': context.get('ai_insights_generated', 0),
                'ai_insights_posted': context.get('ai_insights_posted', 0),
                'ai_insights_errors': context.get('ai_insights_errors', []),
                'saga_summary': saga.get_execution_summary()
            }
        else:
            logger.error(
                f"Orchestration saga failed for tenant {tenant_id}: {error}"
            )
            return {
                'success': False,
                'error': str(error),
                'saga_summary': saga.get_execution_summary()
            }

    # ========================================================================
    # Step 0: Fetch Shared Data Snapshot (NEW)
    # ========================================================================

    async def _fetch_shared_data_snapshot(
        self,
        tenant_id: str,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Fetch shared data snapshot once at the beginning of orchestration.

        This eliminates duplicate API calls to the inventory, suppliers, and
        recipes services.

        Args:
            tenant_id: Tenant ID
            context: Execution context

        Returns:
            Dictionary with fetched data
        """
        logger.info(f"Fetching shared data snapshot for tenant {tenant_id}")

        try:
            # Fetch data in parallel for optimal performance, with circuit breaker protection
            async def fetch_inventory():
                if self.inventory_breaker:
                    return await self.inventory_breaker.call(
                        self.inventory_client.get_all_ingredients,
                        tenant_id,
                        is_active=True
                    )
                else:
                    return await self.inventory_client.get_all_ingredients(
                        tenant_id, is_active=True
                    )

            async def fetch_suppliers():
                if self.suppliers_breaker:
                    return await self.suppliers_breaker.call(
                        self.suppliers_client.get_all_suppliers,
                        tenant_id,
                        is_active=True
                    )
                else:
                    return await self.suppliers_client.get_all_suppliers(
                        tenant_id, is_active=True
                    )

            async def fetch_recipes():
                if self.recipes_breaker:
                    return await self.recipes_breaker.call(
                        self.recipes_client.get_all_recipes,
                        tenant_id,
                        is_active=True
                    )
                else:
                    return await self.recipes_client.get_all_recipes(
                        tenant_id, is_active=True
                    )

            # Wait for all data to be fetched
            inventory_data, suppliers_data, recipes_data = await asyncio.gather(
                fetch_inventory(),
                fetch_suppliers(),
                fetch_recipes(),
                return_exceptions=True
            )

            # Handle errors for each fetch
            failures = 0
            if isinstance(inventory_data, Exception):
                logger.error(f"Failed to fetch inventory data: {inventory_data}")
                inventory_data = []
                failures += 1
            if isinstance(suppliers_data, Exception):
                logger.error(f"Failed to fetch suppliers data: {suppliers_data}")
                suppliers_data = []
                failures += 1
            if isinstance(recipes_data, Exception):
                logger.error(f"Failed to fetch recipes data: {recipes_data}")
                recipes_data = []
                failures += 1

            # If all three fetches failed, treat it as a critical failure
            if failures >= 3:
                logger.error(f"All shared data fetches failed for tenant {tenant_id}")
                raise Exception(
                    "Unable to fetch any shared data (inventory, suppliers, recipes)"
                )

            # Store in context for downstream services
            context['inventory_snapshot'] = {
                'ingredients': inventory_data,
                'fetched_at': datetime.now(timezone.utc).isoformat(),
                'count': len(inventory_data) if inventory_data else 0
            }
            context['suppliers_snapshot'] = {
                'suppliers': suppliers_data,
                'fetched_at': datetime.now(timezone.utc).isoformat(),
                'count': len(suppliers_data) if suppliers_data else 0
            }
            context['recipes_snapshot'] = {
                'recipes': recipes_data,
                'fetched_at': datetime.now(timezone.utc).isoformat(),
                'count': len(recipes_data) if recipes_data else 0
            }

            # NEW: Fetch upcoming events for next 7 days
            try:
                # Note: Implement when event calendar service is ready
                # For now, initialize as empty
                context['event_calendar'] = []
                logger.info("Event calendar: not yet implemented, using empty list")
            except Exception as e:
                logger.warning(f"Could not fetch events: {e}")
                context['event_calendar'] = []

            # NEW: Placeholder for traffic predictions (Phase 5)
            # Note: Implement traffic forecasting in Phase 5
            # For now, initialize as empty dict (will be converted to DataFrame when implemented)
            context['traffic_predictions'] = {}
            logger.info("Traffic predictions: not yet implemented, using empty dict")

            logger.info(
                f"Shared data snapshot fetched successfully: "
                f"{len(inventory_data)} ingredients, "
                f"{len(suppliers_data)} suppliers, "
                f"{len(recipes_data)} recipes, "
                f"{len(context.get('event_calendar', []))} events"
            )

            return {
                'success': True,
                'inventory_count': len(inventory_data) if inventory_data else 0,
                'suppliers_count': len(suppliers_data) if suppliers_data else 0,
                'recipes_count': len(recipes_data) if recipes_data else 0,
                'events_count': len(context.get('event_calendar', []))
            }

        except Exception as e:
            logger.error(f"Failed to fetch shared data snapshot for tenant {tenant_id}: {e}")
            raise

    # ========================================================================
    # Step 0.5: Generate AI Insights (NEW)
    # ========================================================================

    async def _generate_ai_insights(
        self,
        tenant_id: str,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate AI insights using HTTP calls to ML insights endpoints.

        This step runs multiple ML insight generators in parallel via HTTP:
        - Dynamic forecasting rules learning (forecasting service)
        - Safety stock optimization (inventory service)
        - Production yield predictions (production service)
        - Supplier performance analysis (procurement service)
        - Price forecasting (procurement service)

        All insights are posted to the AI Insights Service by the respective
        services and can be consumed by downstream orchestration steps.

        Args:
            tenant_id: Tenant ID
            context: Execution context with cached data snapshots

        Returns:
            Dictionary with insights generation results
        """
        logger.info(f"Generating AI insights for tenant {tenant_id} via HTTP endpoints")

        insights_results = {
            'total_insights_generated': 0,
            'total_insights_posted': 0,
            'insights_by_source': {},
            'errors': []
        }

        try:
            # Prepare async tasks for parallel HTTP calls
            ml_tasks = []

            # Task 1: Safety Stock Optimization (inventory service)
            async def trigger_safety_stock_optimization():
                try:
                    result = await self.inventory_client.trigger_safety_stock_optimization(
                        tenant_id=tenant_id,
                        product_ids=None,  # Analyze all products
                        lookback_days=90,
                        min_history_days=30
                    )
                    if result and result.get('success'):
                        return ('safety_stock', {
                            'insights_posted': result.get('total_insights_posted', 0),
                            'insights_generated': result.get('total_insights_generated', 0),
                            'products_optimized': result.get('products_optimized', 0)
                        })
                    else:
                        return ('safety_stock', {
                            'error': result.get('message', 'Unknown error') if result else 'Service returned None',
                            'insights_posted': 0
                        })
                except Exception as e:
                    logger.error(f"Safety stock optimization failed: {e}")
                    return ('safety_stock', {'error': str(e), 'insights_posted': 0})

            ml_tasks.append(trigger_safety_stock_optimization())

            # Task 2: Production Yield Analysis (production service)
            async def trigger_yield_prediction():
                try:
                    result = await self.production_client.trigger_yield_prediction(
                        tenant_id=tenant_id,
                        recipe_ids=None,  # Analyze all recipes
                        lookback_days=90,
                        min_history_runs=30
                    )
                    if result and result.get('success'):
                        return ('yield_analysis', {
                            'insights_posted': result.get('total_insights_posted', 0),
                            'insights_generated': result.get('total_insights_generated', 0),
                            'recipes_analyzed': result.get('recipes_analyzed', 0)
                        })
                    else:
                        return ('yield_analysis', {
                            'error': result.get('message', 'Unknown error') if result else 'Service returned None',
                            'insights_posted': 0
                        })
                except Exception as e:
                    logger.error(f"Yield prediction failed: {e}")
                    return ('yield_analysis', {'error': str(e), 'insights_posted': 0})

            ml_tasks.append(trigger_yield_prediction())

            # Task 3: Supplier Performance Analysis (procurement service)
            async def trigger_supplier_analysis():
                try:
                    result = await self.procurement_client.trigger_supplier_analysis(
                        tenant_id=tenant_id,
                        supplier_ids=None,  # Analyze all suppliers
                        lookback_days=180,
                        min_orders=10
                    )
                    if result and result.get('success'):
                        return ('supplier_analysis', {
                            'insights_posted': result.get('total_insights_posted', 0),
                            'insights_generated': result.get('total_insights_generated', 0),
                            'suppliers_analyzed': result.get('suppliers_analyzed', 0)
                        })
                    else:
                        return ('supplier_analysis', {
                            'error': result.get('message', 'Unknown error') if result else 'Service returned None',
                            'insights_posted': 0
                        })
                except Exception as e:
                    logger.error(f"Supplier analysis failed: {e}")
                    return ('supplier_analysis', {'error': str(e), 'insights_posted': 0})

            ml_tasks.append(trigger_supplier_analysis())

            # Task 4: Price Forecasting (procurement service)
            async def trigger_price_forecasting():
                try:
                    result = await self.procurement_client.trigger_price_forecasting(
                        tenant_id=tenant_id,
                        ingredient_ids=None,  # Forecast all ingredients
                        lookback_days=180,
                        forecast_horizon_days=30
                    )
                    if result and result.get('success'):
                        return ('price_forecast', {
                            'insights_posted': result.get('total_insights_posted', 0),
                            'insights_generated': result.get('total_insights_generated', 0),
                            'ingredients_forecasted': result.get('ingredients_forecasted', 0),
                            'buy_now_recommendations': result.get('buy_now_recommendations', 0)
                        })
                    else:
                        return ('price_forecast', {
                            'error': result.get('message', 'Unknown error') if result else 'Service returned None',
                            'insights_posted': 0
                        })
                except Exception as e:
                    logger.error(f"Price forecasting failed: {e}")
                    return ('price_forecast', {'error': str(e), 'insights_posted': 0})

            ml_tasks.append(trigger_price_forecasting())

            # Task 5: Dynamic Rules Learning (forecasting service)
            async def trigger_rules_generation():
                try:
                    result = await self.forecast_client.trigger_rules_generation(
                        tenant_id=tenant_id,
                        product_ids=None,  # Analyze all products
                        lookback_days=90,
                        min_samples=10
                    )
                    if result and result.get('success'):
                        return ('rules_learning', {
                            'insights_posted': result.get('total_insights_posted', 0),
                            'insights_generated': result.get('total_insights_generated', 0),
                            'products_analyzed': result.get('products_analyzed', 0)
                        })
                    else:
                        return ('rules_learning', {
                            'error': result.get('message', 'Unknown error') if result else 'Service returned None',
                            'insights_posted': 0
                        })
                except Exception as e:
                    logger.error(f"Rules generation failed: {e}")
                    return ('rules_learning', {'error': str(e), 'insights_posted': 0})

            ml_tasks.append(trigger_rules_generation())

            # Run all ML insight generation tasks in parallel
            logger.info(f"Triggering {len(ml_tasks)} ML insight endpoints in parallel")
            results = await asyncio.gather(*ml_tasks, return_exceptions=True)

            # Process results
            for result in results:
                if isinstance(result, Exception):
                    logger.error(f"ML insight task failed with exception: {result}")
                    insights_results['errors'].append(str(result))
                elif isinstance(result, tuple) and len(result) == 2:
                    source, data = result
                    if 'error' in data:
                        insights_results['errors'].append(f"{source}: {data['error']}")
                    else:
                        posted = data.get('insights_posted', 0)
                        generated = data.get('insights_generated', posted)
                        insights_results['total_insights_posted'] += posted
                        insights_results['total_insights_generated'] += generated
                        insights_results['insights_by_source'][source] = posted
                        logger.info(f"{source}: {posted} insights posted")

            # Store insights count and errors in context
            context['ai_insights_generated'] = insights_results['total_insights_generated']
            context['ai_insights_posted'] = insights_results['total_insights_posted']
            context['ai_insights_errors'] = insights_results['errors']

            logger.info(
                f"AI insights generation complete: "
                f"{insights_results['total_insights_posted']} insights posted from "
                f"{len(insights_results['insights_by_source'])} sources"
            )

            return insights_results

        except Exception as e:
logger.error(f"Failed to generate AI insights for tenant {tenant_id}: {e}", exc_info=True) # Don't fail the orchestration if insights generation fails # Log error and continue insights_results['errors'].append(str(e)) context['ai_insights_generated'] = 0 context['ai_insights_posted'] = 0 context['ai_insights_errors'] = insights_results['errors'] return insights_results # ======================================================================== # Step 1: Generate Forecasts # ======================================================================== async def _generate_forecasts( self, tenant_id: str, context: Dict[str, Any] ) -> Dict[str, Any]: """ Generate forecasts for tenant. Args: tenant_id: Tenant ID context: Execution context Returns: Forecast result """ logger.info(f"Generating forecasts for tenant {tenant_id}") try: # Call forecast service with circuit breaker protection if self.forecast_breaker: result = await self.forecast_breaker.call( self.forecast_client.generate_forecasts, tenant_id ) else: result = await self.forecast_client.generate_forecasts(tenant_id) if not result: logger.error(f"Forecast service returned None for tenant {tenant_id}") raise Exception("Forecast service returned None") # Store forecast ID in context forecast_id = result.get('forecast_id') or result.get('id') context['forecast_id'] = forecast_id context['forecast_data'] = result logger.info( f"Forecasts generated successfully: {forecast_id}, " f"{result.get('forecasts_created', 0)} forecasts created" ) # Ensure tenant_id is in result for compensation result['tenant_id'] = tenant_id return result except Exception as e: logger.error(f"Failed to generate forecasts for tenant {tenant_id}: {e}") raise async def _compensate_forecasts(self, forecast_result: Dict[str, Any]): """ Compensate forecast generation (delete generated forecasts). Args: forecast_result: Result from forecast generation """ forecast_id = forecast_result.get('forecast_id') or forecast_result.get('id') if not forecast_id: logger.warning("No forecast ID to compensate") return logger.info(f"Compensating forecasts: {forecast_id}") try: # Call forecast service to delete the forecast tenant_id = forecast_result.get('tenant_id') if tenant_id: await self.forecast_client.delete_forecast(tenant_id, forecast_id) logger.info(f"Successfully deleted forecast {forecast_id} (compensation)") else: logger.warning(f"Cannot compensate forecast {forecast_id}: no tenant_id in result") except Exception as e: logger.error(f"Failed to compensate forecasts {forecast_id}: {e}") # ======================================================================== # Step 2: Generate Production Schedule # ======================================================================== async def _generate_production_schedule( self, tenant_id: str, context: Dict[str, Any] ) -> Dict[str, Any]: """ Generate production schedule for tenant. 

        Args:
            tenant_id: Tenant ID
            context: Execution context

        Returns:
            Production schedule result
        """
        logger.info(f"Generating production schedule for tenant {tenant_id}")

        forecast_data = context.get('forecast_data', {})
        inventory_snapshot = context.get('inventory_snapshot', {})
        recipes_snapshot = context.get('recipes_snapshot', {})

        try:
            # Call production service with cached data and circuit breaker protection
            if self.production_breaker:
                result = await self.production_breaker.call(
                    self.production_client.generate_schedule,
                    tenant_id=tenant_id,
                    forecast_data=forecast_data,
                    inventory_data=inventory_snapshot,
                    recipes_data=recipes_snapshot
                )
            else:
                result = await self.production_client.generate_schedule(
                    tenant_id=tenant_id,
                    forecast_data=forecast_data,
                    inventory_data=inventory_snapshot,
                    recipes_data=recipes_snapshot
                )

            if not result:
                logger.error(f"Production service returned None for tenant {tenant_id}")
                raise Exception("Production service returned None")

            # Store schedule ID in context
            schedule_id = result.get('schedule_id') or result.get('id')
            context['production_schedule_id'] = schedule_id
            context['production_data'] = result

            logger.info(
                f"Production schedule generated successfully: {schedule_id}, "
                f"{result.get('batches_created', 0)} batches created"
            )

            # Ensure tenant_id is in result for compensation
            result['tenant_id'] = tenant_id
            return result

        except Exception as e:
            logger.error(
                f"Failed to generate production schedule for tenant {tenant_id}: {e}"
            )
            raise

    async def _compensate_production_schedule(
        self,
        production_result: Dict[str, Any]
    ):
        """
        Compensate production schedule (delete schedule).

        Args:
            production_result: Result from production generation
        """
        schedule_id = production_result.get('schedule_id') or production_result.get('id')
        if not schedule_id:
            logger.warning("No production schedule ID to compensate")
            return

        logger.info(f"Compensating production schedule: {schedule_id}")

        try:
            # Call production service to delete the schedule
            tenant_id = production_result.get('tenant_id')
            if tenant_id:
                await self.production_client.delete_schedule(tenant_id, schedule_id)
                logger.info(
                    f"Successfully deleted production schedule {schedule_id} (compensation)"
                )
            else:
                logger.warning(f"Cannot compensate schedule {schedule_id}: no tenant_id in result")
        except Exception as e:
            logger.error(
                f"Failed to compensate production schedule {schedule_id}: {e}"
            )

    # ========================================================================
    # Step 3: Generate Procurement Plan
    # ========================================================================

    async def _generate_procurement_plan(
        self,
        tenant_id: str,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate procurement plan for tenant.

        Args:
            tenant_id: Tenant ID
            context: Execution context

        Returns:
            Procurement plan result
        """
        logger.info(f"Generating procurement plan for tenant {tenant_id}")

        forecast_data = context.get('forecast_data', {})
        production_schedule_id = context.get('production_schedule_id')
        inventory_snapshot = context.get('inventory_snapshot', {})
        suppliers_snapshot = context.get('suppliers_snapshot', {})
        recipes_snapshot = context.get('recipes_snapshot', {})

        try:
            # Call procurement service with cached data and circuit breaker protection
            if self.procurement_breaker:
                result = await self.procurement_breaker.call(
                    self.procurement_client.auto_generate_procurement,
                    tenant_id=tenant_id,
                    forecast_data=forecast_data,
                    production_schedule_id=production_schedule_id,
                    inventory_data=inventory_snapshot,
                    suppliers_data=suppliers_snapshot,
                    recipes_data=recipes_snapshot
                )
            else:
                result = await self.procurement_client.auto_generate_procurement(
                    tenant_id=tenant_id,
                    forecast_data=forecast_data,
                    production_schedule_id=production_schedule_id,
                    inventory_data=inventory_snapshot,
                    suppliers_data=suppliers_snapshot,
                    recipes_data=recipes_snapshot
                )

            if not result:
                logger.error(f"Procurement service returned None for tenant {tenant_id}")
                raise Exception("Procurement service returned None")

            # Store plan ID in context
            plan_id = result.get('plan_id') or result.get('id')
            context['procurement_plan_id'] = plan_id
            context['procurement_data'] = result

            logger.info(
                f"Procurement plan generated successfully: {plan_id}, "
                f"{result.get('requirements_created', 0)} requirements, "
                f"{result.get('pos_created', 0)} purchase orders created"
            )

            # Ensure tenant_id is in result for compensation
            result['tenant_id'] = tenant_id
            return result

        except Exception as e:
            logger.error(
                f"Failed to generate procurement plan for tenant {tenant_id}: {e}"
            )
            raise

    async def _compensate_procurement_plan(
        self,
        procurement_result: Dict[str, Any]
    ):
        """
        Compensate procurement plan (delete plan and POs).

        Args:
            procurement_result: Result from procurement generation
        """
        plan_id = procurement_result.get('plan_id') or procurement_result.get('id')
        if not plan_id:
            logger.warning("No procurement plan ID to compensate")
            return

        logger.info(f"Compensating procurement plan: {plan_id}")

        try:
            # Call procurement service to delete plan (this should cascade delete requirements and POs)
            tenant_id = procurement_result.get('tenant_id')
            if tenant_id:
                await self.procurement_client.delete_plan(tenant_id, plan_id)
                logger.info(
                    f"Successfully deleted procurement plan {plan_id} and associated POs (compensation)"
                )
            else:
                logger.warning(f"Cannot compensate plan {plan_id}: no tenant_id in result")
        except Exception as e:
            logger.error(f"Failed to compensate procurement plan {plan_id}: {e}")

    # ========================================================================
    # Step 4: Send Notifications
    # ========================================================================

    async def _send_notifications(
        self,
        tenant_id: str,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Send workflow completion notifications.

        Args:
            tenant_id: Tenant ID
            context: Execution context

        Returns:
            Notification result
        """
        logger.info(f"Sending notifications for tenant {tenant_id}")

        try:
            # Prepare notification data
            notification_data = {
                'tenant_id': tenant_id,
                'orchestration_run_id': context.get('orchestration_run_id'),
                'forecast_id': context.get('forecast_id'),
                'production_schedule_id': context.get('production_schedule_id'),
                'procurement_plan_id': context.get('procurement_plan_id'),
                'forecasts_created': context.get('forecast_data', {}).get('forecasts_created', 0),
                'batches_created': context.get('production_data', {}).get('batches_created', 0),
                'requirements_created': context.get('procurement_data', {}).get('requirements_created', 0),
                'pos_created': context.get('procurement_data', {}).get('pos_created', 0)
            }

            # Call notification service
            result = await self.notification_client.send_workflow_summary(
                tenant_id=tenant_id,
                notification_data=notification_data
            )

            if result:
                notifications_sent = result.get('notifications_sent', 0)
                context['notifications_sent'] = notifications_sent
                logger.info(f"Notifications sent successfully: {notifications_sent}")
                return result
            else:
                logger.warning(f"Notification service returned None for tenant {tenant_id}")
                return {'notifications_sent': 0, 'error': 'Notification service returned None'}

        except Exception as e:
            # Log error but don't fail the saga for notification failures
            logger.error(
                f"NOTIFICATION FAILURE: Failed to send notifications for tenant {tenant_id}: {e}",
                exc_info=True
            )
            # Store failure information in context
            context['notification_failed'] = True
            context['notification_error'] = str(e)
            # Return empty result instead of raising
            return {'notifications_sent': 0, 'error': str(e), 'failed': True}

    # ========================================================================
    # Step 5: Validate Previous Day's Forecasts
    # ========================================================================

    async def _validate_previous_forecasts(
        self,
        tenant_id: str,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Validate yesterday's forecasts against actual sales.

        Calculate accuracy metrics (MAPE, RMSE, MAE) and trigger retraining if needed.

        Args:
            tenant_id: Tenant ID
            context: Execution context

        Returns:
            Validation result with metrics
        """
        from datetime import date, timedelta

        logger.info(f"Validating previous day's forecasts for tenant {tenant_id}")

        try:
            yesterday = date.today() - timedelta(days=1)

            # Call forecasting service validation endpoint
            validation_result = await self.forecast_client.validate_forecasts(
                tenant_id=tenant_id,
                date=yesterday
            )

            if not validation_result:
                logger.warning(f"No validation results returned for tenant {tenant_id}")
                return {'validated': False, 'reason': 'no_data'}

            # Extract metrics
            overall_mape = validation_result.get('overall_mape', 0)
            overall_rmse = validation_result.get('overall_rmse', 0)
            overall_mae = validation_result.get('overall_mae', 0)
            products_validated = validation_result.get('products_validated', 0)
            poor_accuracy_products = validation_result.get('poor_accuracy_products', [])

            context['validation_metrics'] = {
                'mape': overall_mape,
                'rmse': overall_rmse,
                'mae': overall_mae,
                'products_validated': products_validated,
                'validation_date': yesterday.isoformat()
            }

            logger.info(
                f"Validation complete for tenant {tenant_id}: "
                f"MAPE={overall_mape:.2f}%, RMSE={overall_rmse:.2f}, MAE={overall_mae:.2f}, "
                f"Products={products_validated}"
            )

            # Post accuracy insights to AI Insights Service
            try:
                from uuid import UUID

                await self.ai_insights_client.post_accuracy_metrics(
                    tenant_id=UUID(tenant_id),
                    validation_date=datetime.combine(yesterday, datetime.min.time()),
                    metrics={
                        'overall_mape': overall_mape,
                        'overall_rmse': overall_rmse,
                        'overall_mae': overall_mae,
                        'products_validated': products_validated,
                        'poor_accuracy_products': poor_accuracy_products
                    }
                )
                logger.info("Posted accuracy metrics to AI Insights Service")
            except Exception as e:
                logger.warning(f"Could not post accuracy metrics to AI Insights: {e}")

            # Trigger retraining for products with poor accuracy
            if poor_accuracy_products:
                logger.warning(
                    f"Found {len(poor_accuracy_products)} products with MAPE > 30%, "
                    f"triggering retraining"
                )

                retraining_triggered = 0

                # Check if training client is available
                if not self.training_client:
                    logger.warning(
                        f"Training client not available, cannot trigger retraining for "
                        f"{len(poor_accuracy_products)} products"
                    )
                    context['retraining_triggered'] = 0
                else:
                    for product_data in poor_accuracy_products:
                        product_id = product_data.get('product_id')
                        product_mape = product_data.get('mape', 0)

                        if not product_id:
                            continue

                        try:
                            await self.training_client.trigger_retrain(
                                tenant_id=tenant_id,
                                inventory_product_id=product_id,
                                reason='accuracy_degradation',
                                metadata={
                                    'previous_mape': product_mape,
                                    'validation_date': yesterday.isoformat(),
                                    'triggered_by': 'orchestration_validation'
                                }
                            )
                            retraining_triggered += 1
                            logger.info(
                                f"Triggered retraining for product {product_id} "
                                f"(MAPE={product_mape:.2f}%)"
                            )
                        except Exception as e:
                            logger.error(
                                f"Failed to trigger retraining for product {product_id}: {e}"
                            )

                    context['retraining_triggered'] = retraining_triggered
                    logger.info(f"Triggered retraining for {retraining_triggered} products")
            else:
                logger.info("All products have acceptable accuracy (MAPE <= 30%)")
                context['retraining_triggered'] = 0

            return {
                'validated': True,
                'metrics': context['validation_metrics'],
                'retraining_triggered': context.get('retraining_triggered', 0)
            }

        except Exception as e:
            # Don't fail the saga if validation fails, but log prominently
            logger.error(
                f"VALIDATION FAILURE: Forecast validation failed for tenant {tenant_id}: {e}",
                exc_info=True
            )
            # Store failure information in context
            context['validation_failed'] = True
            context['validation_error'] = str(e)
            return {
                'validated': False,
                'error': str(e),
                'retraining_triggered': 0,
                'failed': True
            }

    # ========================================================================
    # Utility Methods
    # ========================================================================

    async def execute_with_timeout(
        self,
        tenant_id: str,
        orchestration_run_id: str,
        timeout_seconds: int = 600
    ) -> Dict[str, Any]:
        """
        Execute orchestration with timeout.

        Args:
            tenant_id: Tenant ID
            orchestration_run_id: Orchestration run ID
            timeout_seconds: Timeout in seconds

        Returns:
            Execution result
        """
        try:
            result = await asyncio.wait_for(
                self.execute_orchestration(tenant_id, orchestration_run_id),
                timeout=timeout_seconds
            )
            return result
        except asyncio.TimeoutError:
            logger.error(
                f"Orchestration timed out after {timeout_seconds}s for tenant {tenant_id}"
            )
            return {
                'success': False,
                'error': f'Orchestration timed out after {timeout_seconds} seconds',
                'timeout': True
            }
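

# Usage sketch (illustrative only, never called from this module): how a
# caller might wire the saga together and run one tenant with a timeout.
# The `base_url` keyword on each client constructor is an assumption for
# illustration; the real constructor signatures live in the respective
# shared.clients modules.
async def _example_orchestration_run(tenant_id: str) -> Dict[str, Any]:
    """Illustrative wiring of OrchestrationSaga under assumed client ctors."""
    from uuid import uuid4

    saga = OrchestrationSaga(
        forecast_client=ForecastServiceClient(base_url="http://forecast-service:8000"),
        production_client=ProductionServiceClient(base_url="http://production-service:8000"),
        procurement_client=ProcurementServiceClient(base_url="http://procurement-service:8000"),
        notification_client=NotificationServiceClient(base_url="http://notification-service:8000"),
        inventory_client=InventoryServiceClient(base_url="http://inventory-service:8000"),
        suppliers_client=SuppliersServiceClient(base_url="http://suppliers-service:8000"),
        recipes_client=RecipesServiceClient(base_url="http://recipes-service:8000"),
        use_ai_enhancement=False,
    )
    # execute_with_timeout wraps the saga in asyncio.wait_for and reports a
    # timeout as a failed result dict instead of raising.
    return await saga.execute_with_timeout(
        tenant_id=tenant_id,
        orchestration_run_id=str(uuid4()),
        timeout_seconds=600,
    )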