Improve AI logic

This commit is contained in:
Urtzi Alfaro
2025-11-05 13:34:56 +01:00
parent 5c87fbcf48
commit 394ad3aea4
218 changed files with 30627 additions and 7658 deletions

View File

@@ -2,6 +2,7 @@
Orchestration Saga Service
Implements saga pattern for orchestrator workflow with compensation logic.
Integrates AI-enhanced orchestration when enabled.
"""
import asyncio
@@ -18,6 +19,8 @@ from shared.clients.notification_client import NotificationServiceClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.clients.ai_insights_client import AIInsightsClient
from shared.clients.training_client import TrainingServiceClient
logger = logging.getLogger(__name__)
@@ -27,7 +30,8 @@ class OrchestrationSaga:
Saga coordinator for orchestration workflow.
Workflow Steps:
0. Fetch shared data snapshot (inventory, suppliers, recipes) - NEW
0. Fetch shared data snapshot (inventory, suppliers, recipes)
0.5. Generate AI insights from ML orchestrators
1. Generate forecasts
2. Generate production schedule
3. Generate procurement plan
@@ -44,7 +48,12 @@ class OrchestrationSaga:
notification_client: NotificationServiceClient,
inventory_client: InventoryServiceClient,
suppliers_client: SuppliersServiceClient,
recipes_client: RecipesServiceClient
recipes_client: RecipesServiceClient,
ai_insights_client: Optional[AIInsightsClient] = None,
training_client: Optional[TrainingServiceClient] = None,
use_ai_enhancement: bool = False,
ai_insights_base_url: str = "http://ai-insights-service:8000",
ai_insights_min_confidence: int = 70
):
"""
Initialize orchestration saga.
@@ -54,9 +63,14 @@ class OrchestrationSaga:
production_client: Production service client
procurement_client: Procurement service client
notification_client: Notification service client
inventory_client: Inventory service client (NEW)
suppliers_client: Suppliers service client (NEW)
recipes_client: Recipes service client (NEW)
inventory_client: Inventory service client
suppliers_client: Suppliers service client
recipes_client: Recipes service client
ai_insights_client: AI Insights service client
training_client: Training service client
use_ai_enhancement: Enable AI-enhanced orchestration
ai_insights_base_url: Base URL for AI Insights Service
ai_insights_min_confidence: Minimum confidence threshold for applying insights
"""
self.forecast_client = forecast_client
self.production_client = production_client
@@ -65,6 +79,25 @@ class OrchestrationSaga:
self.inventory_client = inventory_client
self.suppliers_client = suppliers_client
self.recipes_client = recipes_client
self.ai_insights_client = ai_insights_client or AIInsightsClient(
base_url=ai_insights_base_url
)
self.training_client = training_client
self.use_ai_enhancement = use_ai_enhancement
# Initialize AI enhancer if enabled
self.ai_enhancer = None
if use_ai_enhancement:
try:
from app.ml.ai_enhanced_orchestrator import AIEnhancedOrchestrator
self.ai_enhancer = AIEnhancedOrchestrator(
ai_insights_base_url=ai_insights_base_url,
min_confidence_threshold=ai_insights_min_confidence
)
logger.info("AI-enhanced orchestration enabled")
except ImportError as e:
logger.warning(f"AI enhancement requested but could not be loaded: {e}")
self.use_ai_enhancement = False
async def execute_orchestration(
self,
@@ -108,6 +141,14 @@ class OrchestrationSaga:
action_args=(tenant_id, context)
)
# Step 0.5: Generate AI insights (NEW)
saga.add_step(
name="generate_ai_insights",
action=self._generate_ai_insights,
compensation=None, # No compensation needed for read-only insight generation
action_args=(tenant_id, context)
)
# Step 1: Generate forecasts
saga.add_step(
name="generate_forecasts",
@@ -140,6 +181,14 @@ class OrchestrationSaga:
action_args=(tenant_id, context)
)
# Step 5: Validate previous day's forecasts
saga.add_step(
name="validate_previous_forecasts",
action=self._validate_previous_forecasts,
compensation=None, # No compensation needed for validation
action_args=(tenant_id, context)
)
# Execute saga
success, final_result, error = await saga.execute()
@@ -233,24 +282,249 @@ class OrchestrationSaga:
'count': len(recipes_data) if recipes_data else 0
}
# NEW: Fetch upcoming events for next 7 days
try:
from datetime import timedelta
# Note: Implement when event calendar service is ready
# For now, initialize as empty
context['event_calendar'] = []
logger.info("Event calendar: not yet implemented, using empty list")
except Exception as e:
logger.warning(f"Could not fetch events: {e}")
context['event_calendar'] = []
# NEW: Placeholder for traffic predictions (Phase 5)
try:
# Note: Implement traffic forecasting in Phase 5
# For now, initialize as empty DataFrame
import pandas as pd
context['traffic_predictions'] = pd.DataFrame()
logger.info("Traffic predictions: not yet implemented, using empty DataFrame")
except Exception as e:
logger.warning(f"Could not fetch traffic predictions: {e}")
import pandas as pd
context['traffic_predictions'] = pd.DataFrame()
logger.info(
f"Shared data snapshot fetched successfully: "
f"{len(inventory_data)} ingredients, "
f"{len(suppliers_data)} suppliers, "
f"{len(recipes_data)} recipes"
f"{len(recipes_data)} recipes, "
f"{len(context.get('event_calendar', []))} events"
)
return {
'success': True,
'inventory_count': len(inventory_data) if inventory_data else 0,
'suppliers_count': len(suppliers_data) if suppliers_data else 0,
'recipes_count': len(recipes_data) if recipes_data else 0
'recipes_count': len(recipes_data) if recipes_data else 0,
'events_count': len(context.get('event_calendar', []))
}
except Exception as e:
logger.error(f"Failed to fetch shared data snapshot for tenant {tenant_id}: {e}")
raise
# ========================================================================
# Step 0.5: Generate AI Insights (NEW)
# ========================================================================
async def _generate_ai_insights(
self,
tenant_id: str,
context: Dict[str, Any]
) -> Dict[str, Any]:
"""
Generate AI insights using HTTP calls to ML insights endpoints.
This step runs multiple ML insight generators in parallel via HTTP:
- Dynamic forecasting rules learning (forecasting service)
- Safety stock optimization (inventory service)
- Production yield predictions (production service)
- Supplier performance analysis (procurement service)
- Price forecasting (procurement service)
All insights are posted to the AI Insights Service by the respective services
and can be consumed by downstream orchestration steps.
Args:
tenant_id: Tenant ID
context: Execution context with cached data snapshots
Returns:
Dictionary with insights generation results
"""
logger.info(f"Generating AI insights for tenant {tenant_id} via HTTP endpoints")
insights_results = {
'total_insights_generated': 0,
'total_insights_posted': 0,
'insights_by_source': {},
'errors': []
}
try:
# Prepare async tasks for parallel HTTP calls
ml_tasks = []
# Task 1: Safety Stock Optimization (inventory service)
async def trigger_safety_stock_optimization():
try:
result = await self.inventory_client.trigger_safety_stock_optimization(
tenant_id=tenant_id,
product_ids=None, # Analyze all products
lookback_days=90,
min_history_days=30
)
if result and result.get('success'):
return ('safety_stock', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'products_optimized': result.get('products_optimized', 0)
})
else:
return ('safety_stock', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Safety stock optimization failed: {e}")
return ('safety_stock', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_safety_stock_optimization())
# Task 2: Production Yield Analysis (production service)
async def trigger_yield_prediction():
try:
result = await self.production_client.trigger_yield_prediction(
tenant_id=tenant_id,
recipe_ids=None, # Analyze all recipes
lookback_days=90,
min_history_runs=30
)
if result and result.get('success'):
return ('yield_analysis', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'recipes_analyzed': result.get('recipes_analyzed', 0)
})
else:
return ('yield_analysis', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Yield prediction failed: {e}")
return ('yield_analysis', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_yield_prediction())
# Task 3: Supplier Performance Analysis (procurement service)
async def trigger_supplier_analysis():
try:
result = await self.procurement_client.trigger_supplier_analysis(
tenant_id=tenant_id,
supplier_ids=None, # Analyze all suppliers
lookback_days=180,
min_orders=10
)
if result and result.get('success'):
return ('supplier_analysis', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'suppliers_analyzed': result.get('suppliers_analyzed', 0)
})
else:
return ('supplier_analysis', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Supplier analysis failed: {e}")
return ('supplier_analysis', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_supplier_analysis())
# Task 4: Price Forecasting (procurement service)
async def trigger_price_forecasting():
try:
result = await self.procurement_client.trigger_price_forecasting(
tenant_id=tenant_id,
ingredient_ids=None, # Forecast all ingredients
lookback_days=180,
forecast_horizon_days=30
)
if result and result.get('success'):
return ('price_forecast', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'ingredients_forecasted': result.get('ingredients_forecasted', 0),
'buy_now_recommendations': result.get('buy_now_recommendations', 0)
})
else:
return ('price_forecast', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Price forecasting failed: {e}")
return ('price_forecast', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_price_forecasting())
# Task 5: Dynamic Rules Learning (forecasting service)
async def trigger_rules_generation():
try:
result = await self.forecast_client.trigger_rules_generation(
tenant_id=tenant_id,
product_ids=None, # Analyze all products
lookback_days=90,
min_samples=10
)
if result and result.get('success'):
return ('rules_learning', {
'insights_posted': result.get('total_insights_posted', 0),
'insights_generated': result.get('total_insights_generated', 0),
'products_analyzed': result.get('products_analyzed', 0)
})
else:
return ('rules_learning', {'error': result.get('message', 'Unknown error') if result else 'Service returned None', 'insights_posted': 0})
except Exception as e:
logger.error(f"Rules generation failed: {e}")
return ('rules_learning', {'error': str(e), 'insights_posted': 0})
ml_tasks.append(trigger_rules_generation())
# Run all ML insight generation tasks in parallel
logger.info(f"Triggering {len(ml_tasks)} ML insight endpoints in parallel")
results = await asyncio.gather(*ml_tasks, return_exceptions=True)
# Process results
for result in results:
if isinstance(result, Exception):
logger.error(f"ML insight task failed with exception: {result}")
insights_results['errors'].append(str(result))
elif isinstance(result, tuple) and len(result) == 2:
source, data = result
if 'error' in data:
insights_results['errors'].append(f"{source}: {data['error']}")
else:
posted = data.get('insights_posted', 0)
generated = data.get('insights_generated', posted)
insights_results['total_insights_posted'] += posted
insights_results['total_insights_generated'] += generated
insights_results['insights_by_source'][source] = posted
logger.info(f"{source}: {posted} insights posted")
# Store insights count in context
context['ai_insights_generated'] = insights_results['total_insights_generated']
context['ai_insights_posted'] = insights_results['total_insights_posted']
logger.info(
f"AI insights generation complete: "
f"{insights_results['total_insights_posted']} insights posted from "
f"{len(insights_results['insights_by_source'])} sources"
)
return insights_results
except Exception as e:
logger.error(f"Failed to generate AI insights for tenant {tenant_id}: {e}", exc_info=True)
# Don't fail the orchestration if insights generation fails
# Log error and continue
insights_results['errors'].append(str(e))
context['ai_insights_generated'] = 0
context['ai_insights_posted'] = 0
return insights_results
# ========================================================================
# Step 1: Generate Forecasts
# ========================================================================
@@ -276,6 +550,10 @@ class OrchestrationSaga:
# Call forecast service
result = await self.forecast_client.generate_forecasts(tenant_id)
if not result:
logger.error(f"Forecast service returned None for tenant {tenant_id}")
raise Exception("Forecast service returned None")
# Store forecast ID in context
forecast_id = result.get('forecast_id') or result.get('id')
context['forecast_id'] = forecast_id
@@ -349,6 +627,10 @@ class OrchestrationSaga:
recipes_data=recipes_snapshot # NEW: Pass cached recipes
)
if not result:
logger.error(f"Production service returned None for tenant {tenant_id}")
raise Exception("Production service returned None")
# Store schedule ID in context
schedule_id = result.get('schedule_id') or result.get('id')
context['production_schedule_id'] = schedule_id
@@ -435,6 +717,10 @@ class OrchestrationSaga:
recipes_data=recipes_snapshot # NEW: Pass cached recipes
)
if not result:
logger.error(f"Procurement service returned None for tenant {tenant_id}")
raise Exception("Procurement service returned None")
# Store plan ID in context
plan_id = result.get('plan_id') or result.get('id')
context['procurement_plan_id'] = plan_id
@@ -523,12 +809,16 @@ class OrchestrationSaga:
notification_data=notification_data
)
notifications_sent = result.get('notifications_sent', 0)
context['notifications_sent'] = notifications_sent
if result:
notifications_sent = result.get('notifications_sent', 0)
context['notifications_sent'] = notifications_sent
logger.info(f"Notifications sent successfully: {notifications_sent}")
logger.info(f"Notifications sent successfully: {notifications_sent}")
return result
return result
else:
logger.warning(f"Notification service returned None for tenant {tenant_id}")
return {'notifications_sent': 0, 'error': 'Notification service returned None'}
except Exception as e:
# Log error but don't fail the saga for notification failures
@@ -536,6 +826,140 @@ class OrchestrationSaga:
# Return empty result instead of raising
return {'notifications_sent': 0, 'error': str(e)}
# ========================================================================
# Step 5: Validate Previous Day's Forecasts
# ========================================================================
async def _validate_previous_forecasts(
self,
tenant_id: str,
context: Dict[str, Any]
) -> Dict[str, Any]:
"""
Validate yesterday's forecasts against actual sales.
Calculate accuracy metrics (MAPE, RMSE, MAE) and trigger retraining if needed.
Args:
tenant_id: Tenant ID
context: Execution context
Returns:
Validation result with metrics
"""
from datetime import date, timedelta
logger.info(f"Validating previous day's forecasts for tenant {tenant_id}")
try:
yesterday = date.today() - timedelta(days=1)
# Call forecasting service validation endpoint
validation_result = await self.forecast_client.validate_forecasts(
tenant_id=tenant_id,
date=yesterday
)
if not validation_result:
logger.warning(f"No validation results returned for tenant {tenant_id}")
return {'validated': False, 'reason': 'no_data'}
# Extract metrics
overall_mape = validation_result.get('overall_mape', 0)
overall_rmse = validation_result.get('overall_rmse', 0)
overall_mae = validation_result.get('overall_mae', 0)
products_validated = validation_result.get('products_validated', 0)
poor_accuracy_products = validation_result.get('poor_accuracy_products', [])
context['validation_metrics'] = {
'mape': overall_mape,
'rmse': overall_rmse,
'mae': overall_mae,
'products_validated': products_validated,
'validation_date': yesterday.isoformat()
}
logger.info(
f"Validation complete for tenant {tenant_id}: "
f"MAPE={overall_mape:.2f}%, RMSE={overall_rmse:.2f}, MAE={overall_mae:.2f}, "
f"Products={products_validated}"
)
# Post accuracy insights to AI Insights Service
try:
from uuid import UUID
from datetime import datetime
await self.ai_insights_client.post_accuracy_metrics(
tenant_id=UUID(tenant_id),
validation_date=datetime.combine(yesterday, datetime.min.time()),
metrics={
'overall_mape': overall_mape,
'overall_rmse': overall_rmse,
'overall_mae': overall_mae,
'products_validated': products_validated,
'poor_accuracy_products': poor_accuracy_products
}
)
logger.info(f"Posted accuracy metrics to AI Insights Service")
except Exception as e:
logger.warning(f"Could not post accuracy metrics to AI Insights: {e}")
# Trigger retraining for products with poor accuracy
if poor_accuracy_products and len(poor_accuracy_products) > 0:
logger.warning(
f"Found {len(poor_accuracy_products)} products with MAPE > 30%, "
f"triggering retraining"
)
retraining_triggered = 0
for product_data in poor_accuracy_products:
product_id = product_data.get('product_id')
product_mape = product_data.get('mape', 0)
if not product_id:
continue
try:
await self.training_client.trigger_retrain(
tenant_id=tenant_id,
inventory_product_id=product_id,
reason='accuracy_degradation',
metadata={
'previous_mape': product_mape,
'validation_date': yesterday.isoformat(),
'triggered_by': 'orchestration_validation'
}
)
retraining_triggered += 1
logger.info(
f"Triggered retraining for product {product_id} "
f"(MAPE={product_mape:.2f}%)"
)
except Exception as e:
logger.error(
f"Failed to trigger retraining for product {product_id}: {e}"
)
context['retraining_triggered'] = retraining_triggered
logger.info(f"Triggered retraining for {retraining_triggered} products")
else:
logger.info("All products have acceptable accuracy (MAPE <= 30%)")
context['retraining_triggered'] = 0
return {
'validated': True,
'metrics': context['validation_metrics'],
'retraining_triggered': context.get('retraining_triggered', 0)
}
except Exception as e:
# Don't fail the saga if validation fails
logger.warning(f"Forecast validation failed for tenant {tenant_id}: {e}")
return {
'validated': False,
'error': str(e),
'retraining_triggered': 0
}
# ========================================================================
# Utility Methods
# ========================================================================

View File

@@ -26,7 +26,11 @@ from shared.clients.forecast_client import ForecastServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.clients.notification_client import NotificationServiceClient
from shared.utils.tenant_settings_client import TenantSettingsClient
from shared.clients.tenant_client import TenantServiceClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.clients.training_client import TrainingServiceClient
from shared.utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpenError
from app.core.config import settings
from app.repositories.orchestration_run_repository import OrchestrationRunRepository
@@ -46,11 +50,16 @@ class OrchestratorSchedulerService(BaseAlertService):
super().__init__(config)
# Service clients
self.forecast_client = ForecastServiceClient(config)
self.production_client = ProductionServiceClient(config)
self.procurement_client = ProcurementServiceClient(config)
self.notification_client = NotificationServiceClient(config)
self.tenant_settings_client = TenantSettingsClient(tenant_service_url=config.TENANT_SERVICE_URL)
self.forecast_client = ForecastServiceClient(config, "orchestrator-service")
self.production_client = ProductionServiceClient(config, "orchestrator-service")
self.procurement_client = ProcurementServiceClient(config, "orchestrator-service")
self.notification_client = NotificationServiceClient(config, "orchestrator-service")
self.tenant_client = TenantServiceClient(config)
self.training_client = TrainingServiceClient(config, "orchestrator-service")
# Clients for centralized data fetching
self.inventory_client = InventoryServiceClient(config, "orchestrator-service")
self.suppliers_client = SuppliersServiceClient(config, "orchestrator-service")
self.recipes_client = RecipesServiceClient(config, "orchestrator-service")
# Circuit breakers for each service
self.forecast_breaker = CircuitBreaker(
@@ -183,11 +192,19 @@ class OrchestratorSchedulerService(BaseAlertService):
# Set timeout for entire tenant orchestration
async with asyncio.timeout(settings.TENANT_TIMEOUT_SECONDS):
# Execute orchestration using Saga pattern
# AI enhancement is enabled via ORCHESTRATION_USE_AI_INSIGHTS config
saga = OrchestrationSaga(
forecast_client=self.forecast_client,
production_client=self.production_client,
procurement_client=self.procurement_client,
notification_client=self.notification_client
notification_client=self.notification_client,
inventory_client=self.inventory_client,
suppliers_client=self.suppliers_client,
recipes_client=self.recipes_client,
training_client=self.training_client,
use_ai_enhancement=settings.ORCHESTRATION_USE_AI_INSIGHTS,
ai_insights_base_url=settings.AI_INSIGHTS_SERVICE_URL,
ai_insights_min_confidence=settings.AI_INSIGHTS_MIN_CONFIDENCE
)
result = await saga.execute_orchestration(
@@ -238,7 +255,7 @@ class OrchestratorSchedulerService(BaseAlertService):
# Call Tenant Service with circuit breaker
tenants_data = await self.tenant_breaker.call(
self.tenant_settings_client.get_active_tenants
self.tenant_client.get_active_tenants
)
if not tenants_data:

View File

@@ -1,392 +0,0 @@
"""
Orchestrator Scheduler Service - REFACTORED
Coordinates daily auto-generation workflow: Forecasting → Production → Procurement → Notifications
CHANGES FROM ORIGINAL:
- Removed all TODO/stub code
- Integrated OrchestrationSaga for error handling and compensation
- Added circuit breakers for all service calls
- Implemented real Forecasting Service integration
- Implemented real Production Service integration
- Implemented real Tenant Service integration
- Implemented real Notification Service integration
- NO backwards compatibility, NO feature flags - complete rewrite
"""
import asyncio
import uuid
from datetime import datetime, date, timezone
from decimal import Decimal
from typing import List, Dict, Any, Optional
import structlog
from apscheduler.triggers.cron import CronTrigger
from shared.alerts.base_service import BaseAlertService
from shared.clients.forecast_client import ForecastServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.clients.notification_client import NotificationServiceClient
from shared.clients.tenant_settings_client import TenantSettingsClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpenError
from app.core.config import settings
from app.repositories.orchestration_run_repository import OrchestrationRunRepository
from app.models.orchestration_run import OrchestrationStatus
from app.services.orchestration_saga import OrchestrationSaga
logger = structlog.get_logger()
class OrchestratorSchedulerService(BaseAlertService):
"""
Orchestrator Service extending BaseAlertService
Handles automated daily orchestration of forecasting, production, and procurement
"""
def __init__(self, config):
    """
    Initialize the scheduler service.

    Wires up one HTTP client per downstream service plus a circuit
    breaker per service so repeated failures short-circuit quickly.

    Args:
        config: Shared service configuration object, passed through to
            BaseAlertService and to every service client.
    """
    super().__init__(config)
    # Service clients
    self.forecast_client = ForecastServiceClient(config)
    self.production_client = ProductionServiceClient(config)
    self.procurement_client = ProcurementServiceClient(config)
    self.notification_client = NotificationServiceClient(config)
    self.tenant_settings_client = TenantSettingsClient(config)
    # NEW: Clients for centralized data fetching
    self.inventory_client = InventoryServiceClient(config)
    self.suppliers_client = SuppliersServiceClient(config)
    self.recipes_client = RecipesServiceClient(config)
    # Circuit breakers for each service: open after `failure_threshold`
    # consecutive failures, probe again after `timeout_duration` seconds,
    # close after `success_threshold` consecutive successes.
    self.forecast_breaker = CircuitBreaker(
        failure_threshold=5,
        timeout_duration=60,
        success_threshold=2
    )
    self.production_breaker = CircuitBreaker(
        failure_threshold=5,
        timeout_duration=60,
        success_threshold=2
    )
    self.procurement_breaker = CircuitBreaker(
        failure_threshold=5,
        timeout_duration=60,
        success_threshold=2
    )
    # Tenant lookups trip faster and recover sooner than the heavy services
    self.tenant_breaker = CircuitBreaker(
        failure_threshold=3,
        timeout_duration=30,
        success_threshold=2
    )
def setup_scheduled_checks(self):
    """
    Register the daily orchestration cron job.

    The schedule comes from ORCHESTRATION_SCHEDULE (standard 5-field
    cron syntax); a malformed value falls back to 5:30 AM daily.
    """
    fields = settings.ORCHESTRATION_SCHEDULE.split()
    if len(fields) != 5:
        # Malformed cron expression -> use the default schedule
        fields = ["30", "5", "*", "*", "*"]
    minute, hour, day, month, day_of_week = fields

    trigger = CronTrigger(
        minute=minute,
        hour=hour,
        day=day,
        month=month,
        day_of_week=day_of_week
    )
    self.scheduler.add_job(
        func=self.run_daily_orchestration,
        trigger=trigger,
        id="daily_orchestration",
        name="Daily Orchestration (Forecasting → Production → Procurement)",
        misfire_grace_time=300,  # tolerate up to 5 minutes of scheduler delay
        max_instances=1  # never overlap two orchestration runs
    )
    logger.info("Orchestrator scheduler configured",
                schedule=settings.ORCHESTRATION_SCHEDULE)
async def run_daily_orchestration(self):
    """
    Run the daily orchestration workflow for every active tenant.

    Only executes on the leader instance and when orchestration is
    enabled. Tenants are processed concurrently, bounded by
    MAX_CONCURRENT_TENANTS; individual tenant failures are counted
    but never abort the batch.
    """
    if not self.is_leader:
        logger.debug("Not leader, skipping orchestration")
        return
    if not settings.ORCHESTRATION_ENABLED:
        logger.info("Orchestration disabled via config")
        return

    logger.info("Starting daily orchestration workflow")
    try:
        active_tenants = await self._get_active_tenants()
        if not active_tenants:
            logger.warning("No active tenants found for orchestration")
            return
        logger.info("Processing tenants",
                    total_tenants=len(active_tenants))

        # Bound tenant-level parallelism
        semaphore = asyncio.Semaphore(settings.MAX_CONCURRENT_TENANTS)

        async def bounded_run(tenant_id):
            async with semaphore:
                return await self._orchestrate_tenant(tenant_id)

        outcomes = await asyncio.gather(
            *(bounded_run(tenant_id) for tenant_id in active_tenants),
            return_exceptions=True
        )

        # Summarize: an exception or falsy result counts as a failure
        ok = sum(1 for r in outcomes if r and not isinstance(r, Exception))
        logger.info("Daily orchestration completed",
                    total_tenants=len(active_tenants),
                    successful=ok,
                    failed=len(outcomes) - ok)
    except Exception as e:
        logger.error("Error in daily orchestration",
                     error=str(e), exc_info=True)
async def _orchestrate_tenant(self, tenant_id: uuid.UUID) -> bool:
    """
    Orchestrate workflow for a single tenant using Saga pattern.

    Creates a persistent orchestration-run record first, then executes
    the saga under a hard timeout; the run record is updated with the
    outcome (completed or failed) on every path.

    Returns True if successful, False otherwise
    """
    logger.info("Starting orchestration for tenant", tenant_id=str(tenant_id))
    # Create orchestration run record up front so the run is visible
    # (status=running) even if the saga crashes later.
    async with self.db_manager.get_session() as session:
        repo = OrchestrationRunRepository(session)
        run_number = await repo.generate_run_number()
        run = await repo.create_run({
            'run_number': run_number,
            'tenant_id': tenant_id,
            'status': OrchestrationStatus.running,
            'run_type': 'scheduled',
            'started_at': datetime.now(timezone.utc),
            'triggered_by': 'scheduler'
        })
        await session.commit()
        # Capture the ID while the session is still open
        run_id = run.id
    try:
        # Set timeout for entire tenant orchestration
        async with asyncio.timeout(settings.TENANT_TIMEOUT_SECONDS):
            # Execute orchestration using Saga pattern
            saga = OrchestrationSaga(
                forecast_client=self.forecast_client,
                production_client=self.production_client,
                procurement_client=self.procurement_client,
                notification_client=self.notification_client,
                inventory_client=self.inventory_client,  # NEW
                suppliers_client=self.suppliers_client,  # NEW
                recipes_client=self.recipes_client  # NEW
            )
            result = await saga.execute_orchestration(
                tenant_id=str(tenant_id),
                orchestration_run_id=str(run_id)
            )
            if result['success']:
                # Update orchestration run with saga results
                await self._complete_orchestration_run_with_saga(
                    run_id,
                    result
                )
                logger.info("Tenant orchestration completed successfully",
                            tenant_id=str(tenant_id), run_id=str(run_id))
                return True
            else:
                # Saga failed (with compensation already applied by the saga)
                await self._mark_orchestration_failed(
                    run_id,
                    result.get('error', 'Saga execution failed')
                )
                return False
    except asyncio.TimeoutError:
        logger.error("Tenant orchestration timeout",
                     tenant_id=str(tenant_id),
                     timeout_seconds=settings.TENANT_TIMEOUT_SECONDS)
        await self._mark_orchestration_failed(run_id, "Timeout exceeded")
        return False
    except Exception as e:
        logger.error("Tenant orchestration failed",
                     tenant_id=str(tenant_id),
                     error=str(e), exc_info=True)
        await self._mark_orchestration_failed(run_id, str(e))
        return False
async def _get_active_tenants(self) -> List[uuid.UUID]:
"""
Get list of active tenants for orchestration
REAL IMPLEMENTATION (no stubs)
"""
try:
logger.info("Fetching active tenants from Tenant Service")
# Call Tenant Service with circuit breaker
tenants_data = await self.tenant_breaker.call(
self.tenant_settings_client.get_active_tenants
)
if not tenants_data:
logger.warning("Tenant Service returned no active tenants")
return []
# Extract tenant IDs
tenant_ids = []
for tenant in tenants_data:
tenant_id = tenant.get('id') or tenant.get('tenant_id')
if tenant_id:
# Convert string to UUID if needed
if isinstance(tenant_id, str):
tenant_id = uuid.UUID(tenant_id)
tenant_ids.append(tenant_id)
logger.info(f"Found {len(tenant_ids)} active tenants for orchestration")
return tenant_ids
except CircuitBreakerOpenError:
logger.error("Circuit breaker open for Tenant Service, skipping orchestration")
return []
except Exception as e:
logger.error("Error getting active tenants", error=str(e), exc_info=True)
return []
async def _complete_orchestration_run_with_saga(
    self,
    run_id: uuid.UUID,
    saga_result: Dict[str, Any]
):
    """
    Complete orchestration run with saga results.

    Persists the entity IDs produced by each saga step, per-step status
    timestamps, and the saga step counters. A missing run record is
    silently ignored.

    Args:
        run_id: Orchestration run ID
        saga_result: Result from saga execution
    """
    async with self.db_manager.get_session() as session:
        repo = OrchestrationRunRepository(session)
        run = await repo.get_run_by_id(run_id)
        if run:
            started_at = run.started_at
            completed_at = datetime.now(timezone.utc)
            duration = (completed_at - started_at).total_seconds()
            # Extract results from saga
            forecast_id = saga_result.get('forecast_id')
            production_schedule_id = saga_result.get('production_schedule_id')
            procurement_plan_id = saga_result.get('procurement_plan_id')
            notifications_sent = saga_result.get('notifications_sent', 0)
            # Get saga summary
            saga_summary = saga_result.get('saga_summary', {})
            total_steps = saga_summary.get('total_steps', 0)
            completed_steps = saga_summary.get('completed_steps', 0)
            # All per-step statuses are 'success' here because the saga
            # only reports success when every step completed.
            await repo.update_run(run_id, {
                'status': OrchestrationStatus.completed,
                'completed_at': completed_at,
                'duration_seconds': int(duration),
                'forecast_id': forecast_id,
                'forecasting_status': 'success',
                'forecasting_completed_at': completed_at,
                'forecasts_generated': 1,  # Placeholder
                'production_schedule_id': production_schedule_id,
                'production_status': 'success',
                'production_completed_at': completed_at,
                'production_batches_created': 0,  # Placeholder
                'procurement_plan_id': procurement_plan_id,
                'procurement_status': 'success',
                'procurement_completed_at': completed_at,
                'procurement_plans_created': 1,
                'purchase_orders_created': 0,  # Placeholder
                'notification_status': 'success',
                'notification_completed_at': completed_at,
                'notifications_sent': notifications_sent,
                'saga_steps_total': total_steps,
                'saga_steps_completed': completed_steps
            })
            await session.commit()
async def _mark_orchestration_failed(self, run_id: uuid.UUID, error_message: str):
    """Persist a failed status (with duration and error) on an orchestration run."""
    async with self.db_manager.get_session() as session:
        repo = OrchestrationRunRepository(session)
        run = await repo.get_run_by_id(run_id)
        if run:
            finished_at = datetime.now(timezone.utc)
            elapsed = (finished_at - run.started_at).total_seconds()
            await repo.update_run(run_id, {
                'status': OrchestrationStatus.failed,
                'completed_at': finished_at,
                'duration_seconds': int(elapsed),
                'error_message': error_message
            })
            await session.commit()
# Manual trigger for testing
async def trigger_orchestration_for_tenant(
    self,
    tenant_id: uuid.UUID,
    test_scenario: Optional[str] = None
) -> Dict[str, Any]:
    """
    Manually trigger orchestration for a tenant (for testing).

    Args:
        tenant_id: Tenant ID to orchestrate
        test_scenario: Optional test scenario (full, production_only,
            procurement_only); echoed back in the result

    Returns:
        Dict with orchestration results
    """
    logger.info("Manual orchestration trigger",
                tenant_id=str(tenant_id),
                test_scenario=test_scenario)
    succeeded = await self._orchestrate_tenant(tenant_id)
    outcome = 'Orchestration completed' if succeeded else 'Orchestration failed'
    return {
        'success': succeeded,
        'tenant_id': str(tenant_id),
        'test_scenario': test_scenario,
        'message': outcome
    }
def get_circuit_breaker_stats(self) -> Dict[str, Any]:
    """Snapshot circuit-breaker state for every downstream service (monitoring)."""
    breakers = {
        'forecast_service': self.forecast_breaker,
        'production_service': self.production_breaker,
        'procurement_service': self.procurement_breaker,
        'tenant_service': self.tenant_breaker,
    }
    return {name: breaker.get_stats() for name, breaker in breakers.items()}