Initial commit - production deployment

2026-01-21 17:17:16 +01:00
commit c23d00dd92
2289 changed files with 638440 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
from .orchestration import router as orchestration_router
from .internal_demo import router as internal_demo_router
__all__ = ["orchestration_router", "internal_demo_router"]

View File

@@ -0,0 +1,177 @@
"""
Internal API for Alert Intelligence Service
Provides orchestrator context for alert enrichment
"""
from fastapi import APIRouter, Header, HTTPException, Query
from typing import Optional, List, Dict, Any
from datetime import datetime, timedelta
from uuid import UUID
from pydantic import BaseModel
router = APIRouter(prefix="/api/internal", tags=["internal"])
class OrchestrationAction(BaseModel):
"""Recent orchestration action"""
id: str
type: str # purchase_order, production_batch
status: str # created, pending_approval, approved, completed
delivery_date: Optional[datetime]
reasoning: Optional[Dict[str, Any]]
estimated_resolution: Optional[datetime]
created_at: datetime
class RecentActionsResponse(BaseModel):
"""Response with recent orchestrator actions"""
actions: List[OrchestrationAction]
count: int
@router.get("/recent-actions", response_model=RecentActionsResponse)
async def get_recent_actions(
tenant_id: str = Query(..., description="Tenant ID"),
ingredient_id: Optional[str] = Query(None, description="Filter by ingredient"),
product_id: Optional[str] = Query(None, description="Filter by product"),
hours_ago: int = Query(24, description="Look back hours"),
):
"""
Get recent orchestrator actions for alert context enrichment.
Only accessible by internal services (alert-intelligence).
Returns orchestration runs with details about POs created, batches adjusted, etc.
This helps the alert system determine whether the AI has already addressed similar issues.
"""
from shared.database.base import create_database_manager
from ..core.config import get_settings
from ..models.orchestration_run import OrchestrationRun, OrchestrationStatus
from sqlalchemy import select, and_, desc
import structlog
logger = structlog.get_logger()
try:
settings = get_settings()
db_manager = create_database_manager(settings.DATABASE_URL, "orchestrator")
async with db_manager.get_session() as session:
cutoff_time = datetime.utcnow() - timedelta(hours=hours_ago)
# Query recent orchestration runs
query = select(OrchestrationRun).where(
and_(
OrchestrationRun.tenant_id == UUID(tenant_id),
OrchestrationRun.created_at >= cutoff_time,
OrchestrationRun.status.in_([
OrchestrationStatus.completed,
OrchestrationStatus.partial_success
])
)
).order_by(desc(OrchestrationRun.created_at))
result = await session.execute(query)
runs = result.scalars().all()
actions = []
for run in runs:
run_metadata = run.run_metadata or {}
# Add purchase order actions
if run.purchase_orders_created > 0:
po_details = run_metadata.get('purchase_orders', [])
# If metadata has PO details, use them
if po_details:
for po in po_details:
# Filter by ingredient if specified
if ingredient_id:
po_items = po.get('items', [])
has_ingredient = any(
item.get('ingredient_id') == ingredient_id
for item in po_items
)
if not has_ingredient:
continue
actions.append(OrchestrationAction(
id=po.get('id', str(run.id)),
type="purchase_order",
status=po.get('status', 'created'),
delivery_date=po.get('delivery_date'),
reasoning=run_metadata.get('reasoning'),
estimated_resolution=po.get('delivery_date'),
created_at=run.created_at
))
else:
# Fallback: create generic action from run
actions.append(OrchestrationAction(
id=str(run.id),
type="purchase_order",
status="created",
delivery_date=None,
reasoning=run_metadata.get('reasoning'),
estimated_resolution=None,
created_at=run.created_at
))
# Add production batch actions
if run.production_batches_created > 0:
batch_details = run_metadata.get('production_batches', [])
if batch_details:
for batch in batch_details:
# Filter by product if specified
if product_id and batch.get('product_id') != product_id:
continue
actions.append(OrchestrationAction(
id=batch.get('id', str(run.id)),
type="production_batch",
status=batch.get('status', 'created'),
delivery_date=None,
reasoning=run_metadata.get('reasoning'),
estimated_resolution=batch.get('scheduled_date'),
created_at=run.created_at
))
else:
# Fallback: create generic action from run
if not product_id: # Only add if no product filter
actions.append(OrchestrationAction(
id=str(run.id),
type="production_batch",
status="created",
delivery_date=None,
reasoning=run_metadata.get('reasoning'),
estimated_resolution=None,
created_at=run.created_at
))
logger.info(
"recent_actions_fetched",
tenant_id=tenant_id,
hours_ago=hours_ago,
action_count=len(actions),
ingredient_id=ingredient_id,
product_id=product_id
)
return RecentActionsResponse(
actions=actions,
count=len(actions)
)
except Exception as e:
logger.error("error_fetching_recent_actions", error=str(e), tenant_id=tenant_id)
raise HTTPException(
status_code=500,
detail=f"Failed to fetch recent actions: {str(e)}"
)
@router.get("/health")
async def internal_health():
"""Internal health check"""
return {"status": "healthy", "api": "internal"}

View File

@@ -0,0 +1,277 @@
"""
Internal Demo API Endpoints for Orchestrator Service
Used by demo_session service to clone data for virtual demo tenants
"""
from fastapi import APIRouter, Depends, HTTPException, Header
from typing import Dict, Any
from uuid import UUID
import structlog
import os
import json
from app.core.database import get_db
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, delete, func
from app.models.orchestration_run import OrchestrationRun, OrchestrationStatus
import uuid
from datetime import datetime, timezone, timedelta
from typing import Optional
import sys
from pathlib import Path
# Add shared utilities to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from shared.utils.demo_dates import adjust_date_for_demo
from app.core.config import settings
router = APIRouter(prefix="/internal/demo", tags=["internal"])
logger = structlog.get_logger()
async def ensure_unique_run_number(db: AsyncSession, base_run_number: str) -> str:
"""Ensure the run number is unique by appending a suffix if needed"""
proposed_run_number = base_run_number
# Check if the proposed run number already exists in the database
while True:
result = await db.execute(
select(OrchestrationRun)
.where(OrchestrationRun.run_number == proposed_run_number)
)
existing_run = result.scalar_one_or_none()
if not existing_run:
# Run number is unique, return it
return proposed_run_number
# Generate a new run number with an additional random suffix
random_suffix = str(uuid.uuid4())[:4].upper()
proposed_run_number = f"{base_run_number[:50-len(random_suffix)-1]}-{random_suffix}"
async def load_fixture_data_for_tenant(
db: AsyncSession,
tenant_uuid: UUID,
demo_account_type: str,
reference_time: datetime,
base_tenant_id: Optional[str] = None
) -> int:
"""
Load orchestration run data from JSON fixture directly into the virtual tenant.
Returns the number of runs created.
"""
from shared.utils.seed_data_paths import get_seed_data_path
from shared.utils.demo_dates import resolve_time_marker, adjust_date_for_demo
# Load fixture data
if demo_account_type == "enterprise_child" and base_tenant_id:
json_file = get_seed_data_path("enterprise", "11-orchestrator.json", child_id=base_tenant_id)
else:
json_file = get_seed_data_path(demo_account_type, "11-orchestrator.json")
with open(json_file, 'r', encoding='utf-8') as f:
fixture_data = json.load(f)
orchestration_run_data = fixture_data.get("orchestration_run")
if not orchestration_run_data:
logger.warning("No orchestration_run data in fixture")
return 0
# Parse and adjust dates from fixture to reference_time
base_started_at = resolve_time_marker(orchestration_run_data.get("started_at"), reference_time)
base_completed_at = resolve_time_marker(orchestration_run_data.get("completed_at"), reference_time)
# Adjust dates to make them appear recent relative to session creation
started_at = adjust_date_for_demo(base_started_at, reference_time) if base_started_at else reference_time - timedelta(hours=2)
completed_at = adjust_date_for_demo(base_completed_at, reference_time) if base_completed_at else started_at + timedelta(minutes=15)
# Generate unique run number with session context
current_year = reference_time.year
unique_suffix = str(uuid.uuid4())[:8].upper()
run_number = f"ORCH-DEMO-PROF-{current_year}-001-{unique_suffix}"
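    # e.g. "ORCH-DEMO-PROF-2026-001-3A7B91C4" (illustrative suffix)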
# Create orchestration run for virtual tenant
new_run = OrchestrationRun(
id=uuid.uuid4(), # Generate new UUID
tenant_id=tenant_uuid,
run_number=run_number,
status=OrchestrationStatus[orchestration_run_data["status"]],
run_type=orchestration_run_data.get("run_type", "daily"),
priority="normal",
started_at=started_at,
completed_at=completed_at,
duration_seconds=orchestration_run_data.get("duration_seconds", 900),
# Step statuses from orchestration_results
forecasting_status="success",
forecasting_started_at=started_at,
forecasting_completed_at=started_at + timedelta(minutes=2),
production_status="success",
production_started_at=started_at + timedelta(minutes=2),
production_completed_at=started_at + timedelta(minutes=5),
procurement_status="success",
procurement_started_at=started_at + timedelta(minutes=5),
procurement_completed_at=started_at + timedelta(minutes=8),
notification_status="success",
notification_started_at=started_at + timedelta(minutes=8),
notification_completed_at=completed_at,
# Results from orchestration_results
forecasts_generated=fixture_data.get("orchestration_results", {}).get("forecasts_generated", 10),
production_batches_created=fixture_data.get("orchestration_results", {}).get("production_batches_created", 18),
procurement_plans_created=0,
purchase_orders_created=fixture_data.get("orchestration_results", {}).get("purchase_orders_created", 6),
notifications_sent=fixture_data.get("orchestration_results", {}).get("notifications_sent", 8),
# Metadata
triggered_by="system",
created_at=started_at,
updated_at=completed_at
)
db.add(new_run)
await db.flush()
logger.info(
"Loaded orchestration run from fixture",
tenant_id=str(tenant_uuid),
run_number=new_run.run_number,
started_at=started_at.isoformat()
)
return 1
@router.post("/clone")
async def clone_demo_data(
base_tenant_id: str,
virtual_tenant_id: str,
demo_account_type: str,
session_id: Optional[str] = None,
session_created_at: Optional[str] = None,
db: AsyncSession = Depends(get_db)
):
"""
Clone orchestration run demo data from base tenant to virtual tenant
This endpoint is called by the demo_session service during session initialization.
It clones orchestration runs with date adjustments to make them appear recent.
If the base tenant has no orchestration runs, it will first seed them from the fixture.
"""
start_time = datetime.now(timezone.utc)
# Parse session_created_at or use current time
if session_created_at:
try:
reference_time = datetime.fromisoformat(session_created_at.replace('Z', '+00:00'))
        except ValueError:
reference_time = datetime.now(timezone.utc)
else:
reference_time = datetime.now(timezone.utc)
logger.info(
"Starting orchestration runs cloning with date adjustment",
base_tenant_id=base_tenant_id,
virtual_tenant_id=virtual_tenant_id,
demo_account_type=demo_account_type,
session_id=session_id,
reference_time=reference_time.isoformat()
)
try:
virtual_uuid = uuid.UUID(virtual_tenant_id)
# Load fixture data directly into virtual tenant (no base tenant cloning)
runs_created = await load_fixture_data_for_tenant(
db,
virtual_uuid,
demo_account_type,
reference_time,
base_tenant_id
)
await db.commit()
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
logger.info(
"Orchestration runs loaded from fixture successfully",
virtual_tenant_id=str(virtual_tenant_id),
runs_created=runs_created,
duration_ms=duration_ms
)
return {
"service": "orchestrator",
"status": "completed",
"success": True,
"records_cloned": runs_created,
"runs_cloned": runs_created,
"duration_ms": duration_ms
}
except Exception as e:
logger.error("Failed to clone orchestration runs", error=str(e), exc_info=True)
await db.rollback()
raise HTTPException(status_code=500, detail=f"Failed to clone orchestration runs: {str(e)}")
@router.delete("/tenant/{virtual_tenant_id}")
async def delete_demo_data(
virtual_tenant_id: str,
db: AsyncSession = Depends(get_db)
):
"""Delete all orchestration runs for a virtual demo tenant"""
logger.info("Deleting orchestration runs for virtual tenant", virtual_tenant_id=virtual_tenant_id)
start_time = datetime.now(timezone.utc)
try:
virtual_uuid = uuid.UUID(virtual_tenant_id)
# Count records
run_count = await db.scalar(
select(func.count(OrchestrationRun.id))
.where(OrchestrationRun.tenant_id == virtual_uuid)
)
# Delete orchestration runs
await db.execute(
delete(OrchestrationRun)
.where(OrchestrationRun.tenant_id == virtual_uuid)
)
await db.commit()
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
logger.info(
"Orchestration runs deleted successfully",
virtual_tenant_id=virtual_tenant_id,
duration_ms=duration_ms
)
return {
"service": "orchestrator",
"status": "deleted",
"virtual_tenant_id": virtual_tenant_id,
"records_deleted": {
"orchestration_runs": run_count,
"total": run_count
},
"duration_ms": duration_ms
}
except Exception as e:
logger.error("Failed to delete orchestration runs", error=str(e), exc_info=True)
await db.rollback()
raise HTTPException(status_code=500, detail=str(e))
@router.get("/clone/health")
async def health_check():
"""Health check for demo cloning endpoint"""
return {"status": "healthy", "service": "orchestrator"}

View File

@@ -0,0 +1,346 @@
# ================================================================
# services/orchestrator/app/api/orchestration.py
# ================================================================
"""
Orchestration API Endpoints
Testing and manual trigger endpoints for orchestration
"""
import uuid
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel, Field
import structlog
from app.core.database import get_db
from app.repositories.orchestration_run_repository import OrchestrationRunRepository
from sqlalchemy.ext.asyncio import AsyncSession
logger = structlog.get_logger()
router = APIRouter(prefix="/api/v1/tenants/{tenant_id}/orchestrator", tags=["Orchestration"])
# ================================================================
# REQUEST/RESPONSE SCHEMAS
# ================================================================
class OrchestratorTestRequest(BaseModel):
"""Request schema for testing orchestrator"""
test_scenario: Optional[str] = Field(None, description="Test scenario: full, production_only, procurement_only")
dry_run: bool = Field(False, description="Dry run mode (no actual changes)")
class OrchestratorTestResponse(BaseModel):
"""Response schema for orchestrator test"""
success: bool
message: str
tenant_id: str
forecasting_completed: bool = False
production_completed: bool = False
procurement_completed: bool = False
notifications_sent: bool = False
summary: dict = {}
class OrchestratorWorkflowRequest(BaseModel):
"""Request schema for daily workflow trigger"""
dry_run: bool = Field(False, description="Dry run mode (no actual changes)")
class OrchestratorWorkflowResponse(BaseModel):
"""Response schema for daily workflow trigger"""
success: bool
message: str
tenant_id: str
run_id: Optional[str] = None
forecasting_completed: bool = False
production_completed: bool = False
procurement_completed: bool = False
notifications_sent: bool = False
summary: dict = {}
# ================================================================
# API ENDPOINTS
# ================================================================
@router.post("/test", response_model=OrchestratorTestResponse)
async def trigger_orchestrator_test(
tenant_id: str,
request_data: OrchestratorTestRequest,
request: Request,
db: AsyncSession = Depends(get_db)
):
"""
Trigger orchestrator for testing purposes
This endpoint allows manual triggering of the orchestration workflow
for a specific tenant, useful for testing during development.
Args:
tenant_id: Tenant ID to orchestrate
request_data: Test request with scenario and dry_run options
request: FastAPI request object
db: Database session
Returns:
OrchestratorTestResponse with results
"""
logger.info("Orchestrator test trigger requested",
tenant_id=tenant_id,
test_scenario=request_data.test_scenario,
dry_run=request_data.dry_run)
try:
# Get scheduler service from app state
if not hasattr(request.app.state, 'scheduler_service'):
raise HTTPException(
status_code=503,
detail="Orchestrator scheduler service not available"
)
scheduler_service = request.app.state.scheduler_service
# Trigger orchestration
tenant_uuid = uuid.UUID(tenant_id)
result = await scheduler_service.trigger_orchestration_for_tenant(
tenant_id=tenant_uuid,
test_scenario=request_data.test_scenario
)
# Get the latest run for this tenant
repo = OrchestrationRunRepository(db)
latest_run = await repo.get_latest_run_for_tenant(tenant_uuid)
# Build response
response = OrchestratorTestResponse(
success=result.get('success', False),
message=result.get('message', 'Orchestration completed'),
tenant_id=tenant_id,
forecasting_completed=latest_run.forecasting_status == 'success' if latest_run else False,
production_completed=latest_run.production_status == 'success' if latest_run else False,
procurement_completed=latest_run.procurement_status == 'success' if latest_run else False,
notifications_sent=latest_run.notification_status == 'success' if latest_run else False,
summary={
'forecasts_generated': latest_run.forecasts_generated if latest_run else 0,
'batches_created': latest_run.production_batches_created if latest_run else 0,
'pos_created': latest_run.purchase_orders_created if latest_run else 0,
'notifications_sent': latest_run.notifications_sent if latest_run else 0
}
)
logger.info("Orchestrator test completed",
tenant_id=tenant_id,
success=response.success)
return response
except ValueError as e:
raise HTTPException(status_code=400, detail=f"Invalid tenant ID: {str(e)}")
except Exception as e:
logger.error("Orchestrator test failed",
tenant_id=tenant_id,
error=str(e),
exc_info=True)
raise HTTPException(status_code=500, detail=f"Orchestrator test failed: {str(e)}")
@router.post("/run-daily-workflow", response_model=OrchestratorWorkflowResponse)
async def run_daily_workflow(
tenant_id: str,
    request: Request,
    request_data: Optional[OrchestratorWorkflowRequest] = None,
db: AsyncSession = Depends(get_db)
):
"""
Trigger the daily orchestrated workflow for a tenant
This endpoint runs the complete daily workflow which includes:
1. Forecasting Service: Generate demand forecasts
2. Production Service: Create production schedule from forecasts
3. Procurement Service: Generate procurement plan
4. Notification Service: Send relevant notifications
This is the production endpoint used by the dashboard scheduler button.
Args:
tenant_id: Tenant ID to orchestrate
request_data: Optional request data with dry_run flag
request: FastAPI request object
db: Database session
Returns:
OrchestratorWorkflowResponse with workflow execution results
"""
logger.info("Daily workflow trigger requested", tenant_id=tenant_id)
# Handle optional request_data
if request_data is None:
request_data = OrchestratorWorkflowRequest()
try:
# Get scheduler service from app state
if not hasattr(request.app.state, 'scheduler_service'):
raise HTTPException(
status_code=503,
detail="Orchestrator scheduler service not available"
)
scheduler_service = request.app.state.scheduler_service
# Trigger orchestration (use full workflow, not test scenario)
tenant_uuid = uuid.UUID(tenant_id)
result = await scheduler_service.trigger_orchestration_for_tenant(
tenant_id=tenant_uuid,
test_scenario=None # Full production workflow
)
# Get the latest run for this tenant
repo = OrchestrationRunRepository(db)
latest_run = await repo.get_latest_run_for_tenant(tenant_uuid)
# Build response
response = OrchestratorWorkflowResponse(
success=result.get('success', False),
message=result.get('message', 'Daily workflow completed successfully'),
tenant_id=tenant_id,
run_id=str(latest_run.id) if latest_run else None,
forecasting_completed=latest_run.forecasting_status == 'success' if latest_run else False,
production_completed=latest_run.production_status == 'success' if latest_run else False,
procurement_completed=latest_run.procurement_status == 'success' if latest_run else False,
notifications_sent=latest_run.notification_status == 'success' if latest_run else False,
summary={
                'run_number': latest_run.run_number if latest_run else None,
'forecasts_generated': latest_run.forecasts_generated if latest_run else 0,
'production_batches_created': latest_run.production_batches_created if latest_run else 0,
'purchase_orders_created': latest_run.purchase_orders_created if latest_run else 0,
'notifications_sent': latest_run.notifications_sent if latest_run else 0,
'duration_seconds': latest_run.duration_seconds if latest_run else 0
}
)
logger.info("Daily workflow completed",
tenant_id=tenant_id,
success=response.success,
run_id=response.run_id)
return response
except ValueError as e:
raise HTTPException(status_code=400, detail=f"Invalid tenant ID: {str(e)}")
except Exception as e:
logger.error("Daily workflow failed",
tenant_id=tenant_id,
error=str(e),
exc_info=True)
raise HTTPException(status_code=500, detail=f"Daily workflow failed: {str(e)}")
@router.get("/health")
async def orchestrator_health():
"""Check orchestrator health"""
return {
"status": "healthy",
"service": "orchestrator",
"message": "Orchestrator service is running"
}
@router.get("/runs", response_model=dict)
async def list_orchestration_runs(
tenant_id: str,
limit: int = 10,
offset: int = 0,
db: AsyncSession = Depends(get_db)
):
"""
List orchestration runs for a tenant
Args:
tenant_id: Tenant ID
limit: Maximum number of runs to return
offset: Number of runs to skip
db: Database session
Returns:
List of orchestration runs
"""
try:
tenant_uuid = uuid.UUID(tenant_id)
repo = OrchestrationRunRepository(db)
runs = await repo.list_runs(
tenant_id=tenant_uuid,
limit=limit,
offset=offset
)
return {
"runs": [
{
"id": str(run.id),
"run_number": run.run_number,
"status": run.status.value,
"started_at": run.started_at.isoformat() if run.started_at else None,
"completed_at": run.completed_at.isoformat() if run.completed_at else None,
"duration_seconds": run.duration_seconds,
"forecasts_generated": run.forecasts_generated,
"batches_created": run.production_batches_created,
"pos_created": run.purchase_orders_created
}
for run in runs
],
"total": len(runs),
"limit": limit,
"offset": offset
}
except ValueError as e:
raise HTTPException(status_code=400, detail=f"Invalid tenant ID: {str(e)}")
except Exception as e:
logger.error("Error listing orchestration runs",
tenant_id=tenant_id,
error=str(e))
raise HTTPException(status_code=500, detail=str(e))
@router.get("/last-run")
async def get_last_orchestration_run(
tenant_id: str,
db: AsyncSession = Depends(get_db)
):
"""
Get timestamp of last orchestration run
Lightweight endpoint for health status frontend migration (Phase 4).
Returns only timestamp and run number for the most recent completed run.
Args:
tenant_id: Tenant ID
Returns:
Dict with timestamp and runNumber (or None if no runs)
"""
try:
tenant_uuid = uuid.UUID(tenant_id)
repo = OrchestrationRunRepository(db)
# Get most recent completed run
latest_run = await repo.get_latest_run_for_tenant(tenant_uuid)
if not latest_run:
return {"timestamp": None, "runNumber": None}
return {
"timestamp": latest_run.started_at.isoformat() if latest_run.started_at else None,
"runNumber": latest_run.run_number
}
except ValueError as e:
raise HTTPException(status_code=400, detail=f"Invalid tenant ID: {str(e)}")
except Exception as e:
logger.error("Error getting last orchestration run",
tenant_id=tenant_id,
error=str(e))
raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,133 @@
# ================================================================
# services/orchestrator/app/core/config.py
# ================================================================
"""
Orchestrator Service Configuration
"""
import os
from pydantic import Field
from shared.config.base import BaseServiceSettings
class OrchestratorSettings(BaseServiceSettings):
"""Orchestrator service specific settings"""
# Service Identity
APP_NAME: str = "Orchestrator Service"
SERVICE_NAME: str = "orchestrator-service"
VERSION: str = "1.0.0"
DESCRIPTION: str = "Automated orchestration of forecasting, production, and procurement workflows"
# Database configuration (minimal - only for audit logs)
@property
def DATABASE_URL(self) -> str:
"""Build database URL from secure components"""
# Try complete URL first (for backward compatibility)
complete_url = os.getenv("ORCHESTRATOR_DATABASE_URL")
if complete_url:
return complete_url
# Build from components (secure approach)
user = os.getenv("ORCHESTRATOR_DB_USER", "orchestrator_user")
password = os.getenv("ORCHESTRATOR_DB_PASSWORD", "orchestrator_pass123")
host = os.getenv("ORCHESTRATOR_DB_HOST", "localhost")
port = os.getenv("ORCHESTRATOR_DB_PORT", "5432")
name = os.getenv("ORCHESTRATOR_DB_NAME", "orchestrator_db")
return f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{name}"
# Orchestration Settings
ORCHESTRATION_ENABLED: bool = os.getenv("ORCHESTRATION_ENABLED", "true").lower() == "true"
ORCHESTRATION_SCHEDULE: str = os.getenv("ORCHESTRATION_SCHEDULE", "30 5 * * *") # 5:30 AM daily (cron format)
ORCHESTRATION_HOUR: int = int(os.getenv("ORCHESTRATION_HOUR", "2")) # Hour to run daily orchestration (default: 2 AM)
ORCHESTRATION_MINUTE: int = int(os.getenv("ORCHESTRATION_MINUTE", "0")) # Minute to run (default: :00)
ORCHESTRATION_TIMEOUT_SECONDS: int = int(os.getenv("ORCHESTRATION_TIMEOUT_SECONDS", "600")) # 10 minutes
# Tenant Processing
MAX_CONCURRENT_TENANTS: int = int(os.getenv("MAX_CONCURRENT_TENANTS", "5"))
TENANT_TIMEOUT_SECONDS: int = int(os.getenv("TENANT_TIMEOUT_SECONDS", "180")) # 3 minutes per tenant
# Retry Configuration
MAX_RETRIES: int = int(os.getenv("MAX_RETRIES", "3"))
RETRY_DELAY_SECONDS: int = int(os.getenv("RETRY_DELAY_SECONDS", "30"))
ENABLE_EXPONENTIAL_BACKOFF: bool = os.getenv("ENABLE_EXPONENTIAL_BACKOFF", "true").lower() == "true"
# Circuit Breaker
CIRCUIT_BREAKER_ENABLED: bool = os.getenv("CIRCUIT_BREAKER_ENABLED", "true").lower() == "true"
CIRCUIT_BREAKER_FAILURE_THRESHOLD: int = int(os.getenv("CIRCUIT_BREAKER_FAILURE_THRESHOLD", "5"))
CIRCUIT_BREAKER_RESET_TIMEOUT: int = int(os.getenv("CIRCUIT_BREAKER_RESET_TIMEOUT", "300")) # 5 minutes
# ================================================================
# CIRCUIT BREAKER SETTINGS - Enhanced with Pydantic validation
# ================================================================
CIRCUIT_BREAKER_TIMEOUT_DURATION: int = Field(
default=60,
description="Seconds to wait before attempting recovery"
)
CIRCUIT_BREAKER_SUCCESS_THRESHOLD: int = Field(
default=2,
description="Successful calls needed to close circuit"
)
# ================================================================
# SAGA PATTERN SETTINGS
# ================================================================
SAGA_TIMEOUT_SECONDS: int = Field(
default=600,
description="Timeout for saga execution (10 minutes)"
)
SAGA_ENABLE_COMPENSATION: bool = Field(
default=True,
description="Enable saga compensation on failure"
)
# Service Integration URLs
FORECASTING_SERVICE_URL: str = os.getenv("FORECASTING_SERVICE_URL", "http://forecasting-service:8000")
PRODUCTION_SERVICE_URL: str = os.getenv("PRODUCTION_SERVICE_URL", "http://production-service:8000")
PROCUREMENT_SERVICE_URL: str = os.getenv("PROCUREMENT_SERVICE_URL", "http://procurement-service:8000")
NOTIFICATION_SERVICE_URL: str = os.getenv("NOTIFICATION_SERVICE_URL", "http://notification-service:8000")
TENANT_SERVICE_URL: str = os.getenv("TENANT_SERVICE_URL", "http://tenant-service:8000")
# Notification Settings
SEND_NOTIFICATIONS: bool = os.getenv("SEND_NOTIFICATIONS", "true").lower() == "true"
NOTIFY_ON_SUCCESS: bool = os.getenv("NOTIFY_ON_SUCCESS", "true").lower() == "true"
NOTIFY_ON_FAILURE: bool = os.getenv("NOTIFY_ON_FAILURE", "true").lower() == "true"
# Audit and Logging
AUDIT_ORCHESTRATION_RUNS: bool = os.getenv("AUDIT_ORCHESTRATION_RUNS", "true").lower() == "true"
DETAILED_LOGGING: bool = os.getenv("DETAILED_LOGGING", "true").lower() == "true"
# AI Enhancement Settings
ORCHESTRATION_USE_AI_INSIGHTS: bool = os.getenv("ORCHESTRATION_USE_AI_INSIGHTS", "true").lower() == "true"
AI_INSIGHTS_SERVICE_URL: str = os.getenv("AI_INSIGHTS_SERVICE_URL", "http://ai-insights-service:8000")
AI_INSIGHTS_MIN_CONFIDENCE: int = int(os.getenv("AI_INSIGHTS_MIN_CONFIDENCE", "70"))
# Redis Cache Settings (for dashboard performance)
REDIS_HOST: str = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT: int = int(os.getenv("REDIS_PORT", "6379"))
REDIS_DB: int = int(os.getenv("REDIS_DB", "0"))
REDIS_PASSWORD: str = os.getenv("REDIS_PASSWORD", "")
REDIS_TLS_ENABLED: str = os.getenv("REDIS_TLS_ENABLED", "false")
CACHE_ENABLED: bool = os.getenv("CACHE_ENABLED", "true").lower() == "true"
CACHE_TTL_HEALTH: int = int(os.getenv("CACHE_TTL_HEALTH", "30")) # 30 seconds
CACHE_TTL_INSIGHTS: int = int(os.getenv("CACHE_TTL_INSIGHTS", "60")) # 1 minute (reduced for faster metrics updates)
CACHE_TTL_SUMMARY: int = int(os.getenv("CACHE_TTL_SUMMARY", "60")) # 1 minute
# Enterprise dashboard cache TTLs
CACHE_TTL_ENTERPRISE_SUMMARY: int = int(os.getenv("CACHE_TTL_ENTERPRISE_SUMMARY", "60")) # 1 minute
CACHE_TTL_ENTERPRISE_PERFORMANCE: int = int(os.getenv("CACHE_TTL_ENTERPRISE_PERFORMANCE", "60")) # 1 minute
CACHE_TTL_ENTERPRISE_DISTRIBUTION: int = int(os.getenv("CACHE_TTL_ENTERPRISE_DISTRIBUTION", "30")) # 30 seconds
CACHE_TTL_ENTERPRISE_FORECAST: int = int(os.getenv("CACHE_TTL_ENTERPRISE_FORECAST", "120")) # 2 minutes
CACHE_TTL_ENTERPRISE_NETWORK: int = int(os.getenv("CACHE_TTL_ENTERPRISE_NETWORK", "60")) # 1 minute
# Global settings instance
settings = OrchestratorSettings()
def get_settings():
"""Get the global settings instance"""
return settings

View File

@@ -0,0 +1,48 @@
# ================================================================
# services/orchestrator/app/core/database.py
# ================================================================
"""
Database connection and session management for Orchestrator Service
Minimal database - only for audit trail
"""
from shared.database.base import DatabaseManager
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from .config import settings
# Initialize database manager
database_manager = DatabaseManager(
database_url=settings.DATABASE_URL,
echo=settings.DEBUG
)
# Create async session factory
AsyncSessionLocal = async_sessionmaker(
database_manager.async_engine,
class_=AsyncSession,
expire_on_commit=False,
autocommit=False,
autoflush=False,
)
async def get_db() -> AsyncSession:
"""
Dependency to get database session.
Used in FastAPI endpoints via Depends(get_db).
"""
async with AsyncSessionLocal() as session:
try:
yield session
finally:
await session.close()
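# Illustrative endpoint usage (hypothetical route):
#   @router.get("/example")
#   async def example(db: AsyncSession = Depends(get_db)):
#       ...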
async def init_db():
"""Initialize database (create tables if needed)"""
await database_manager.create_all()
async def close_db():
"""Close database connections"""
await database_manager.close()

View File

@@ -0,0 +1,237 @@
# ================================================================
# services/orchestrator/app/main.py
# ================================================================
"""
Orchestrator Service - FastAPI Application
Automated orchestration of forecasting, production, and procurement workflows
"""
from fastapi import FastAPI, Request
from sqlalchemy import text
from app.core.config import settings
from app.core.database import database_manager
from shared.service_base import StandardFastAPIService
class OrchestratorService(StandardFastAPIService):
"""Orchestrator Service with standardized setup"""
expected_migration_version = "001_initial_schema"
def __init__(self):
# Define expected database tables for health checks
orchestrator_expected_tables = [
'orchestration_runs'
]
self.rabbitmq_client = None
self.event_publisher = None
self.leader_election = None
self.scheduler_service = None
super().__init__(
service_name="orchestrator-service",
app_name=settings.APP_NAME,
description=settings.DESCRIPTION,
version=settings.VERSION,
api_prefix="", # Empty because RouteBuilder already includes /api/v1
database_manager=database_manager,
expected_tables=orchestrator_expected_tables,
enable_messaging=True # Enable RabbitMQ for event publishing
)
async def verify_migrations(self):
"""Verify database schema matches the latest migrations"""
try:
async with self.database_manager.get_session() as session:
result = await session.execute(text("SELECT version_num FROM alembic_version"))
version = result.scalar()
if version != self.expected_migration_version:
self.logger.error(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
raise RuntimeError(f"Migration version mismatch: expected {self.expected_migration_version}, got {version}")
self.logger.info(f"Migration verification successful: {version}")
except Exception as e:
self.logger.error(f"Migration verification failed: {e}")
raise
async def _setup_messaging(self):
"""Setup messaging for orchestrator service"""
from shared.messaging import UnifiedEventPublisher, RabbitMQClient
try:
self.rabbitmq_client = RabbitMQClient(settings.RABBITMQ_URL, service_name="orchestrator-service")
await self.rabbitmq_client.connect()
# Create event publisher
self.event_publisher = UnifiedEventPublisher(self.rabbitmq_client, "orchestrator-service")
self.logger.info("Orchestrator service messaging setup completed")
except Exception as e:
self.logger.error("Failed to setup orchestrator messaging", error=str(e))
raise
async def _cleanup_messaging(self):
"""Cleanup messaging for orchestrator service"""
try:
if self.rabbitmq_client:
await self.rabbitmq_client.disconnect()
self.logger.info("Orchestrator service messaging cleanup completed")
except Exception as e:
self.logger.error("Error during orchestrator messaging cleanup", error=str(e))
async def on_startup(self, app: FastAPI):
"""Custom startup logic for orchestrator service"""
# Verify migrations first
await self.verify_migrations()
# Call parent startup (includes database, messaging, etc.)
await super().on_startup(app)
self.logger.info("Orchestrator Service starting up...")
# Initialize leader election for horizontal scaling
# Only the leader pod will run the scheduler
await self._setup_leader_election(app)
# REMOVED: Delivery tracking service - moved to procurement service (domain ownership)
async def _setup_leader_election(self, app: FastAPI):
"""
Setup leader election for scheduler.
CRITICAL FOR HORIZONTAL SCALING:
Without leader election, each pod would run the same scheduled jobs,
causing duplicate forecasts, production schedules, and database contention.
"""
from shared.leader_election import LeaderElectionService
import redis.asyncio as redis
try:
# Create Redis connection for leader election
redis_url = f"redis://:{settings.REDIS_PASSWORD}@{settings.REDIS_HOST}:{settings.REDIS_PORT}/{settings.REDIS_DB}"
if settings.REDIS_TLS_ENABLED.lower() == "true":
redis_url = redis_url.replace("redis://", "rediss://")
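            # e.g. redis://:<password>@redis:6379/0, or rediss://... when REDIS_TLS_ENABLED=true (illustrative)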
redis_client = redis.from_url(redis_url, decode_responses=False)
await redis_client.ping()
# Use shared leader election service
self.leader_election = LeaderElectionService(
redis_client,
service_name="orchestrator"
)
# Define callbacks for leader state changes
async def on_become_leader():
self.logger.info("This pod became the leader - starting scheduler")
from app.services.orchestrator_service import OrchestratorSchedulerService
self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
await self.scheduler_service.start()
app.state.scheduler_service = self.scheduler_service
self.logger.info("Orchestrator scheduler service started (leader only)")
async def on_lose_leader():
self.logger.warning("This pod lost leadership - stopping scheduler")
if self.scheduler_service:
await self.scheduler_service.stop()
self.scheduler_service = None
if hasattr(app.state, 'scheduler_service'):
app.state.scheduler_service = None
self.logger.info("Orchestrator scheduler service stopped (no longer leader)")
# Start leader election
await self.leader_election.start(
on_become_leader=on_become_leader,
on_lose_leader=on_lose_leader
)
# Store leader election in app state for health checks
app.state.leader_election = self.leader_election
self.logger.info("Leader election initialized",
is_leader=self.leader_election.is_leader,
instance_id=self.leader_election.instance_id)
except Exception as e:
self.logger.error("Failed to setup leader election, falling back to standalone mode",
error=str(e))
# Fallback: start scheduler anyway (for single-pod deployments)
from app.services.orchestrator_service import OrchestratorSchedulerService
self.scheduler_service = OrchestratorSchedulerService(self.event_publisher, settings)
await self.scheduler_service.start()
app.state.scheduler_service = self.scheduler_service
self.logger.warning("Scheduler started in standalone mode (no leader election)")
async def on_shutdown(self, app: FastAPI):
"""Custom shutdown logic for orchestrator service"""
self.logger.info("Orchestrator Service shutting down...")
# Stop leader election (this will also stop scheduler if we're the leader)
if self.leader_election:
await self.leader_election.stop()
self.logger.info("Leader election stopped")
# Stop scheduler service if still running
if self.scheduler_service:
await self.scheduler_service.stop()
self.logger.info("Orchestrator scheduler service stopped")
def get_service_features(self):
"""Return orchestrator-specific features"""
return [
"automated_orchestration",
"forecasting_integration",
"production_scheduling",
"procurement_planning",
"notification_dispatch",
"leader_election",
"retry_mechanism",
"circuit_breaker"
]
# Create service instance
service = OrchestratorService()
# Create FastAPI app with standardized setup
app = service.create_app()
# Setup standard endpoints (health, readiness, metrics)
service.setup_standard_endpoints()
# Include routers
# BUSINESS: Orchestration operations
from app.api.orchestration import router as orchestration_router
from app.api.internal import router as internal_router
service.add_router(orchestration_router)
service.add_router(internal_router)
# INTERNAL: Service-to-service endpoints for demo data cloning
from app.api.internal_demo import router as internal_demo_router
service.add_router(internal_demo_router, tags=["internal-demo"])
@app.middleware("http")
async def logging_middleware(request: Request, call_next):
"""Add request logging middleware"""
import time
start_time = time.time()
response = await call_next(request)
process_time = time.time() - start_time
service.logger.info("HTTP request processed",
method=request.method,
url=str(request.url),
status_code=response.status_code,
process_time=round(process_time, 4))
return response
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"main:app",
host="0.0.0.0",
port=8000,
reload=settings.DEBUG
)

View File

@@ -0,0 +1,894 @@
"""
AI-Enhanced Orchestration Saga
Integrates ML insights into daily workflow orchestration
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime, timedelta
from uuid import UUID
import structlog
from shared.clients.ai_insights_client import AIInsightsClient
logger = structlog.get_logger()
class AIEnhancedOrchestrator:
"""
Enhanced orchestration engine that integrates ML insights into daily workflow.
Workflow:
1. Pre-Orchestration: Gather all relevant insights for target date
2. Intelligent Planning: Modify orchestration plan based on insights
3. Execution: Apply insights with confidence-based decision making
4. Feedback Tracking: Record outcomes for continuous learning
Replaces hardcoded logic with learned intelligence from:
- Demand Forecasting
- Supplier Performance
- Safety Stock Optimization
- Price Forecasting
- Production Yield Prediction
- Dynamic Business Rules
"""
def __init__(
self,
ai_insights_base_url: str = "http://ai-insights-service:8000",
min_confidence_threshold: int = 70
):
self.ai_insights_client = AIInsightsClient(ai_insights_base_url)
self.min_confidence_threshold = min_confidence_threshold
self.applied_insights = [] # Track applied insights for feedback
async def orchestrate_with_ai(
self,
tenant_id: str,
target_date: datetime,
base_orchestration_plan: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
Run AI-enhanced orchestration for a target date.
Args:
tenant_id: Tenant identifier
target_date: Date to orchestrate for
base_orchestration_plan: Optional base plan to enhance (if None, creates new)
Returns:
Enhanced orchestration plan with applied insights and metadata
"""
logger.info(
"Starting AI-enhanced orchestration",
tenant_id=tenant_id,
target_date=target_date.isoformat()
)
# Step 1: Gather insights for target date
insights = await self._gather_insights(tenant_id, target_date)
logger.info(
"Insights gathered",
demand_forecasts=len(insights['demand_forecasts']),
supplier_alerts=len(insights['supplier_alerts']),
inventory_optimizations=len(insights['inventory_optimizations']),
price_opportunities=len(insights['price_opportunities']),
yield_predictions=len(insights['yield_predictions']),
business_rules=len(insights['business_rules'])
)
# Step 2: Initialize or load base plan
if base_orchestration_plan is None:
orchestration_plan = self._create_base_plan(target_date)
else:
orchestration_plan = base_orchestration_plan.copy()
# Step 3: Apply insights to plan
enhanced_plan = await self._apply_insights_to_plan(
orchestration_plan, insights, tenant_id
)
# Step 4: Generate execution summary
execution_summary = self._generate_execution_summary(
enhanced_plan, insights
)
logger.info(
"AI-enhanced orchestration complete",
tenant_id=tenant_id,
insights_applied=execution_summary['total_insights_applied'],
modifications=execution_summary['total_modifications']
)
return {
'tenant_id': tenant_id,
'target_date': target_date.isoformat(),
'orchestrated_at': datetime.utcnow().isoformat(),
'plan': enhanced_plan,
'insights_used': insights,
'execution_summary': execution_summary,
'applied_insights': self.applied_insights
}
async def _gather_insights(
self,
tenant_id: str,
target_date: datetime
) -> Dict[str, List[Dict[str, Any]]]:
"""
Gather all relevant insights for target date from AI Insights Service.
Returns insights categorized by type:
- demand_forecasts
- supplier_alerts
- inventory_optimizations
- price_opportunities
- yield_predictions
- business_rules
"""
# Get orchestration-ready insights
insights = await self.ai_insights_client.get_orchestration_ready_insights(
tenant_id=UUID(tenant_id),
target_date=target_date,
min_confidence=self.min_confidence_threshold
)
# Categorize insights by source
categorized = {
'demand_forecasts': [],
'supplier_alerts': [],
'inventory_optimizations': [],
'price_opportunities': [],
'yield_predictions': [],
'business_rules': [],
'other': []
}
for insight in insights:
source_model = insight.get('source_model', '')
category = insight.get('category', '')
if source_model == 'hybrid_forecaster' or category == 'demand':
categorized['demand_forecasts'].append(insight)
elif source_model == 'supplier_performance_predictor':
categorized['supplier_alerts'].append(insight)
elif source_model == 'safety_stock_optimizer':
categorized['inventory_optimizations'].append(insight)
elif source_model == 'price_forecaster':
categorized['price_opportunities'].append(insight)
elif source_model == 'yield_predictor':
categorized['yield_predictions'].append(insight)
elif source_model == 'business_rules_engine':
categorized['business_rules'].append(insight)
else:
categorized['other'].append(insight)
return categorized
def _create_base_plan(self, target_date: datetime) -> Dict[str, Any]:
"""Create base orchestration plan with default hardcoded values."""
return {
'target_date': target_date.isoformat(),
'procurement': {
'orders': [],
'supplier_selections': {},
'order_quantities': {}
},
'inventory': {
'safety_stock_levels': {},
'reorder_points': {},
'transfers': []
},
'production': {
'production_runs': [],
'recipe_quantities': {},
'worker_assignments': {}
},
'sales': {
'forecasted_demand': {},
'pricing_adjustments': {}
},
'modifications': [],
'ai_enhancements': []
}
async def _apply_insights_to_plan(
self,
plan: Dict[str, Any],
insights: Dict[str, List[Dict[str, Any]]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply categorized insights to orchestration plan.
Each insight type modifies specific parts of the plan:
- Demand forecasts → sales forecasts, production quantities
- Supplier alerts → supplier selection, procurement timing
- Inventory optimizations → safety stock levels, reorder points
- Price opportunities → procurement timing, order quantities
- Yield predictions → production quantities, worker assignments
- Business rules → cross-cutting modifications
"""
enhanced_plan = plan.copy()
# Apply demand forecasts
if insights['demand_forecasts']:
enhanced_plan = await self._apply_demand_forecasts(
enhanced_plan, insights['demand_forecasts'], tenant_id
)
# Apply supplier alerts
if insights['supplier_alerts']:
enhanced_plan = await self._apply_supplier_alerts(
enhanced_plan, insights['supplier_alerts'], tenant_id
)
# Apply inventory optimizations
if insights['inventory_optimizations']:
enhanced_plan = await self._apply_inventory_optimizations(
enhanced_plan, insights['inventory_optimizations'], tenant_id
)
# Apply price opportunities
if insights['price_opportunities']:
enhanced_plan = await self._apply_price_opportunities(
enhanced_plan, insights['price_opportunities'], tenant_id
)
# Apply yield predictions
if insights['yield_predictions']:
enhanced_plan = await self._apply_yield_predictions(
enhanced_plan, insights['yield_predictions'], tenant_id
)
# Apply business rules (highest priority, can override)
if insights['business_rules']:
enhanced_plan = await self._apply_business_rules(
enhanced_plan, insights['business_rules'], tenant_id
)
return enhanced_plan
async def _apply_demand_forecasts(
self,
plan: Dict[str, Any],
forecasts: List[Dict[str, Any]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply demand forecasts to sales and production planning.
Modifications:
- Update sales forecasted_demand
- Adjust production recipe_quantities
- Record insight application
"""
for forecast in forecasts:
if forecast['confidence'] < self.min_confidence_threshold:
continue
metrics = forecast.get('metrics_json', {})
product_id = metrics.get('product_id')
predicted_demand = metrics.get('predicted_demand')
forecast_date = metrics.get('forecast_date')
if not product_id or predicted_demand is None:
continue
# Update sales forecast
plan['sales']['forecasted_demand'][product_id] = {
'quantity': predicted_demand,
'confidence': forecast['confidence'],
'source': 'ai_forecast',
'insight_id': forecast.get('id')
}
# Adjust production quantities (demand + buffer)
buffer_pct = 1.10 # 10% buffer for uncertainty
production_quantity = int(predicted_demand * buffer_pct)
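            # e.g. predicted_demand=200 -> production_quantity=220 with the 10% buffer (illustrative)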
plan['production']['recipe_quantities'][product_id] = {
'quantity': production_quantity,
'demand_forecast': predicted_demand,
'buffer_applied': buffer_pct,
'source': 'ai_forecast',
'insight_id': forecast.get('id')
}
# Record modification
plan['modifications'].append({
'type': 'demand_forecast_applied',
'insight_id': forecast.get('id'),
'product_id': product_id,
'predicted_demand': predicted_demand,
'production_quantity': production_quantity,
'confidence': forecast['confidence']
})
# Track for feedback
self.applied_insights.append({
'insight_id': forecast.get('id'),
'type': 'demand_forecast',
'applied_at': datetime.utcnow().isoformat(),
'tenant_id': tenant_id,
'metrics': {
'product_id': product_id,
'predicted_demand': predicted_demand,
'production_quantity': production_quantity
}
})
logger.info(
"Applied demand forecast",
product_id=product_id,
predicted_demand=predicted_demand,
production_quantity=production_quantity
)
return plan
async def _apply_supplier_alerts(
self,
plan: Dict[str, Any],
alerts: List[Dict[str, Any]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply supplier performance alerts to procurement decisions.
Modifications:
- Switch suppliers for low reliability
- Adjust lead times for delays
- Increase order quantities for short deliveries
"""
for alert in alerts:
if alert['confidence'] < self.min_confidence_threshold:
continue
metrics = alert.get('metrics_json', {})
supplier_id = metrics.get('supplier_id')
reliability_score = metrics.get('reliability_score')
predicted_delay = metrics.get('predicted_delivery_delay_days')
if not supplier_id:
continue
# Low reliability: recommend supplier switch
if reliability_score and reliability_score < 70:
plan['procurement']['supplier_selections'][supplier_id] = {
'action': 'avoid',
'reason': f'Low reliability score: {reliability_score}',
'alternative_required': True,
'source': 'supplier_alert',
'insight_id': alert.get('id')
}
plan['modifications'].append({
'type': 'supplier_switch_recommended',
'insight_id': alert.get('id'),
'supplier_id': supplier_id,
'reliability_score': reliability_score,
'confidence': alert['confidence']
})
# Delay predicted: adjust lead time
if predicted_delay and predicted_delay > 1:
plan['procurement']['supplier_selections'][supplier_id] = {
'action': 'adjust_lead_time',
'additional_lead_days': int(predicted_delay),
'reason': f'Predicted delay: {predicted_delay} days',
'source': 'supplier_alert',
'insight_id': alert.get('id')
}
plan['modifications'].append({
'type': 'lead_time_adjusted',
'insight_id': alert.get('id'),
'supplier_id': supplier_id,
'additional_days': int(predicted_delay),
'confidence': alert['confidence']
})
# Track for feedback
self.applied_insights.append({
'insight_id': alert.get('id'),
'type': 'supplier_alert',
'applied_at': datetime.utcnow().isoformat(),
'tenant_id': tenant_id,
'metrics': {
'supplier_id': supplier_id,
'reliability_score': reliability_score,
'predicted_delay': predicted_delay
}
})
logger.info(
"Applied supplier alert",
supplier_id=supplier_id,
reliability_score=reliability_score,
predicted_delay=predicted_delay
)
return plan
async def _apply_inventory_optimizations(
self,
plan: Dict[str, Any],
optimizations: List[Dict[str, Any]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply safety stock optimizations to inventory management.
Modifications:
- Update safety stock levels (from hardcoded 95% to learned optimal)
- Adjust reorder points accordingly
"""
for optimization in optimizations:
if optimization['confidence'] < self.min_confidence_threshold:
continue
metrics = optimization.get('metrics_json', {})
product_id = metrics.get('inventory_product_id')
optimal_safety_stock = metrics.get('optimal_safety_stock')
optimal_service_level = metrics.get('optimal_service_level')
if not product_id or optimal_safety_stock is None:
continue
# Update safety stock level
plan['inventory']['safety_stock_levels'][product_id] = {
'quantity': optimal_safety_stock,
'service_level': optimal_service_level,
'source': 'ai_optimization',
'insight_id': optimization.get('id'),
'replaced_hardcoded': True
}
# Adjust reorder point (lead time demand + safety stock)
# This would use demand forecast if available
lead_time_demand = metrics.get('lead_time_demand', optimal_safety_stock * 2)
reorder_point = lead_time_demand + optimal_safety_stock
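            # e.g. lead_time_demand=40 and optimal_safety_stock=20 -> reorder_point=60 (illustrative)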
plan['inventory']['reorder_points'][product_id] = {
'quantity': reorder_point,
'lead_time_demand': lead_time_demand,
'safety_stock': optimal_safety_stock,
'source': 'ai_optimization',
'insight_id': optimization.get('id')
}
plan['modifications'].append({
'type': 'safety_stock_optimized',
'insight_id': optimization.get('id'),
'product_id': product_id,
'optimal_safety_stock': optimal_safety_stock,
'optimal_service_level': optimal_service_level,
'confidence': optimization['confidence']
})
# Track for feedback
self.applied_insights.append({
'insight_id': optimization.get('id'),
'type': 'inventory_optimization',
'applied_at': datetime.utcnow().isoformat(),
'tenant_id': tenant_id,
'metrics': {
'product_id': product_id,
'optimal_safety_stock': optimal_safety_stock,
'reorder_point': reorder_point
}
})
logger.info(
"Applied safety stock optimization",
product_id=product_id,
optimal_safety_stock=optimal_safety_stock,
reorder_point=reorder_point
)
return plan
async def _apply_price_opportunities(
self,
plan: Dict[str, Any],
opportunities: List[Dict[str, Any]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply price forecasting opportunities to procurement timing.
Modifications:
- Advance orders for predicted price increases
- Delay orders for predicted price decreases
- Increase quantities for bulk opportunities
"""
for opportunity in opportunities:
if opportunity['confidence'] < self.min_confidence_threshold:
continue
metrics = opportunity.get('metrics_json', {})
ingredient_id = metrics.get('ingredient_id')
recommendation = metrics.get('recommendation')
expected_price_change = metrics.get('expected_price_change_pct')
if not ingredient_id or not recommendation:
continue
# Buy now: price increasing
if recommendation == 'buy_now' and expected_price_change and expected_price_change > 5:
plan['procurement']['order_quantities'][ingredient_id] = {
'action': 'increase',
'multiplier': 1.5, # Buy 50% more
'reason': f'Price expected to increase {expected_price_change:.1f}%',
'source': 'price_forecast',
'insight_id': opportunity.get('id')
}
plan['modifications'].append({
'type': 'bulk_purchase_opportunity',
'insight_id': opportunity.get('id'),
'ingredient_id': ingredient_id,
'expected_price_change': expected_price_change,
'quantity_multiplier': 1.5,
'confidence': opportunity['confidence']
})
# Wait: price decreasing
elif recommendation == 'wait' and expected_price_change and expected_price_change < -5:
plan['procurement']['order_quantities'][ingredient_id] = {
'action': 'delay',
'delay_days': 7,
'reason': f'Price expected to decrease {abs(expected_price_change):.1f}%',
'source': 'price_forecast',
'insight_id': opportunity.get('id')
}
plan['modifications'].append({
'type': 'procurement_delayed',
'insight_id': opportunity.get('id'),
'ingredient_id': ingredient_id,
'expected_price_change': expected_price_change,
'delay_days': 7,
'confidence': opportunity['confidence']
})
# Track for feedback
self.applied_insights.append({
'insight_id': opportunity.get('id'),
'type': 'price_opportunity',
'applied_at': datetime.utcnow().isoformat(),
'tenant_id': tenant_id,
'metrics': {
'ingredient_id': ingredient_id,
'recommendation': recommendation,
'expected_price_change': expected_price_change
}
})
logger.info(
"Applied price opportunity",
ingredient_id=ingredient_id,
recommendation=recommendation,
expected_price_change=expected_price_change
)
return plan
async def _apply_yield_predictions(
self,
plan: Dict[str, Any],
predictions: List[Dict[str, Any]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply production yield predictions to production planning.
Modifications:
- Increase production quantities for low predicted yield
- Optimize worker assignments
- Adjust production timing
"""
for prediction in predictions:
if prediction['confidence'] < self.min_confidence_threshold:
continue
metrics = prediction.get('metrics_json', {})
recipe_id = metrics.get('recipe_id')
predicted_yield = metrics.get('predicted_yield')
expected_waste = metrics.get('expected_waste')
if not recipe_id or predicted_yield is None:
continue
# Low yield: increase production quantity to compensate
if predicted_yield < 90:
current_quantity = plan['production']['recipe_quantities'].get(
recipe_id, {}
).get('quantity', 100)
# Adjust quantity to account for predicted waste
adjusted_quantity = int(current_quantity * (100 / predicted_yield))
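                # e.g. current_quantity=100 at predicted_yield=80 -> adjusted_quantity=125 (illustrative)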
plan['production']['recipe_quantities'][recipe_id] = {
'quantity': adjusted_quantity,
'predicted_yield': predicted_yield,
'waste_compensation': adjusted_quantity - current_quantity,
'source': 'yield_prediction',
'insight_id': prediction.get('id')
}
plan['modifications'].append({
'type': 'yield_compensation_applied',
'insight_id': prediction.get('id'),
'recipe_id': recipe_id,
'predicted_yield': predicted_yield,
'original_quantity': current_quantity,
'adjusted_quantity': adjusted_quantity,
'confidence': prediction['confidence']
})
# Track for feedback
self.applied_insights.append({
'insight_id': prediction.get('id'),
'type': 'yield_prediction',
'applied_at': datetime.utcnow().isoformat(),
'tenant_id': tenant_id,
'metrics': {
'recipe_id': recipe_id,
'predicted_yield': predicted_yield,
'expected_waste': expected_waste
}
})
logger.info(
"Applied yield prediction",
recipe_id=recipe_id,
predicted_yield=predicted_yield
)
return plan
async def _apply_business_rules(
self,
plan: Dict[str, Any],
rules: List[Dict[str, Any]],
tenant_id: str
) -> Dict[str, Any]:
"""
Apply dynamic business rules to orchestration plan.
        Business rules can override modifications from other insights when business policy requires it.
"""
for rule in rules:
if rule['confidence'] < self.min_confidence_threshold:
continue
# Business rules are flexible and defined in JSONB
# Parse recommendation_actions to understand what to apply
actions = rule.get('recommendation_actions', [])
for action in actions:
action_type = action.get('action')
params = action.get('params', {})
# Example: Force supplier switch
if action_type == 'force_supplier_switch':
supplier_id = params.get('from_supplier_id')
alternate_id = params.get('to_supplier_id')
if supplier_id and alternate_id:
plan['procurement']['supplier_selections'][supplier_id] = {
'action': 'replace',
'alternate_supplier': alternate_id,
'reason': rule.get('description'),
'source': 'business_rule',
'insight_id': rule.get('id'),
'override': True
}
# Example: Halt production
elif action_type == 'halt_production':
recipe_id = params.get('recipe_id')
if recipe_id:
plan['production']['recipe_quantities'][recipe_id] = {
'quantity': 0,
'halted': True,
'reason': rule.get('description'),
'source': 'business_rule',
'insight_id': rule.get('id')
}
plan['modifications'].append({
'type': 'business_rule_applied',
'insight_id': rule.get('id'),
'rule_description': rule.get('description'),
'confidence': rule['confidence']
})
# Track for feedback
self.applied_insights.append({
'insight_id': rule.get('id'),
'type': 'business_rule',
'applied_at': datetime.utcnow().isoformat(),
'tenant_id': tenant_id,
'metrics': {'actions': len(actions)}
})
logger.info(
"Applied business rule",
rule_description=rule.get('title')
)
return plan
def _generate_execution_summary(
self,
plan: Dict[str, Any],
insights: Dict[str, List[Dict[str, Any]]]
) -> Dict[str, Any]:
"""Generate summary of AI-enhanced orchestration execution."""
total_insights_available = sum(len(v) for v in insights.values())
total_insights_applied = len(self.applied_insights)
total_modifications = len(plan.get('modifications', []))
        # Count by type (category keys are plural; applied insight types are singular)
        insights_by_type = {}
        for category, category_insights in insights.items():
            # Naive singularization: "...ies" -> "...y", otherwise drop a trailing "s"
            singular = (
                category[:-3] + 'y' if category.endswith('ies')
                else category[:-1] if category.endswith('s')
                else category
            )
            insights_by_type[category] = {
                'available': len(category_insights),
                'applied': len([
                    i for i in self.applied_insights
                    if i['type'] == singular
                ])
            }
return {
'total_insights_available': total_insights_available,
'total_insights_applied': total_insights_applied,
'total_modifications': total_modifications,
'application_rate': round(
(total_insights_applied / total_insights_available * 100)
if total_insights_available > 0 else 0,
2
),
'insights_by_type': insights_by_type,
'modifications_summary': self._summarize_modifications(plan)
}
def _summarize_modifications(self, plan: Dict[str, Any]) -> Dict[str, int]:
"""Summarize modifications by type."""
modifications = plan.get('modifications', [])
summary = {}
for mod in modifications:
mod_type = mod.get('type', 'unknown')
summary[mod_type] = summary.get(mod_type, 0) + 1
return summary
async def record_orchestration_feedback(
self,
tenant_id: str,
target_date: datetime,
actual_outcomes: Dict[str, Any]
) -> Dict[str, Any]:
"""
Record feedback for applied insights to enable continuous learning.
Args:
tenant_id: Tenant identifier
target_date: Orchestration target date
actual_outcomes: Actual results:
- actual_demand: {product_id: actual_quantity}
- actual_yields: {recipe_id: actual_yield_pct}
- actual_costs: {ingredient_id: actual_price}
- supplier_performance: {supplier_id: on_time_delivery}
Returns:
Feedback recording results
"""
logger.info(
"Recording orchestration feedback",
tenant_id=tenant_id,
target_date=target_date.isoformat(),
applied_insights=len(self.applied_insights)
)
feedback_results = []
for applied in self.applied_insights:
insight_id = applied.get('insight_id')
insight_type = applied.get('type')
metrics = applied.get('metrics', {})
# Prepare feedback based on type
feedback_data = {
'applied': True,
'applied_at': applied.get('applied_at'),
'outcome_date': target_date.isoformat()
}
# Demand forecast feedback
if insight_type == 'demand_forecast':
product_id = metrics.get('product_id')
predicted_demand = metrics.get('predicted_demand')
actual_demand = actual_outcomes.get('actual_demand', {}).get(product_id)
if actual_demand is not None:
error = abs(actual_demand - predicted_demand)
error_pct = (error / actual_demand * 100) if actual_demand > 0 else 0
feedback_data['outcome_metrics'] = {
'predicted_demand': predicted_demand,
'actual_demand': actual_demand,
'error': error,
'error_pct': round(error_pct, 2),
'accuracy': round(100 - error_pct, 2)
}
# Yield prediction feedback
elif insight_type == 'yield_prediction':
recipe_id = metrics.get('recipe_id')
predicted_yield = metrics.get('predicted_yield')
actual_yield = actual_outcomes.get('actual_yields', {}).get(recipe_id)
if actual_yield is not None:
error = abs(actual_yield - predicted_yield)
feedback_data['outcome_metrics'] = {
'predicted_yield': predicted_yield,
'actual_yield': actual_yield,
'error': round(error, 2),
'accuracy': round(100 - (error / actual_yield * 100), 2) if actual_yield > 0 else 0
}
# Record feedback via AI Insights Client
try:
await self.ai_insights_client.record_feedback(
tenant_id=UUID(tenant_id),
insight_id=UUID(insight_id) if insight_id else None,
feedback_data=feedback_data
)
feedback_results.append({
'insight_id': insight_id,
'insight_type': insight_type,
'status': 'recorded',
'feedback': feedback_data
})
except Exception as e:
logger.error(
"Error recording feedback",
insight_id=insight_id,
error=str(e)
)
feedback_results.append({
'insight_id': insight_id,
'insight_type': insight_type,
'status': 'failed',
'error': str(e)
})
logger.info(
"Feedback recording complete",
total=len(feedback_results),
successful=len([r for r in feedback_results if r['status'] == 'recorded'])
)
return {
'tenant_id': tenant_id,
'target_date': target_date.isoformat(),
'feedback_recorded_at': datetime.utcnow().isoformat(),
'total_insights': len(self.applied_insights),
'feedback_results': feedback_results,
'successful': len([r for r in feedback_results if r['status'] == 'recorded']),
'failed': len([r for r in feedback_results if r['status'] == 'failed'])
}
async def close(self):
"""Close HTTP client connections."""
await self.ai_insights_client.close()
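# --- Usage sketch (illustrative; the class name below is a placeholder because this
# --- excerpt omits the class definition, and an initialized AI insights client and
# --- orchestration plan are assumed) ---
# enhancer = AIEnhancedOrchestrationService(...)  # hypothetical constructor
# plan = await enhancer._apply_yield_predictions(plan, predictions, tenant_id)
# plan = await enhancer._apply_business_rules(plan, rules, tenant_id)
# summary = enhancer._generate_execution_summary(plan, insights)
# await enhancer.record_orchestration_feedback(tenant_id, target_date, actual_outcomes)
# await enhancer.close()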

View File

@@ -0,0 +1,13 @@
# ================================================================
# services/orchestrator/app/models/__init__.py
# ================================================================
"""
Orchestrator Service Models
"""
from .orchestration_run import OrchestrationRun, OrchestrationStatus
__all__ = [
"OrchestrationRun",
"OrchestrationStatus",
]

View File

@@ -0,0 +1,113 @@
# ================================================================
# services/orchestrator/app/models/orchestration_run.py
# ================================================================
"""
Orchestration Run Models - Audit trail for orchestration executions
"""
import uuid
import enum
from datetime import datetime, timezone
from sqlalchemy import Column, String, DateTime, Integer, Text, Boolean, Enum as SQLEnum
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.sql import func
from shared.database.base import Base
class OrchestrationStatus(enum.Enum):
"""Orchestration run status"""
pending = "pending"
running = "running"
completed = "completed"
partial_success = "partial_success"
failed = "failed"
cancelled = "cancelled"
class OrchestrationRun(Base):
"""Audit trail for orchestration executions"""
__tablename__ = "orchestration_runs"
# Primary identification
id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
run_number = Column(String(50), nullable=False, unique=True, index=True)
# Run details
tenant_id = Column(UUID(as_uuid=True), nullable=False, index=True)
status = Column(SQLEnum(OrchestrationStatus), nullable=False, default=OrchestrationStatus.pending, index=True)
run_type = Column(String(50), nullable=False, default="scheduled") # scheduled, manual, test
priority = Column(String(20), nullable=False, default="normal") # normal, high, critical
# Timing
started_at = Column(DateTime(timezone=True), nullable=False, default=lambda: datetime.now(timezone.utc))
completed_at = Column(DateTime(timezone=True), nullable=True)
duration_seconds = Column(Integer, nullable=True)
# Step tracking
forecasting_started_at = Column(DateTime(timezone=True), nullable=True)
forecasting_completed_at = Column(DateTime(timezone=True), nullable=True)
forecasting_status = Column(String(20), nullable=True) # success, failed, skipped
forecasting_error = Column(Text, nullable=True)
production_started_at = Column(DateTime(timezone=True), nullable=True)
production_completed_at = Column(DateTime(timezone=True), nullable=True)
production_status = Column(String(20), nullable=True) # success, failed, skipped
production_error = Column(Text, nullable=True)
procurement_started_at = Column(DateTime(timezone=True), nullable=True)
procurement_completed_at = Column(DateTime(timezone=True), nullable=True)
procurement_status = Column(String(20), nullable=True) # success, failed, skipped
procurement_error = Column(Text, nullable=True)
notification_started_at = Column(DateTime(timezone=True), nullable=True)
notification_completed_at = Column(DateTime(timezone=True), nullable=True)
notification_status = Column(String(20), nullable=True) # success, failed, skipped
notification_error = Column(Text, nullable=True)
# AI Insights tracking
ai_insights_started_at = Column(DateTime(timezone=True), nullable=True)
ai_insights_completed_at = Column(DateTime(timezone=True), nullable=True)
ai_insights_status = Column(String(20), nullable=True) # success, failed, skipped
ai_insights_error = Column(Text, nullable=True)
ai_insights_generated = Column(Integer, nullable=False, default=0)
ai_insights_posted = Column(Integer, nullable=False, default=0)
# Results summary
forecasts_generated = Column(Integer, nullable=False, default=0)
production_batches_created = Column(Integer, nullable=False, default=0)
procurement_plans_created = Column(Integer, nullable=False, default=0)
purchase_orders_created = Column(Integer, nullable=False, default=0)
notifications_sent = Column(Integer, nullable=False, default=0)
# Forecast data passed between services
forecast_data = Column(JSONB, nullable=True) # Store forecast results for downstream services
# Error handling
retry_count = Column(Integer, nullable=False, default=0)
max_retries_reached = Column(Boolean, nullable=False, default=False)
error_message = Column(Text, nullable=True)
error_details = Column(JSONB, nullable=True)
# External references
forecast_id = Column(UUID(as_uuid=True), nullable=True)
production_schedule_id = Column(UUID(as_uuid=True), nullable=True)
procurement_plan_id = Column(UUID(as_uuid=True), nullable=True)
# Saga tracking
saga_steps_total = Column(Integer, nullable=False, default=0)
saga_steps_completed = Column(Integer, nullable=False, default=0)
# Audit fields
created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False)
updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now(), nullable=False)
triggered_by = Column(String(100), nullable=True) # scheduler, user_id, api
# Performance metrics
fulfillment_rate = Column(Integer, nullable=True) # Percentage as integer (0-100)
on_time_delivery_rate = Column(Integer, nullable=True) # Percentage as integer (0-100)
cost_accuracy = Column(Integer, nullable=True) # Percentage as integer (0-100)
quality_score = Column(Integer, nullable=True) # Rating as integer (0-100)
# Metadata
run_metadata = Column(JSONB, nullable=True)

View File

@@ -0,0 +1,193 @@
# ================================================================
# services/orchestrator/app/repositories/orchestration_run_repository.py
# ================================================================
"""
Orchestration Run Repository - Database operations for orchestration audit trail
"""
import uuid
from datetime import datetime, date, timezone
from typing import List, Optional, Dict, Any
from sqlalchemy import select, and_, desc, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.models.orchestration_run import OrchestrationRun, OrchestrationStatus
class OrchestrationRunRepository:
"""Repository for orchestration run operations"""
def __init__(self, db: AsyncSession):
self.db = db
async def create_run(self, run_data: Dict[str, Any]) -> OrchestrationRun:
"""Create a new orchestration run"""
run = OrchestrationRun(**run_data)
self.db.add(run)
await self.db.flush()
return run
async def get_run_by_id(self, run_id: uuid.UUID) -> Optional[OrchestrationRun]:
"""Get orchestration run by ID"""
stmt = select(OrchestrationRun).where(OrchestrationRun.id == run_id)
result = await self.db.execute(stmt)
return result.scalar_one_or_none()
async def update_run(self, run_id: uuid.UUID, updates: Dict[str, Any]) -> Optional[OrchestrationRun]:
"""Update orchestration run"""
run = await self.get_run_by_id(run_id)
if not run:
return None
for key, value in updates.items():
if hasattr(run, key):
setattr(run, key, value)
run.updated_at = datetime.now(timezone.utc)
await self.db.flush()
return run
async def list_runs(
self,
tenant_id: Optional[uuid.UUID] = None,
status: Optional[OrchestrationStatus] = None,
start_date: Optional[date] = None,
end_date: Optional[date] = None,
limit: int = 50,
offset: int = 0
) -> List[OrchestrationRun]:
"""List orchestration runs with filters"""
conditions = []
if tenant_id:
conditions.append(OrchestrationRun.tenant_id == tenant_id)
if status:
conditions.append(OrchestrationRun.status == status)
if start_date:
conditions.append(func.date(OrchestrationRun.started_at) >= start_date)
if end_date:
conditions.append(func.date(OrchestrationRun.started_at) <= end_date)
stmt = (
select(OrchestrationRun)
.where(and_(*conditions) if conditions else True)
.order_by(desc(OrchestrationRun.started_at))
.limit(limit)
.offset(offset)
)
result = await self.db.execute(stmt)
return result.scalars().all()
async def get_latest_run_for_tenant(self, tenant_id: uuid.UUID) -> Optional[OrchestrationRun]:
"""Get the most recent orchestration run for a tenant"""
stmt = (
select(OrchestrationRun)
.where(OrchestrationRun.tenant_id == tenant_id)
.order_by(desc(OrchestrationRun.started_at))
.limit(1)
)
result = await self.db.execute(stmt)
return result.scalar_one_or_none()
async def generate_run_number(self) -> str:
"""
        Generate a unique run number for today.
        Uses a MAX(run_number) + 1 approach rather than COUNT(*), which is more prone
        to races between reading the count and inserting the new record; the unique
        index on run_number catches any remaining collisions.
"""
today = date.today()
date_str = today.strftime("%Y%m%d")
        # Get the highest run number for today
        # MAX over the run_number suffix is used instead of COUNT, which is more race-prone
stmt = select(func.max(OrchestrationRun.run_number)).where(
OrchestrationRun.run_number.like(f"ORCH-{date_str}-%")
)
result = await self.db.execute(stmt)
max_run_number = result.scalar()
if max_run_number:
# Extract the numeric suffix and increment it
try:
suffix = int(max_run_number.split('-')[-1])
next_number = suffix + 1
except (ValueError, IndexError):
# Fallback to 1 if parsing fails
next_number = 1
else:
# No runs for today yet
next_number = 1
return f"ORCH-{date_str}-{next_number:04d}"
async def get_failed_runs(self, limit: int = 10) -> List[OrchestrationRun]:
"""Get recent failed orchestration runs"""
stmt = (
select(OrchestrationRun)
.where(OrchestrationRun.status == OrchestrationStatus.failed)
.order_by(desc(OrchestrationRun.started_at))
.limit(limit)
)
result = await self.db.execute(stmt)
return result.scalars().all()
async def get_run_statistics(
self,
start_date: Optional[date] = None,
end_date: Optional[date] = None
) -> Dict[str, Any]:
"""Get orchestration run statistics"""
conditions = []
if start_date:
conditions.append(func.date(OrchestrationRun.started_at) >= start_date)
if end_date:
conditions.append(func.date(OrchestrationRun.started_at) <= end_date)
where_clause = and_(*conditions) if conditions else True
# Total runs
total_stmt = select(func.count(OrchestrationRun.id)).where(where_clause)
total_result = await self.db.execute(total_stmt)
total_runs = total_result.scalar() or 0
# Successful runs
success_stmt = select(func.count(OrchestrationRun.id)).where(
and_(
where_clause,
OrchestrationRun.status == OrchestrationStatus.completed
)
)
success_result = await self.db.execute(success_stmt)
successful_runs = success_result.scalar() or 0
# Failed runs
failed_stmt = select(func.count(OrchestrationRun.id)).where(
and_(
where_clause,
OrchestrationRun.status == OrchestrationStatus.failed
)
)
failed_result = await self.db.execute(failed_stmt)
failed_runs = failed_result.scalar() or 0
# Average duration
avg_duration_stmt = select(func.avg(OrchestrationRun.duration_seconds)).where(
and_(
where_clause,
OrchestrationRun.status == OrchestrationStatus.completed
)
)
avg_duration_result = await self.db.execute(avg_duration_stmt)
avg_duration = avg_duration_result.scalar() or 0
return {
'total_runs': total_runs,
'successful_runs': successful_runs,
'failed_runs': failed_runs,
'success_rate': (successful_runs / total_runs * 100) if total_runs > 0 else 0,
'average_duration_seconds': float(avg_duration) if avg_duration else 0
}
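# --- Usage sketch (illustrative; assumes an open AsyncSession and a known tenant_id) ---
# repo = OrchestrationRunRepository(session)
# run_number = await repo.generate_run_number()            # e.g. "ORCH-20260121-0001"
# run = await repo.create_run({
#     'run_number': run_number,
#     'tenant_id': tenant_id,
#     'status': OrchestrationStatus.running,
# })
# await repo.update_run(run.id, {'status': OrchestrationStatus.completed})
# stats = await repo.get_run_statistics()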

View File

@@ -0,0 +1,162 @@
"""
Orchestration Notification Service - Simplified
Emits minimal events using EventPublisher.
All enrichment handled by alert_processor.
"""
from datetime import datetime, timezone
from typing import Optional, Dict, Any
from uuid import UUID
import structlog
from shared.messaging import UnifiedEventPublisher
logger = structlog.get_logger()
class OrchestrationNotificationService:
"""
Service for emitting orchestration notifications using EventPublisher.
"""
def __init__(self, event_publisher: UnifiedEventPublisher):
self.publisher = event_publisher
async def emit_orchestration_run_started_notification(
self,
tenant_id: UUID,
run_id: str,
run_type: str, # 'scheduled', 'manual', 'triggered'
scope: str, # 'full', 'inventory_only', 'production_only'
) -> None:
"""
Emit notification when an orchestration run starts.
"""
metadata = {
"run_id": run_id,
"run_type": run_type,
"scope": scope,
"started_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.orchestration_run_started",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"orchestration_run_started_notification_emitted",
tenant_id=str(tenant_id),
run_id=run_id
)
async def emit_orchestration_run_completed_notification(
self,
tenant_id: UUID,
run_id: str,
duration_seconds: float,
actions_created: int,
actions_by_type: Dict[str, int], # e.g., {'purchase_order': 2, 'production_batch': 3}
status: str = "success",
) -> None:
"""
Emit notification when an orchestration run completes.
"""
# Build message with action summary
if actions_created == 0:
action_summary = "No actions needed"
else:
action_summary = ", ".join([f"{count} {action_type}" for action_type, count in actions_by_type.items()])
metadata = {
"run_id": run_id,
"status": status,
"duration_seconds": float(duration_seconds),
"actions_created": actions_created,
"actions_by_type": actions_by_type,
"action_summary": action_summary,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.orchestration_run_completed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"orchestration_run_completed_notification_emitted",
tenant_id=str(tenant_id),
run_id=run_id,
actions_created=actions_created
)
async def emit_action_created_notification(
self,
tenant_id: UUID,
run_id: str,
action_id: str,
action_type: str, # 'purchase_order', 'production_batch', 'inventory_adjustment'
action_details: Dict[str, Any], # Type-specific details
reason: str,
estimated_impact: Optional[Dict[str, Any]] = None,
) -> None:
"""
Emit notification when the orchestrator creates an action.
"""
metadata = {
"run_id": run_id,
"action_id": action_id,
"action_type": action_type,
"action_details": action_details,
"reason": reason,
"estimated_impact": estimated_impact,
"created_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.action_created",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"action_created_notification_emitted",
tenant_id=str(tenant_id),
action_id=action_id,
action_type=action_type
)
async def emit_action_completed_notification(
self,
tenant_id: UUID,
action_id: str,
action_type: str,
action_status: str, # 'approved', 'completed', 'rejected', 'cancelled'
completed_by: Optional[str] = None,
) -> None:
"""
Emit notification when an orchestrator action is completed/resolved.
"""
metadata = {
"action_id": action_id,
"action_type": action_type,
"action_status": action_status,
"completed_by": completed_by,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.action_completed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"action_completed_notification_emitted",
tenant_id=str(tenant_id),
action_id=action_id,
action_status=action_status
)
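# --- Usage sketch (illustrative; assumes a configured UnifiedEventPublisher and a
# --- known tenant_id / run_id) ---
# notifier = OrchestrationNotificationService(event_publisher)
# await notifier.emit_orchestration_run_started_notification(
#     tenant_id=tenant_id, run_id=str(run_id), run_type="scheduled", scope="full"
# )
# await notifier.emit_orchestration_run_completed_notification(
#     tenant_id=tenant_id, run_id=str(run_id), duration_seconds=42.0,
#     actions_created=3, actions_by_type={"purchase_order": 2, "production_batch": 1}
# )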

File diff suppressed because it is too large

View File

@@ -0,0 +1,728 @@
"""
Orchestrator Scheduler Service - REFACTORED
Coordinates daily auto-generation workflow: Forecasting → Production → Procurement
CHANGES FROM ORIGINAL:
- Updated to use new EventPublisher pattern for all messaging
- Integrated OrchestrationSaga for error handling and compensation
- Added circuit breakers for all service calls
- Implemented real Forecasting Service integration
- Implemented real Production Service integration
- Implemented real Tenant Service integration
- Implemented real Notification Service integration
- NO backwards compatibility, NO feature flags - complete rewrite
"""
import asyncio
import uuid
from datetime import datetime, date, timezone
from decimal import Decimal
from typing import List, Dict, Any, Optional
import structlog
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
# Updated imports - removed old alert system
from shared.messaging import UnifiedEventPublisher
from shared.clients.forecast_client import ForecastServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.clients.notification_client import NotificationServiceClient
from shared.clients.tenant_client import TenantServiceClient
from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.suppliers_client import SuppliersServiceClient
from shared.clients.recipes_client import RecipesServiceClient
from shared.clients.training_client import TrainingServiceClient
from shared.utils.circuit_breaker import CircuitBreaker, CircuitBreakerOpenError
from app.core.config import settings
from app.repositories.orchestration_run_repository import OrchestrationRunRepository
from app.models.orchestration_run import OrchestrationStatus
from app.services.orchestration_saga import OrchestrationSaga
logger = structlog.get_logger()
class OrchestratorSchedulerService:
"""
Orchestrator Service using EventPublisher for messaging
Handles automated daily orchestration of forecasting, production, and procurement
"""
def __init__(self, event_publisher: UnifiedEventPublisher, config):
self.publisher = event_publisher
self.config = config
# APScheduler instance for running daily orchestration
self.scheduler = None
# Service clients
self.forecast_client = ForecastServiceClient(config, "orchestrator-service")
self.production_client = ProductionServiceClient(config, "orchestrator-service")
self.procurement_client = ProcurementServiceClient(config, "orchestrator-service")
self.notification_client = NotificationServiceClient(config, "orchestrator-service")
self.tenant_client = TenantServiceClient(config)
self.training_client = TrainingServiceClient(config, "orchestrator-service")
# Clients for centralized data fetching
self.inventory_client = InventoryServiceClient(config, "orchestrator-service")
self.suppliers_client = SuppliersServiceClient(config, "orchestrator-service")
self.recipes_client = RecipesServiceClient(config, "orchestrator-service")
# Circuit breakers for each service
self.forecast_breaker = CircuitBreaker(
failure_threshold=5,
timeout_duration=60,
success_threshold=2
)
self.production_breaker = CircuitBreaker(
failure_threshold=5,
timeout_duration=60,
success_threshold=2
)
self.procurement_breaker = CircuitBreaker(
failure_threshold=5,
timeout_duration=60,
success_threshold=2
)
self.tenant_breaker = CircuitBreaker(
failure_threshold=3,
timeout_duration=30,
success_threshold=2
)
self.inventory_breaker = CircuitBreaker(
failure_threshold=5,
timeout_duration=60,
success_threshold=2
)
self.suppliers_breaker = CircuitBreaker(
failure_threshold=5,
timeout_duration=60,
success_threshold=2
)
self.recipes_breaker = CircuitBreaker(
failure_threshold=5,
timeout_duration=60,
success_threshold=2
)
async def emit_orchestration_run_started(
self,
tenant_id: uuid.UUID,
run_id: str,
run_type: str, # 'scheduled', 'manual', 'triggered'
scope: str, # 'full', 'inventory_only', 'production_only'
):
"""
Emit notification when an orchestration run starts.
"""
metadata = {
"run_id": run_id,
"run_type": run_type,
"scope": scope,
"started_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.orchestration_run_started",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"orchestration_run_started_notification_emitted",
tenant_id=str(tenant_id),
run_id=run_id
)
async def emit_orchestration_run_completed(
self,
tenant_id: uuid.UUID,
run_id: str,
duration_seconds: float,
actions_created: int,
actions_by_type: Dict[str, int], # e.g., {'purchase_order': 2, 'production_batch': 3}
status: str = "success",
):
"""
Emit notification when an orchestration run completes.
"""
# Build message with action summary
if actions_created == 0:
action_summary = "No actions needed"
else:
action_summary = ", ".join([f"{count} {action_type}" for action_type, count in actions_by_type.items()])
metadata = {
"run_id": run_id,
"status": status,
"duration_seconds": float(duration_seconds),
"actions_created": actions_created,
"actions_by_type": actions_by_type,
"action_summary": action_summary,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.orchestration_run_completed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"orchestration_run_completed_notification_emitted",
tenant_id=str(tenant_id),
run_id=run_id,
actions_created=actions_created
)
async def emit_action_created_notification(
self,
tenant_id: uuid.UUID,
run_id: str,
action_id: str,
action_type: str, # 'purchase_order', 'production_batch', 'inventory_adjustment'
action_details: Dict[str, Any], # Type-specific details
reason: str,
estimated_impact: Optional[Dict[str, Any]] = None,
):
"""
Emit notification when the orchestrator creates an action.
"""
metadata = {
"run_id": run_id,
"action_id": action_id,
"action_type": action_type,
"action_details": action_details,
"reason": reason,
"estimated_impact": estimated_impact,
"created_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.action_created",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"action_created_notification_emitted",
tenant_id=str(tenant_id),
action_id=action_id,
action_type=action_type
)
async def emit_action_completed_notification(
self,
tenant_id: uuid.UUID,
action_id: str,
action_type: str,
action_status: str, # 'approved', 'completed', 'rejected', 'cancelled'
completed_by: Optional[str] = None,
):
"""
Emit notification when an orchestrator action is completed/resolved.
"""
metadata = {
"action_id": action_id,
"action_type": action_type,
"action_status": action_status,
"completed_by": completed_by,
"completed_at": datetime.now(timezone.utc).isoformat(),
}
await self.publisher.publish_notification(
event_type="operations.action_completed",
tenant_id=tenant_id,
data=metadata
)
logger.info(
"action_completed_notification_emitted",
tenant_id=str(tenant_id),
action_id=action_id,
action_status=action_status
)
async def run_daily_orchestration(self):
"""
Main orchestration workflow - runs daily
Executes for all active tenants in parallel (with limits)
"""
if not settings.ORCHESTRATION_ENABLED:
logger.info("Orchestration disabled via config")
return
logger.info("Starting daily orchestration workflow")
try:
# Get all active tenants
active_tenants = await self._get_active_tenants()
if not active_tenants:
logger.warning("No active tenants found for orchestration")
return
logger.info("Processing tenants",
total_tenants=len(active_tenants))
# Process tenants with concurrency limit
semaphore = asyncio.Semaphore(settings.MAX_CONCURRENT_TENANTS)
async def process_with_semaphore(tenant_id):
async with semaphore:
return await self._orchestrate_tenant(tenant_id)
# Process all tenants in parallel (but limited by semaphore)
tasks = [process_with_semaphore(tenant_id) for tenant_id in active_tenants]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Log summary
successful = sum(1 for r in results if r and not isinstance(r, Exception))
failed = len(results) - successful
logger.info("Daily orchestration completed",
total_tenants=len(active_tenants),
successful=successful,
failed=failed)
except Exception as e:
logger.error("Error in daily orchestration",
error=str(e), exc_info=True)
async def _orchestrate_tenant(self, tenant_id: uuid.UUID) -> bool:
"""
Orchestrate workflow for a single tenant using Saga pattern
Returns True if successful, False otherwise
"""
logger.info("Starting orchestration for tenant", tenant_id=str(tenant_id))
# Create orchestration run record
async with self.config.database_manager.get_session() as session:
repo = OrchestrationRunRepository(session)
run_number = await repo.generate_run_number()
run = await repo.create_run({
'run_number': run_number,
'tenant_id': tenant_id,
'status': OrchestrationStatus.running,
'run_type': 'scheduled',
'started_at': datetime.now(timezone.utc),
'triggered_by': 'scheduler'
})
await session.commit()
run_id = run.id
try:
# Emit orchestration started event
await self.emit_orchestration_run_started(
tenant_id=tenant_id,
run_id=str(run_id),
run_type='scheduled',
scope='full'
)
# Set timeout for entire tenant orchestration
async with asyncio.timeout(settings.TENANT_TIMEOUT_SECONDS):
# Execute orchestration using Saga pattern
# AI enhancement is enabled via ORCHESTRATION_USE_AI_INSIGHTS config
saga = OrchestrationSaga(
forecast_client=self.forecast_client,
production_client=self.production_client,
procurement_client=self.procurement_client,
notification_client=self.notification_client,
inventory_client=self.inventory_client,
suppliers_client=self.suppliers_client,
recipes_client=self.recipes_client,
training_client=self.training_client,
use_ai_enhancement=settings.ORCHESTRATION_USE_AI_INSIGHTS,
ai_insights_base_url=settings.AI_INSIGHTS_SERVICE_URL,
ai_insights_min_confidence=settings.AI_INSIGHTS_MIN_CONFIDENCE,
# Pass circuit breakers to saga for fault tolerance
forecast_breaker=self.forecast_breaker,
production_breaker=self.production_breaker,
procurement_breaker=self.procurement_breaker,
inventory_breaker=self.inventory_breaker,
suppliers_breaker=self.suppliers_breaker,
recipes_breaker=self.recipes_breaker
)
result = await saga.execute_orchestration(
tenant_id=str(tenant_id),
orchestration_run_id=str(run_id)
)
if result['success']:
# Update orchestration run with saga results
await self._complete_orchestration_run_with_saga(
run_id,
result
)
# Emit orchestration completed event
await self.emit_orchestration_run_completed(
tenant_id=tenant_id,
run_id=str(run_id),
duration_seconds=result.get('duration_seconds', 0),
actions_created=result.get('total_actions', 0),
actions_by_type=result.get('actions_by_type', {}),
status='success'
)
logger.info("Tenant orchestration completed successfully",
tenant_id=str(tenant_id), run_id=str(run_id))
return True
else:
# Saga failed (with compensation)
await self._mark_orchestration_failed(
run_id,
result.get('error', 'Saga execution failed')
)
# Emit orchestration failed event
await self.emit_orchestration_run_completed(
tenant_id=tenant_id,
run_id=str(run_id),
duration_seconds=result.get('duration_seconds', 0),
actions_created=0,
actions_by_type={},
status='failed'
)
return False
except asyncio.TimeoutError:
logger.error("Tenant orchestration timeout",
tenant_id=str(tenant_id),
timeout_seconds=settings.TENANT_TIMEOUT_SECONDS)
await self._mark_orchestration_failed(run_id, "Timeout exceeded")
return False
except Exception as e:
logger.error("Tenant orchestration failed",
tenant_id=str(tenant_id),
error=str(e), exc_info=True)
await self._mark_orchestration_failed(run_id, str(e))
return False
async def _get_active_tenants(self) -> List[uuid.UUID]:
"""
Get list of active tenants for orchestration
REAL IMPLEMENTATION (no stubs)
"""
try:
logger.info("Fetching active tenants from Tenant Service")
# Call Tenant Service with circuit breaker
tenants_data = await self.tenant_breaker.call(
self.tenant_client.get_active_tenants
)
if not tenants_data:
logger.warning("Tenant Service returned no active tenants")
return []
# Extract tenant IDs
tenant_ids = []
for tenant in tenants_data:
tenant_id = tenant.get('id') or tenant.get('tenant_id')
if tenant_id:
# Convert string to UUID if needed
if isinstance(tenant_id, str):
tenant_id = uuid.UUID(tenant_id)
tenant_ids.append(tenant_id)
logger.info(f"Found {len(tenant_ids)} active tenants for orchestration")
return tenant_ids
except CircuitBreakerOpenError:
logger.error("Circuit breaker open for Tenant Service, skipping orchestration")
return []
except Exception as e:
logger.error("Error getting active tenants", error=str(e), exc_info=True)
return []
async def _complete_orchestration_run_with_saga(
self,
run_id: uuid.UUID,
saga_result: Dict[str, Any]
):
"""
Complete orchestration run with saga results
Args:
run_id: Orchestration run ID
saga_result: Result from saga execution
"""
async with self.config.database_manager.get_session() as session:
repo = OrchestrationRunRepository(session)
run = await repo.get_run_by_id(run_id)
if run:
started_at = run.started_at
completed_at = datetime.now(timezone.utc)
duration = (completed_at - started_at).total_seconds()
# Extract results from saga
forecast_id = saga_result.get('forecast_id')
production_schedule_id = saga_result.get('production_schedule_id')
procurement_plan_id = saga_result.get('procurement_plan_id')
notifications_sent = saga_result.get('notifications_sent', 0)
# Get saga summary
saga_summary = saga_result.get('saga_summary', {})
total_steps = saga_summary.get('total_steps', 0)
completed_steps = saga_summary.get('completed_steps', 0)
# Extract actual counts from saga result (no placeholders)
forecast_data = saga_result.get('forecast_data', {})
production_data = saga_result.get('production_data', {})
procurement_data = saga_result.get('procurement_data', {})
forecasts_generated = forecast_data.get('forecasts_created', 0)
production_batches_created = production_data.get('batches_created', 0)
purchase_orders_created = procurement_data.get('pos_created', 0)
# Extract AI insights tracking
ai_insights_generated = saga_result.get('ai_insights_generated', 0)
ai_insights_posted = saga_result.get('ai_insights_posted', 0)
ai_insights_errors = saga_result.get('ai_insights_errors', [])
# Generate reasoning metadata for the orchestrator context
reasoning_metadata = self._generate_reasoning_metadata(
forecast_data,
production_data,
procurement_data,
ai_insights_generated,
ai_insights_posted
)
await repo.update_run(run_id, {
'status': OrchestrationStatus.completed,
'completed_at': completed_at,
'duration_seconds': int(duration),
'forecast_id': forecast_id,
'forecasting_status': 'success',
'forecasting_completed_at': completed_at,
'forecasts_generated': forecasts_generated,
'production_schedule_id': production_schedule_id,
'production_status': 'success',
'production_completed_at': completed_at,
'production_batches_created': production_batches_created,
'procurement_plan_id': procurement_plan_id,
'procurement_status': 'success',
'procurement_completed_at': completed_at,
'procurement_plans_created': 1, # Always 1 plan per orchestration
'purchase_orders_created': purchase_orders_created,
'notification_status': 'success',
'notification_completed_at': completed_at,
'notifications_sent': notifications_sent,
'ai_insights_status': 'success' if not ai_insights_errors else 'partial',
'ai_insights_generated': ai_insights_generated,
'ai_insights_posted': ai_insights_posted,
'ai_insights_completed_at': completed_at,
'saga_steps_total': total_steps,
'saga_steps_completed': completed_steps,
'run_metadata': reasoning_metadata
})
await session.commit()
def _generate_reasoning_metadata(
self,
forecast_data: Dict[str, Any],
production_data: Dict[str, Any],
procurement_data: Dict[str, Any],
ai_insights_generated: int,
ai_insights_posted: int
) -> Dict[str, Any]:
"""
Generate reasoning metadata for orchestration run that will be used by alert processor.
This creates structured reasoning data that the alert processor can use to provide
context when showing AI reasoning to users.
"""
reasoning_metadata = {
'reasoning': {
'type': 'daily_orchestration_summary',
'timestamp': datetime.now(timezone.utc).isoformat(),
'summary': 'Daily orchestration run completed successfully',
'details': {}
},
'purchase_orders': [],
'production_batches': [],
'ai_insights': {
'generated': ai_insights_generated,
'posted': ai_insights_posted
}
}
# Add forecast reasoning
if forecast_data:
reasoning_metadata['reasoning']['details']['forecasting'] = {
'forecasts_created': forecast_data.get('forecasts_created', 0),
'method': 'automated_daily_forecast',
'reasoning': 'Generated forecasts based on historical patterns and seasonal trends'
}
# Add production reasoning
if production_data:
reasoning_metadata['reasoning']['details']['production'] = {
'batches_created': production_data.get('batches_created', 0),
'method': 'demand_based_scheduling',
'reasoning': 'Scheduled production batches based on forecasted demand and inventory levels'
}
# Add procurement reasoning
if procurement_data:
reasoning_metadata['reasoning']['details']['procurement'] = {
'requirements_created': procurement_data.get('requirements_created', 0),
'pos_created': procurement_data.get('pos_created', 0),
'method': 'automated_procurement',
'reasoning': 'Generated procurement plan based on production needs and inventory optimization'
}
# Add purchase order details with reasoning
if procurement_data and procurement_data.get('purchase_orders'):
for po in procurement_data['purchase_orders']:
po_reasoning = {
'id': po.get('id'),
'status': po.get('status', 'created'),
'delivery_date': po.get('delivery_date'),
'reasoning': {
'type': 'inventory_optimization',
'parameters': {
'trigger': 'low_stock_prediction',
'min_depletion_days': po.get('min_depletion_days', 3),
'quantity': po.get('quantity'),
'unit': po.get('unit'),
'supplier': po.get('supplier_name'),
'financial_impact_eur': po.get('estimated_savings_eur', 0)
}
}
}
reasoning_metadata['purchase_orders'].append(po_reasoning)
# Add production batch details with reasoning
if production_data and production_data.get('production_batches'):
for batch in production_data['production_batches']:
batch_reasoning = {
'id': batch.get('id'),
'status': batch.get('status', 'scheduled'),
'scheduled_date': batch.get('scheduled_date'),
'reasoning': {
'type': 'demand_forecasting',
'parameters': {
'trigger': 'forecasted_demand',
'forecasted_quantity': batch.get('forecasted_quantity'),
'product_name': batch.get('product_name'),
'financial_impact_eur': batch.get('estimated_revenue_eur', 0)
}
}
}
reasoning_metadata['production_batches'].append(batch_reasoning)
return reasoning_metadata
async def _mark_orchestration_failed(self, run_id: uuid.UUID, error_message: str):
"""Mark orchestration run as failed"""
async with self.config.database_manager.get_session() as session:
repo = OrchestrationRunRepository(session)
run = await repo.get_run_by_id(run_id)
if run:
started_at = run.started_at
completed_at = datetime.now(timezone.utc)
duration = (completed_at - started_at).total_seconds()
await repo.update_run(run_id, {
'status': OrchestrationStatus.failed,
'completed_at': completed_at,
'duration_seconds': int(duration),
'error_message': error_message
})
await session.commit()
# Manual trigger for testing
async def trigger_orchestration_for_tenant(
self,
tenant_id: uuid.UUID,
test_scenario: Optional[str] = None
) -> Dict[str, Any]:
"""
Manually trigger orchestration for a tenant (for testing)
Args:
tenant_id: Tenant ID to orchestrate
test_scenario: Optional test scenario (full, production_only, procurement_only)
Returns:
Dict with orchestration results
"""
logger.info("Manual orchestration trigger",
tenant_id=str(tenant_id),
test_scenario=test_scenario)
success = await self._orchestrate_tenant(tenant_id)
return {
'success': success,
'tenant_id': str(tenant_id),
'test_scenario': test_scenario,
'message': 'Orchestration completed' if success else 'Orchestration failed'
}
async def start(self):
"""Start the orchestrator scheduler service"""
if not settings.ORCHESTRATION_ENABLED:
logger.info("Orchestration disabled via config")
return
# Initialize APScheduler
self.scheduler = AsyncIOScheduler()
# Add daily orchestration job
self.scheduler.add_job(
self.run_daily_orchestration,
trigger=CronTrigger(
hour=settings.ORCHESTRATION_HOUR,
minute=settings.ORCHESTRATION_MINUTE
),
id='daily_orchestration',
name='Daily Orchestration Workflow',
replace_existing=True,
max_instances=1,
coalesce=True
)
# Start the scheduler
self.scheduler.start()
# Log next run time
next_run = self.scheduler.get_job('daily_orchestration').next_run_time
logger.info(
"OrchestratorSchedulerService started with daily job",
orchestration_hour=settings.ORCHESTRATION_HOUR,
orchestration_minute=settings.ORCHESTRATION_MINUTE,
next_run=next_run.isoformat() if next_run else None
)
async def stop(self):
"""Stop the orchestrator scheduler service"""
if self.scheduler and self.scheduler.running:
self.scheduler.shutdown(wait=True)
logger.info("OrchestratorSchedulerService stopped")
else:
logger.info("OrchestratorSchedulerService already stopped")
def get_circuit_breaker_stats(self) -> Dict[str, Any]:
"""Get circuit breaker statistics for monitoring"""
return {
'forecast_service': self.forecast_breaker.get_stats(),
'production_service': self.production_breaker.get_stats(),
'procurement_service': self.procurement_breaker.get_stats(),
'tenant_service': self.tenant_breaker.get_stats(),
'inventory_service': self.inventory_breaker.get_stats(),
'suppliers_service': self.suppliers_breaker.get_stats(),
'recipes_service': self.recipes_breaker.get_stats()
}
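# --- Usage sketch (illustrative; assumes an app config exposing database_manager and a
# --- configured UnifiedEventPublisher) ---
# scheduler_service = OrchestratorSchedulerService(event_publisher, config)
# await scheduler_service.start()                  # registers the daily cron job
# result = await scheduler_service.trigger_orchestration_for_tenant(tenant_id)
# await scheduler_service.stop()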

View File

@@ -0,0 +1,265 @@
# services/orchestrator/app/utils/cache.py
"""
Redis caching utilities for dashboard endpoints
"""
import json
import redis.asyncio as redis
from typing import Optional, Any, Callable
from functools import wraps
import structlog
from app.core.config import settings
from pydantic import BaseModel
logger = structlog.get_logger()
# Redis client instance
_redis_client: Optional[redis.Redis] = None
async def get_redis_client() -> redis.Redis:
"""Get or create Redis client"""
global _redis_client
if _redis_client is None:
try:
# Check if TLS is enabled - convert string to boolean properly
redis_tls_str = str(getattr(settings, 'REDIS_TLS_ENABLED', 'false')).lower()
redis_tls_enabled = redis_tls_str in ('true', '1', 'yes', 'on')
connection_kwargs = {
'host': str(getattr(settings, 'REDIS_HOST', 'localhost')),
'port': int(getattr(settings, 'REDIS_PORT', 6379)),
'db': int(getattr(settings, 'REDIS_DB', 0)),
'decode_responses': True,
'socket_connect_timeout': 5,
'socket_timeout': 5
}
# Add password if configured
redis_password = getattr(settings, 'REDIS_PASSWORD', None)
if redis_password:
connection_kwargs['password'] = redis_password
# Add SSL/TLS support if enabled
if redis_tls_enabled:
import ssl
connection_kwargs['ssl'] = True
connection_kwargs['ssl_cert_reqs'] = ssl.CERT_NONE
logger.debug(f"Redis TLS enabled - connecting with SSL to {connection_kwargs['host']}:{connection_kwargs['port']}")
_redis_client = redis.Redis(**connection_kwargs)
# Test connection
await _redis_client.ping()
logger.info(f"Redis client connected successfully (TLS: {redis_tls_enabled})")
except Exception as e:
logger.warning(f"Failed to connect to Redis: {e}. Caching will be disabled.")
_redis_client = None
return _redis_client
async def close_redis():
"""Close Redis connection"""
global _redis_client
if _redis_client:
await _redis_client.close()
_redis_client = None
logger.info("Redis connection closed")
async def get_cached(key: str) -> Optional[Any]:
"""
Get cached value by key
Args:
key: Cache key
Returns:
        Cached value (deserialized from JSON), or None if the key is missing or an error occurs
"""
try:
client = await get_redis_client()
if not client:
return None
cached = await client.get(key)
if cached:
logger.debug(f"Cache hit: {key}")
return json.loads(cached)
else:
logger.debug(f"Cache miss: {key}")
return None
except Exception as e:
logger.warning(f"Cache get error for key {key}: {e}")
return None
def _serialize_value(value: Any) -> Any:
"""
Recursively serialize values for JSON storage, handling Pydantic models properly.
Args:
value: Value to serialize
Returns:
JSON-serializable value
"""
if isinstance(value, BaseModel):
# Convert Pydantic model to dictionary
return value.model_dump()
elif isinstance(value, (list, tuple)):
# Recursively serialize list/tuple elements
return [_serialize_value(item) for item in value]
elif isinstance(value, dict):
# Recursively serialize dictionary values
return {key: _serialize_value(val) for key, val in value.items()}
else:
# For other types, use default serialization
return value
async def set_cached(key: str, value: Any, ttl: int = 60) -> bool:
"""
Set cached value with TTL
Args:
key: Cache key
value: Value to cache (will be JSON serialized)
ttl: Time to live in seconds
Returns:
True if successful, False otherwise
"""
try:
client = await get_redis_client()
if not client:
return False
# Serialize value properly before JSON encoding
serialized_value = _serialize_value(value)
serialized = json.dumps(serialized_value)
await client.setex(key, ttl, serialized)
logger.debug(f"Cache set: {key} (TTL: {ttl}s)")
return True
except Exception as e:
logger.warning(f"Cache set error for key {key}: {e}")
return False
async def delete_cached(key: str) -> bool:
"""
Delete cached value
Args:
key: Cache key
Returns:
True if successful, False otherwise
"""
try:
client = await get_redis_client()
if not client:
return False
await client.delete(key)
logger.debug(f"Cache deleted: {key}")
return True
except Exception as e:
logger.warning(f"Cache delete error for key {key}: {e}")
return False
async def delete_pattern(pattern: str) -> int:
"""
Delete all keys matching pattern
Args:
pattern: Redis key pattern (e.g., "dashboard:*")
Returns:
Number of keys deleted
"""
try:
client = await get_redis_client()
if not client:
return 0
keys = []
async for key in client.scan_iter(match=pattern):
keys.append(key)
if keys:
deleted = await client.delete(*keys)
logger.info(f"Deleted {deleted} keys matching pattern: {pattern}")
return deleted
return 0
except Exception as e:
logger.warning(f"Cache delete pattern error for {pattern}: {e}")
return 0
def cache_response(key_prefix: str, ttl: int = 60):
"""
Decorator to cache endpoint responses
Args:
key_prefix: Prefix for cache key (will be combined with tenant_id)
ttl: Time to live in seconds
Usage:
@cache_response("dashboard:health", ttl=30)
async def get_health(tenant_id: str):
...
"""
def decorator(func: Callable):
@wraps(func)
async def wrapper(*args, **kwargs):
# Extract tenant_id from kwargs or args
tenant_id = kwargs.get('tenant_id')
if not tenant_id and args:
# Try to find tenant_id in args (assuming it's the first argument)
tenant_id = args[0] if len(args) > 0 else None
if not tenant_id:
# No tenant_id, skip caching
return await func(*args, **kwargs)
# Build cache key
cache_key = f"{key_prefix}:{tenant_id}"
# Try to get from cache
cached_value = await get_cached(cache_key)
if cached_value is not None:
return cached_value
# Execute function
result = await func(*args, **kwargs)
# Cache result
await set_cached(cache_key, result, ttl)
return result
return wrapper
return decorator
def make_cache_key(prefix: str, tenant_id: str, **params) -> str:
"""
Create a cache key with optional parameters
Args:
prefix: Key prefix
tenant_id: Tenant ID
**params: Additional parameters to include in key
Returns:
Cache key string
"""
key_parts = [prefix, tenant_id]
for k, v in sorted(params.items()):
if v is not None:
key_parts.append(f"{k}:{v}")
return ":".join(key_parts)