Initial commit - production deployment

2026-01-21 17:17:16 +01:00
commit c23d00dd92
2289 changed files with 638440 additions and 0 deletions


@@ -0,0 +1,516 @@
"""
Yield Insights Orchestrator
Coordinates yield prediction and insight posting
"""
import pandas as pd
from typing import Dict, List, Any, Optional
import structlog
from datetime import datetime
from uuid import UUID
import sys
import os
# Add shared clients to path
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../..'))
from shared.clients.ai_insights_client import AIInsightsClient
from shared.messaging import UnifiedEventPublisher
from app.ml.yield_predictor import YieldPredictor
logger = structlog.get_logger()
class YieldInsightsOrchestrator:
"""
Orchestrates yield prediction and insight generation workflow.
Workflow:
1. Predict yield for upcoming production run or analyze historical performance
2. Generate insights for yield optimization opportunities
3. Post insights to AI Insights Service
4. Publish recommendation events to RabbitMQ
5. Provide yield predictions for production planning
"""
def __init__(
self,
ai_insights_base_url: str = "http://ai-insights-service:8000",
event_publisher: Optional[UnifiedEventPublisher] = None
):
self.predictor = YieldPredictor()
self.ai_insights_client = AIInsightsClient(ai_insights_base_url)
self.event_publisher = event_publisher
async def predict_and_post_insights(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
production_context: Dict[str, Any],
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Complete workflow: Predict yield and post insights.
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs
production_context: Upcoming production context:
- staff_assigned (list of staff IDs)
- planned_start_time
- batch_size
- planned_quantity
- unit_cost (optional)
- equipment_id (optional)
min_history_runs: Minimum production runs required
Returns:
Workflow results with prediction and posted insights
"""
logger.info(
"Starting yield prediction workflow",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
# Step 1: Predict yield
prediction_results = await self.predictor.predict_yield(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
production_context=production_context,
min_history_runs=min_history_runs
)
logger.info(
"Yield prediction complete",
recipe_id=recipe_id,
predicted_yield=prediction_results.get('predicted_yield'),
insights_generated=len(prediction_results.get('insights', []))
)
# Step 2: Enrich insights with tenant_id and recipe context
enriched_insights = self._enrich_insights(
prediction_results.get('insights', []),
tenant_id,
recipe_id
)
# Step 3: Post insights to AI Insights Service
if enriched_insights:
post_results = await self.ai_insights_client.create_insights_bulk(
tenant_id=UUID(tenant_id),
insights=enriched_insights
)
logger.info(
"Yield insights posted to AI Insights Service",
recipe_id=recipe_id,
total=post_results['total'],
successful=post_results['successful'],
failed=post_results['failed']
)
# Step 4: Publish recommendation events to RabbitMQ
created_insights = post_results.get('created_insights', [])
if created_insights:
recipe_context = production_context.copy() if production_context else {}
recipe_context['recipe_id'] = recipe_id
await self._publish_insight_events(
tenant_id=tenant_id,
insights=created_insights,
recipe_context=recipe_context
)
else:
post_results = {'total': 0, 'successful': 0, 'failed': 0}
logger.info("No insights to post for recipe", recipe_id=recipe_id)
# Step 5: Return comprehensive results
return {
'tenant_id': tenant_id,
'recipe_id': recipe_id,
'predicted_at': prediction_results['predicted_at'],
'history_runs': prediction_results['history_runs'],
'baseline_yield': prediction_results.get('baseline_yield'),
'predicted_yield': prediction_results.get('predicted_yield'),
'prediction_range': prediction_results.get('prediction_range'),
'expected_waste': prediction_results.get('expected_waste'),
'confidence': prediction_results['confidence'],
'factor_analysis': prediction_results.get('factor_analysis'),
'patterns': prediction_results.get('patterns', []),
'insights_generated': len(enriched_insights),
'insights_posted': post_results['successful'],
'insights_failed': post_results['failed'],
'created_insights': post_results.get('created_insights', [])
}
async def analyze_and_post_insights(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Analyze historical yield performance and post insights (no prediction).
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs
min_history_runs: Minimum production runs required
Returns:
Workflow results with analysis and posted insights
"""
logger.info(
"Starting yield analysis workflow",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
# Step 1: Analyze historical yield
analysis_results = await self.predictor.analyze_recipe_yield_history(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
min_history_runs=min_history_runs
)
logger.info(
"Yield analysis complete",
recipe_id=recipe_id,
baseline_yield=analysis_results.get('baseline_stats', {}).get('mean_yield'),
insights_generated=len(analysis_results.get('insights', []))
)
# Step 2: Enrich insights
enriched_insights = self._enrich_insights(
analysis_results.get('insights', []),
tenant_id,
recipe_id
)
# Step 3: Post insights
if enriched_insights:
post_results = await self.ai_insights_client.create_insights_bulk(
tenant_id=UUID(tenant_id),
insights=enriched_insights
)
logger.info(
"Yield analysis insights posted",
recipe_id=recipe_id,
total=post_results['total'],
successful=post_results['successful']
)
# Step 4: Publish recommendation events to RabbitMQ
created_insights = post_results.get('created_insights', [])
if created_insights:
await self._publish_insight_events(
tenant_id=tenant_id,
insights=created_insights,
recipe_context={'recipe_id': recipe_id}
)
else:
post_results = {'total': 0, 'successful': 0, 'failed': 0}
return {
'tenant_id': tenant_id,
'recipe_id': recipe_id,
'analyzed_at': analysis_results['analyzed_at'],
'history_runs': analysis_results['history_runs'],
'baseline_stats': analysis_results.get('baseline_stats'),
'factor_analysis': analysis_results.get('factor_analysis'),
'patterns': analysis_results.get('patterns', []),
'insights_generated': len(enriched_insights),
'insights_posted': post_results['successful'],
'created_insights': post_results.get('created_insights', [])
}
def _enrich_insights(
self,
insights: List[Dict[str, Any]],
tenant_id: str,
recipe_id: str
) -> List[Dict[str, Any]]:
"""
Enrich insights with required fields for AI Insights Service.
Args:
insights: Raw insights from predictor
tenant_id: Tenant identifier
recipe_id: Recipe identifier
Returns:
Enriched insights ready for posting
"""
enriched = []
for insight in insights:
# Add required tenant_id
enriched_insight = insight.copy()
enriched_insight['tenant_id'] = tenant_id
# Add recipe context to metrics
if 'metrics_json' not in enriched_insight:
enriched_insight['metrics_json'] = {}
enriched_insight['metrics_json']['recipe_id'] = recipe_id
# Add source metadata
enriched_insight['source_service'] = 'production'
enriched_insight['source_model'] = 'yield_predictor'
enriched_insight['detected_at'] = datetime.utcnow().isoformat()
enriched.append(enriched_insight)
return enriched
async def _publish_insight_events(
self,
tenant_id: str,
insights: List[Dict[str, Any]],
recipe_context: Optional[Dict[str, Any]] = None
) -> None:
"""
Publish recommendation events to RabbitMQ for each insight.
Args:
tenant_id: Tenant identifier
insights: List of created insights (with insight_id from AI Insights Service)
recipe_context: Optional recipe context (name, id, etc.)
"""
if not self.event_publisher:
logger.warning("Event publisher not configured, skipping event publication")
return
for insight in insights:
try:
# Determine severity based on confidence and priority
confidence = insight.get('confidence', 0)
priority = insight.get('priority', 'medium')
if priority == 'urgent' or confidence >= 90:
severity = 'urgent'
elif priority == 'high' or confidence >= 70:
severity = 'high'
elif priority == 'medium' or confidence >= 50:
severity = 'medium'
else:
severity = 'low'
# Build event metadata
event_metadata = {
'insight_id': insight.get('id'), # From AI Insights Service response
'insight_type': insight.get('insight_type'),
'recipe_id': insight.get('metrics_json', {}).get('recipe_id'),
'recipe_name': recipe_context.get('recipe_name') if recipe_context else None,
'predicted_yield': insight.get('metrics_json', {}).get('predicted_yield'),
'confidence': confidence,
'recommendation': insight.get('recommendation'),
'impact_type': insight.get('impact_type'),
'impact_value': insight.get('impact_value'),
'source_service': 'production',
'source_model': 'yield_predictor'
}
# Remove None values
event_metadata = {k: v for k, v in event_metadata.items() if v is not None}
# Publish recommendation event
await self.event_publisher.publish_recommendation(
event_type='ai_yield_prediction',
tenant_id=tenant_id,
severity=severity,
data=event_metadata
)
logger.info(
"Published yield insight recommendation event",
tenant_id=tenant_id,
insight_id=insight.get('id'),
insight_type=insight.get('insight_type'),
severity=severity
)
except Exception as e:
logger.error(
"Failed to publish insight event",
tenant_id=tenant_id,
insight_id=insight.get('id'),
error=str(e),
exc_info=True
)
# Don't raise - we don't want to fail the whole workflow if event publishing fails
async def analyze_all_recipes(
self,
tenant_id: str,
recipes_data: Dict[str, pd.DataFrame],
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Analyze yield performance for all recipes for a tenant.
Args:
tenant_id: Tenant identifier
recipes_data: Dict of {recipe_id: production_history_df}
min_history_runs: Minimum production runs required
Returns:
Comprehensive analysis results
"""
logger.info(
"Analyzing yield for all recipes",
tenant_id=tenant_id,
recipes=len(recipes_data)
)
all_results = []
total_insights_posted = 0
recipes_with_issues = []
# Analyze each recipe
for recipe_id, production_history in recipes_data.items():
try:
results = await self.analyze_and_post_insights(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
min_history_runs=min_history_runs
)
all_results.append(results)
total_insights_posted += results['insights_posted']
# Check for low baseline yield
baseline_stats = results.get('baseline_stats')
if baseline_stats and baseline_stats.get('mean_yield', 100) < 90:
recipes_with_issues.append({
'recipe_id': recipe_id,
'mean_yield': baseline_stats['mean_yield'],
'std_yield': baseline_stats['std_yield']
})
except Exception as e:
logger.error(
"Error analyzing recipe",
recipe_id=recipe_id,
error=str(e)
)
# Generate portfolio summary insight if there are yield issues
if len(recipes_with_issues) > 0:
summary_insight = self._generate_portfolio_summary_insight(
tenant_id, recipes_with_issues, all_results
)
if summary_insight:
enriched_summary = self._enrich_insights(
[summary_insight], tenant_id, 'all_recipes'
)
post_results = await self.ai_insights_client.create_insights_bulk(
tenant_id=UUID(tenant_id),
insights=enriched_summary
)
total_insights_posted += post_results['successful']
logger.info(
"All recipes yield analysis complete",
tenant_id=tenant_id,
recipes_analyzed=len(all_results),
total_insights_posted=total_insights_posted,
recipes_with_issues=len(recipes_with_issues)
)
return {
'tenant_id': tenant_id,
'analyzed_at': datetime.utcnow().isoformat(),
'recipes_analyzed': len(all_results),
'recipe_results': all_results,
'total_insights_posted': total_insights_posted,
'recipes_with_issues': recipes_with_issues
}
def _generate_portfolio_summary_insight(
self,
tenant_id: str,
recipes_with_issues: List[Dict[str, Any]],
all_results: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
"""
Generate portfolio-level summary insight.
Args:
tenant_id: Tenant identifier
recipes_with_issues: Recipes with low yield
all_results: All recipe analysis results
Returns:
Summary insight or None
"""
if len(recipes_with_issues) == 0:
return None
# Calculate average yield and potential improvement
total_recipes = len(all_results)
issues_count = len(recipes_with_issues)
avg_low_yield = sum(r['mean_yield'] for r in recipes_with_issues) / issues_count
# Estimate waste reduction potential
# Assuming each recipe produces 1000 units/month, €5/unit cost
monthly_production = 1000 * issues_count
current_waste_pct = 100 - avg_low_yield
target_waste_pct = 5 # Target 95% yield
if current_waste_pct > target_waste_pct:
waste_reduction_units = monthly_production * ((current_waste_pct - target_waste_pct) / 100)
annual_savings = waste_reduction_units * 12 * 5 # €5 per unit
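# Worked example under these assumptions: 4 recipes at 88% average yield
# means waste falls from 12% to the 5% target, i.e. 7% of 4,000 units/month
# = 280 units/month, or 280 * 12 * EUR 5 = EUR 16,800/year in avoided waste.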
return {
'type': 'opportunity',
'priority': 'high' if issues_count > 3 else 'medium',
'category': 'production',
'title': f'Production Yield Optimization: {issues_count} Recipes Below 90%',
'description': f'{issues_count} of {total_recipes} recipes have average yield below 90% (average {avg_low_yield:.1f}%). Improving to 95% target would reduce waste by {waste_reduction_units:.0f} units/month, saving €{annual_savings:.0f}/year.',
'impact_type': 'cost_savings',
'impact_value': annual_savings,
'impact_unit': 'euros_per_year',
'confidence': 75,
'metrics_json': {
'recipes_analyzed': total_recipes,
'recipes_with_issues': issues_count,
'avg_low_yield': round(avg_low_yield, 2),
'potential_annual_savings': round(annual_savings, 2),
'waste_reduction_units_monthly': round(waste_reduction_units, 2)
},
'actionable': True,
'recommendation_actions': [
{
'label': 'Review Low-Yield Recipes',
'action': 'review_yield_insights',
'params': {'tenant_id': tenant_id}
},
{
'label': 'Implement Yield Improvements',
'action': 'apply_yield_recommendations',
'params': {'tenant_id': tenant_id}
}
],
'source_service': 'production',
'source_model': 'yield_predictor'
}
return None
async def close(self):
"""Close HTTP client connections."""
await self.ai_insights_client.close()
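A minimal driver sketch for the orchestrator above, assuming the class is importable, the AI Insights Service is reachable at the default URL, and the history comes from a hypothetical parquet file; column names follow the predict_and_post_insights docstring, and every identifier and value here is illustrative, not part of the module:

import asyncio
import pandas as pd

async def main():
    orchestrator = YieldInsightsOrchestrator(
        ai_insights_base_url="http://ai-insights-service:8000",
        event_publisher=None,  # events are skipped with a warning when unset
    )
    history = pd.read_parquet("production_history.parquet")  # hypothetical source
    try:
        results = await orchestrator.predict_and_post_insights(
            tenant_id="7f9c0e4a-2b1d-4c3e-9a8f-123456789abc",  # must parse as a UUID
            recipe_id="sourdough-batard",
            production_history=history,
            production_context={
                "staff_assigned": ["staff-042"],
                "planned_start_time": "2026-01-22T06:00:00",
                "batch_size": 120,
                "planned_quantity": 500,
                "unit_cost": 4.5,
            },
        )
        print(results["predicted_yield"], results["insights_posted"])
    finally:
        await orchestrator.close()

if __name__ == "__main__":
    asyncio.run(main())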


@@ -0,0 +1,813 @@
"""
Production Yield Predictor
Predicts actual vs planned yield and identifies waste reduction opportunities
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime, timedelta
import structlog
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
logger = structlog.get_logger()
class YieldPredictor:
"""
Predicts production yield based on historical data and production factors.
Key Features:
- Multi-factor yield prediction (recipe, worker, time-of-day, equipment, batch size)
- Identifies low-yield patterns and root causes
- Waste categorization (spoilage, measurement error, process inefficiency)
- Actionable recommendations for yield improvement
- Statistical validation of learned patterns
Methodology:
1. Feature Engineering: Extract worker skill, time factors, batch size effects
2. Statistical Analysis: Identify significant yield loss factors
3. ML Prediction: Ensemble of Random Forest + Gradient Boosting
4. Pattern Detection: Find recurring low-yield situations
5. Insight Generation: Actionable recommendations with confidence scores
"""
def __init__(self):
self.model_cache = {} # Cache trained models per recipe
self.baseline_yields = {} # Cache baseline yields per recipe
async def predict_yield(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
production_context: Dict[str, Any],
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Predict yield for upcoming production run and generate insights.
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs with columns:
- production_run_id
- recipe_id
- planned_quantity
- actual_quantity
- yield_percentage
- staff_assigned (list of staff IDs)
- started_at
- completed_at
- batch_size
- equipment_id (optional)
- notes (optional)
production_context: Upcoming production context:
- staff_assigned (list of staff IDs)
- planned_start_time
- batch_size
- equipment_id (optional)
min_history_runs: Minimum production runs required for learning
Returns:
Prediction results with yield forecast, confidence, and insights
"""
logger.info(
"Predicting production yield",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
# Validate production history
if len(production_history) < min_history_runs:
return self._insufficient_data_response(
recipe_id, production_context, len(production_history), min_history_runs
)
# Step 1: Calculate baseline statistics
baseline_stats = self._calculate_baseline_statistics(production_history)
# Step 2: Feature engineering
feature_df = self._engineer_features(production_history)
# Step 3: Analyze yield factors
factor_analysis = self._analyze_yield_factors(feature_df)
# Step 4: Train predictive model
model_results = self._train_yield_model(feature_df)
# Step 5: Make prediction for upcoming run
prediction = self._predict_upcoming_run(
production_context, model_results, baseline_stats, feature_df
)
# Step 6: Identify low-yield patterns
patterns = self._identify_yield_patterns(feature_df, factor_analysis)
# Step 7: Generate insights
insights = self._generate_yield_insights(
tenant_id, recipe_id, baseline_stats, factor_analysis,
patterns, prediction, production_context
)
# Step 8: Calculate confidence
confidence = self._calculate_prediction_confidence(
production_history, model_results, factor_analysis
)
return {
'recipe_id': recipe_id,
'predicted_at': datetime.utcnow().isoformat(),
'history_runs': len(production_history),
'baseline_yield': baseline_stats['mean_yield'],
'baseline_std': baseline_stats['std_yield'],
'predicted_yield': prediction['predicted_yield'],
'prediction_range': prediction['prediction_range'],
'expected_waste': prediction['expected_waste'],
'confidence': confidence,
'factor_analysis': factor_analysis,
'patterns': patterns,
'model_performance': model_results['performance'],
'insights': insights
}
def _insufficient_data_response(
self, recipe_id: str, production_context: Dict[str, Any],
current_runs: int, required_runs: int
) -> Dict[str, Any]:
"""Return response when insufficient historical data."""
return {
'recipe_id': recipe_id,
'predicted_at': datetime.utcnow().isoformat(),
'history_runs': current_runs,
'status': 'insufficient_data',
'required_runs': required_runs,
'baseline_yield': None,
'predicted_yield': None,
'confidence': 0,
'insights': [{
'type': 'warning',
'priority': 'low',
'category': 'production',
'title': 'Insufficient Production History for Yield Prediction',
'description': f'Only {current_runs} production runs available. Need at least {required_runs} runs to build reliable yield predictions. Continue tracking production data to enable yield optimization.',
'impact_type': 'data_quality',
'confidence': 100,
'actionable': True,
'recommendation_actions': [{
'label': 'Track Production Data',
'action': 'continue_production_tracking',
'params': {'recipe_id': recipe_id}
}]
}]
}
def _calculate_baseline_statistics(
self, production_history: pd.DataFrame
) -> Dict[str, Any]:
"""Calculate baseline yield statistics."""
yields = production_history['yield_percentage'].values
return {
'mean_yield': float(np.mean(yields)),
'median_yield': float(np.median(yields)),
'std_yield': float(np.std(yields)),
'min_yield': float(np.min(yields)),
'max_yield': float(np.max(yields)),
'cv_yield': float(np.std(yields) / np.mean(yields)), # Coefficient of variation
'percentile_25': float(np.percentile(yields, 25)),
'percentile_75': float(np.percentile(yields, 75)),
'runs_below_90': int(np.sum(yields < 90)),
'runs_above_95': int(np.sum(yields > 95))
}
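# Example: yields [90, 92, 95] give mean 92.33, std 2.05 and cv 0.022;
# note np.std defaults to the population (ddof=0) standard deviation.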
def _engineer_features(self, production_history: pd.DataFrame) -> pd.DataFrame:
"""Engineer features from production history."""
df = production_history.copy()
# Time-based features
df['started_at'] = pd.to_datetime(df['started_at'])
df['hour_of_day'] = df['started_at'].dt.hour
df['day_of_week'] = df['started_at'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_early_morning'] = (df['hour_of_day'] < 6).astype(int)
df['is_late_night'] = (df['hour_of_day'] >= 22).astype(int)
# Duration features
if 'completed_at' in df.columns:
df['completed_at'] = pd.to_datetime(df['completed_at'])
df['duration_hours'] = (df['completed_at'] - df['started_at']).dt.total_seconds() / 3600
df['is_rushed'] = (df['duration_hours'] < df['duration_hours'].quantile(0.25)).astype(int)
# Batch size features
df['batch_size_normalized'] = df['batch_size'] / df['batch_size'].mean()
df['is_large_batch'] = (df['batch_size'] > df['batch_size'].quantile(0.75)).astype(int)
df['is_small_batch'] = (df['batch_size'] < df['batch_size'].quantile(0.25)).astype(int)
# Worker experience features (proxy: number of previous runs)
# Extract first worker from staff_assigned list
df['worker_id'] = df['staff_assigned'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else 'unknown')
df = df.sort_values('started_at')
df['worker_run_count'] = df.groupby('worker_id').cumcount() + 1
df['worker_experience_level'] = pd.cut(
df['worker_run_count'],
bins=[0, 5, 15, np.inf], # open-ended top bin so run counts above 15 are always classified
labels=['novice', 'intermediate', 'expert']
)
# Recent yield trend for worker
df['worker_recent_avg_yield'] = df.groupby('worker_id')['yield_percentage'].transform(
lambda x: x.rolling(window=5, min_periods=1).mean()
)
return df
def _analyze_yield_factors(self, feature_df: pd.DataFrame) -> Dict[str, Any]:
"""Analyze factors affecting yield using statistical tests."""
factors = {}
# Worker impact
# Extract worker_id from staff_assigned for analysis
if 'worker_id' not in feature_df.columns:
feature_df['worker_id'] = feature_df['staff_assigned'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else 'unknown')
worker_yields = feature_df.groupby('worker_id')['yield_percentage'].agg(['mean', 'std', 'count'])
worker_yields = worker_yields[worker_yields['count'] >= 3] # Min 3 runs per worker
if len(worker_yields) > 1:
# ANOVA test: Does worker significantly affect yield?
worker_groups = [
feature_df[feature_df['worker_id'] == worker]['yield_percentage'].values
for worker in worker_yields.index
]
f_stat, p_value = stats.f_oneway(*worker_groups)
factors['worker'] = {
'significant': p_value < 0.05,
'p_value': float(p_value),
'f_statistic': float(f_stat),
'best_worker': worker_yields['mean'].idxmax(),
'best_worker_yield': float(worker_yields['mean'].max()),
'worst_worker': worker_yields['mean'].idxmin(),
'worst_worker_yield': float(worker_yields['mean'].min()),
'yield_range': float(worker_yields['mean'].max() - worker_yields['mean'].min())
}
else:
factors['worker'] = {'significant': False, 'reason': 'insufficient_workers'}
# Time of day impact
time_groups = {
'early_morning': feature_df[feature_df['hour_of_day'] < 6]['yield_percentage'].values,
'morning': feature_df[(feature_df['hour_of_day'] >= 6) & (feature_df['hour_of_day'] < 12)]['yield_percentage'].values,
'afternoon': feature_df[(feature_df['hour_of_day'] >= 12) & (feature_df['hour_of_day'] < 18)]['yield_percentage'].values,
'evening': feature_df[feature_df['hour_of_day'] >= 18]['yield_percentage'].values
}
time_groups = {k: v for k, v in time_groups.items() if len(v) >= 3}
if len(time_groups) > 1:
f_stat, p_value = stats.f_oneway(*time_groups.values())
time_means = {k: np.mean(v) for k, v in time_groups.items()}
factors['time_of_day'] = {
'significant': p_value < 0.05,
'p_value': float(p_value),
'best_time': max(time_means, key=time_means.get),
'best_time_yield': float(max(time_means.values())),
'worst_time': min(time_means, key=time_means.get),
'worst_time_yield': float(min(time_means.values())),
'yield_range': float(max(time_means.values()) - min(time_means.values()))
}
else:
factors['time_of_day'] = {'significant': False, 'reason': 'insufficient_data'}
# Batch size impact (correlation)
if len(feature_df) >= 10:
correlation, p_value = stats.pearsonr(
feature_df['batch_size'],
feature_df['yield_percentage']
)
factors['batch_size'] = {
'significant': abs(correlation) > 0.3 and p_value < 0.05,
'correlation': float(correlation),
'p_value': float(p_value),
'direction': 'positive' if correlation > 0 else 'negative',
'interpretation': self._interpret_batch_size_effect(correlation)
}
else:
factors['batch_size'] = {'significant': False, 'reason': 'insufficient_data'}
# Weekend vs weekday
weekend_yields = feature_df[feature_df['is_weekend'] == 1]['yield_percentage'].values
weekday_yields = feature_df[feature_df['is_weekend'] == 0]['yield_percentage'].values
if len(weekend_yields) >= 3 and len(weekday_yields) >= 3:
t_stat, p_value = stats.ttest_ind(weekend_yields, weekday_yields)
factors['weekend_effect'] = {
'significant': p_value < 0.05,
'p_value': float(p_value),
't_statistic': float(t_stat),
'weekend_yield': float(np.mean(weekend_yields)),
'weekday_yield': float(np.mean(weekday_yields)),
'difference': float(np.mean(weekend_yields) - np.mean(weekday_yields))
}
else:
factors['weekend_effect'] = {'significant': False, 'reason': 'insufficient_weekend_data'}
return factors
def _interpret_batch_size_effect(self, correlation: float) -> str:
"""Interpret batch size correlation."""
if abs(correlation) < 0.3:
return "Batch size has minimal impact on yield"
elif correlation > 0:
return "Larger batches tend to have higher yield (economies of scale)"
else:
return "Larger batches tend to have lower yield (difficulty handling large volumes)"
def _train_yield_model(self, feature_df: pd.DataFrame) -> Dict[str, Any]:
"""Train ML model to predict yield."""
# Prepare features
feature_columns = [
'hour_of_day', 'day_of_week', 'is_weekend',
'batch_size_normalized', 'is_large_batch', 'is_small_batch',
'worker_run_count'
]
if 'duration_hours' in feature_df.columns:
feature_columns.append('duration_hours')
# Encode worker_id (extracted from staff_assigned)
if 'worker_id' not in feature_df.columns:
feature_df['worker_id'] = feature_df['staff_assigned'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else 'unknown')
worker_encoding = {worker: idx for idx, worker in enumerate(feature_df['worker_id'].unique())}
feature_df['worker_encoded'] = feature_df['worker_id'].map(worker_encoding)
feature_columns.append('worker_encoded')
X = feature_df[feature_columns].fillna(0).values
y = feature_df['yield_percentage'].values
# Split into train/test (temporal split)
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train ensemble of models
models = {
'random_forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
'gradient_boosting': GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42),
'linear': LinearRegression()
}
performances = {}
predictions = {}
for name, model in models.items():
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
mae = np.mean(np.abs(y_test - y_pred))
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
r2 = 1 - (np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))
performances[name] = {
'mae': float(mae),
'rmse': float(rmse),
'r2': float(r2)
}
predictions[name] = y_pred
# Select best model based on MAE
best_model_name = min(performances, key=lambda k: performances[k]['mae'])
best_model = models[best_model_name]
# Feature importance (if available)
feature_importance = {}
if hasattr(best_model, 'feature_importances_'):
importances = best_model.feature_importances_
feature_importance = {
feature_columns[i]: float(importances[i])
for i in range(len(feature_columns))
}
feature_importance = dict(sorted(
feature_importance.items(),
key=lambda x: x[1],
reverse=True
))
return {
'best_model': best_model,
'best_model_name': best_model_name,
'scaler': scaler,
'feature_columns': feature_columns,
'worker_encoding': worker_encoding,
'performance': performances[best_model_name],
'all_performances': performances,
'feature_importance': feature_importance
}
def _predict_upcoming_run(
self,
production_context: Dict[str, Any],
model_results: Dict[str, Any],
baseline_stats: Dict[str, Any],
feature_df: pd.DataFrame
) -> Dict[str, Any]:
"""Predict yield for upcoming production run."""
# Extract context
staff_assigned = production_context.get('staff_assigned', [])
worker_id = staff_assigned[0] if isinstance(staff_assigned, list) and len(staff_assigned) > 0 else 'unknown'
planned_start = pd.to_datetime(production_context.get('planned_start_time'))
batch_size = production_context.get('batch_size') or float(feature_df['batch_size'].median())  # fall back to the historical median if not provided
# Get worker experience
if 'worker_id' not in feature_df.columns:
feature_df['worker_id'] = feature_df['staff_assigned'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else 'unknown')
worker_runs = feature_df[feature_df['worker_id'] == worker_id]
worker_run_count = len(worker_runs) if len(worker_runs) > 0 else 1
# Build feature vector
mean_batch_size = feature_df['batch_size'].mean()
batch_size_normalized = batch_size / mean_batch_size
is_large_batch = 1 if batch_size > feature_df['batch_size'].quantile(0.75) else 0
is_small_batch = 1 if batch_size < feature_df['batch_size'].quantile(0.25) else 0
features = {
'hour_of_day': planned_start.hour,
'day_of_week': planned_start.dayofweek,
'is_weekend': 1 if planned_start.dayofweek in [5, 6] else 0,
'batch_size_normalized': batch_size_normalized,
'is_large_batch': is_large_batch,
'is_small_batch': is_small_batch,
'worker_run_count': worker_run_count,
'duration_hours': float(feature_df['duration_hours'].median()) if 'duration_hours' in feature_df.columns else 0, # actual duration unknown before the run; historical median is a neutral stand-in
'worker_encoded': model_results['worker_encoding'].get(worker_id, 0)
}
# Create feature vector in correct order
X = np.array([[features.get(col, 0) for col in model_results['feature_columns']]])
X_scaled = model_results['scaler'].transform(X)
# Predict
predicted_yield = float(model_results['best_model'].predict(X_scaled)[0])
# Prediction range (based on model RMSE)
rmse = model_results['performance']['rmse']
prediction_range = {
'lower': max(0, predicted_yield - 1.96 * rmse),
'upper': min(100, predicted_yield + 1.96 * rmse)
}
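# Example: a 92.0% prediction with test RMSE 2.5 gives roughly [87.1, 96.9];
# +/-1.96 * RMSE approximates a 95% interval, assuming residuals are
# roughly normal.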
# Expected waste
planned_quantity = production_context.get('planned_quantity', 100)
expected_waste_pct = max(0, 100 - predicted_yield)
expected_waste_units = planned_quantity * (expected_waste_pct / 100)
return {
'predicted_yield': round(predicted_yield, 2),
'prediction_range': prediction_range,
'expected_waste_pct': round(expected_waste_pct, 2),
'expected_waste_units': round(expected_waste_units, 2),
'baseline_comparison': round(predicted_yield - baseline_stats['mean_yield'], 2),
'features_used': features
}
def _identify_yield_patterns(
self, feature_df: pd.DataFrame, factor_analysis: Dict[str, Any]
) -> List[Dict[str, Any]]:
"""Identify recurring low-yield patterns."""
patterns = []
# Pattern 1: Specific worker consistently low
if factor_analysis.get('worker', {}).get('significant'):
worst_worker = factor_analysis['worker']['worst_worker']
worst_yield = factor_analysis['worker']['worst_worker_yield']
best_yield = factor_analysis['worker']['best_worker_yield']
if worst_yield < 90 and (best_yield - worst_yield) > 5:
patterns.append({
'pattern': 'low_yield_worker',
'description': f'Worker {worst_worker} consistently produces {worst_yield:.1f}% yield vs best worker {best_yield:.1f}%',
'severity': 'high' if worst_yield < 85 else 'medium',
'affected_runs': int(len(feature_df[feature_df['worker_id'] == worst_worker])),
'yield_impact': round(best_yield - worst_yield, 2),
'recommendation': 'Provide additional training or reassign to different recipes'
})
# Pattern 2: Time-of-day effect
if factor_analysis.get('time_of_day', {}).get('significant'):
worst_time = factor_analysis['time_of_day']['worst_time']
worst_yield = factor_analysis['time_of_day']['worst_time_yield']
if worst_yield < 90:
patterns.append({
'pattern': 'low_yield_time',
'description': f'{worst_time} shifts produce {worst_yield:.1f}% yield',
'severity': 'medium',
'affected_runs': 'varies',
'yield_impact': round(factor_analysis['time_of_day']['yield_range'], 2),
'recommendation': f'Avoid scheduling this recipe during {worst_time}'
})
# Pattern 3: Large batch issues
if factor_analysis.get('batch_size', {}).get('significant'):
if factor_analysis['batch_size']['direction'] == 'negative':
patterns.append({
'pattern': 'large_batch_yield_loss',
'description': 'Larger batches have lower yield - equipment or process capacity issues',
'severity': 'medium',
'correlation': round(factor_analysis['batch_size']['correlation'], 3),
'recommendation': 'Split large batches or upgrade equipment'
})
# Pattern 4: Weekend effect
if factor_analysis.get('weekend_effect', {}).get('significant'):
weekend_yield = factor_analysis['weekend_effect']['weekend_yield']
weekday_yield = factor_analysis['weekend_effect']['weekday_yield']
if abs(weekend_yield - weekday_yield) > 3:
if weekend_yield < weekday_yield:
patterns.append({
'pattern': 'weekend_yield_drop',
'description': f'Weekend production {weekend_yield:.1f}% vs weekday {weekday_yield:.1f}%',
'severity': 'low',
'yield_impact': round(weekday_yield - weekend_yield, 2),
'recommendation': 'Review weekend staffing or processes'
})
return patterns
def _generate_yield_insights(
self,
tenant_id: str,
recipe_id: str,
baseline_stats: Dict[str, Any],
factor_analysis: Dict[str, Any],
patterns: List[Dict[str, Any]],
prediction: Dict[str, Any],
production_context: Dict[str, Any]
) -> List[Dict[str, Any]]:
"""Generate actionable insights for yield improvement."""
insights = []
# Insight 1: Low predicted yield warning
if prediction['predicted_yield'] < 90:
waste_value = prediction['expected_waste_units'] * production_context.get('unit_cost', 5)
staff_assigned = production_context.get('staff_assigned') or []
worker_id = staff_assigned[0] if staff_assigned else 'unknown'  # used in the action params below
insights.append({
'type': 'warning',
'priority': 'high' if prediction['predicted_yield'] < 85 else 'medium',
'category': 'production',
'title': f'Low Yield Predicted: {prediction["predicted_yield"]:.1f}%',
'description': f'Upcoming production run predicted to yield {prediction["predicted_yield"]:.1f}%, below baseline {baseline_stats["mean_yield"]:.1f}%. Expected waste: {prediction["expected_waste_units"]:.1f} units (€{waste_value:.2f}).',
'impact_type': 'waste',
'impact_value': prediction['expected_waste_units'],
'impact_unit': 'units',
'confidence': 75,
'metrics_json': {
'recipe_id': recipe_id,
'predicted_yield': prediction['predicted_yield'],
'expected_waste': prediction['expected_waste_units'],
'waste_value': round(waste_value, 2)
},
'actionable': True,
'recommendation_actions': [{
'label': 'Review Production Setup',
'action': 'review_production_factors',
'params': {
'recipe_id': recipe_id,
'worker_id': worker_id
}
}]
})
# Insight 2: High-severity patterns
for pattern in patterns:
if pattern.get('severity') == 'high':
if pattern['pattern'] == 'low_yield_worker':
insights.append({
'type': 'opportunity',
'priority': 'high',
'category': 'production',
'title': f'Worker Training Opportunity: {pattern["yield_impact"]:.1f}% Yield Gap',
'description': pattern['description'] + '. Improving this worker to average performance would save significant waste.',
'impact_type': 'yield_improvement',
'impact_value': pattern['yield_impact'],
'impact_unit': 'percentage_points',
'confidence': 85,
'metrics_json': {
'recipe_id': recipe_id,
'pattern': pattern['pattern'],
'yield_impact': pattern['yield_impact']
},
'actionable': True,
'recommendation_actions': [{
'label': 'Schedule Training',
'action': 'schedule_worker_training',
'params': {'recipe_id': recipe_id}
}]
})
# Insight 3: Excellent yield
if prediction['predicted_yield'] > 98:
insights.append({
'type': 'positive',
'priority': 'low',
'category': 'production',
'title': f'Excellent Yield Expected: {prediction["predicted_yield"]:.1f}%',
'description': f'Optimal production conditions detected. Expected yield {prediction["predicted_yield"]:.1f}% exceeds baseline {baseline_stats["mean_yield"]:.1f}%.',
'impact_type': 'yield_improvement',
'impact_value': prediction['baseline_comparison'],
'impact_unit': 'percentage_points',
'confidence': 70,
'metrics_json': {
'recipe_id': recipe_id,
'predicted_yield': prediction['predicted_yield']
},
'actionable': False
})
# Insight 4: Yield variability issue
if baseline_stats['cv_yield'] > 0.05: # Coefficient of variation > 5%
insights.append({
'type': 'opportunity',
'priority': 'medium',
'category': 'production',
'title': f'High Yield Variability: {baseline_stats["cv_yield"]*100:.1f}% CV',
'description': f'Yield varies significantly across production runs (CV={baseline_stats["cv_yield"]*100:.1f}%, range {baseline_stats["min_yield"]:.1f}%-{baseline_stats["max_yield"]:.1f}%). Standardizing processes could reduce waste.',
'impact_type': 'process_improvement',
'confidence': 80,
'metrics_json': {
'recipe_id': recipe_id,
'cv_yield': round(baseline_stats['cv_yield'], 3),
'yield_range': round(baseline_stats['max_yield'] - baseline_stats['min_yield'], 2)
},
'actionable': True,
'recommendation_actions': [{
'label': 'Standardize Process',
'action': 'review_production_sop',
'params': {'recipe_id': recipe_id}
}]
})
return insights
def _calculate_prediction_confidence(
self,
production_history: pd.DataFrame,
model_results: Dict[str, Any],
factor_analysis: Dict[str, Any]
) -> int:
"""Calculate overall confidence score for predictions."""
confidence_factors = []
# Factor 1: Sample size (0-30 points)
n_runs = len(production_history)
if n_runs >= 100:
sample_score = 30
elif n_runs >= 50:
sample_score = 25
elif n_runs >= 30:
sample_score = 20
else:
sample_score = 10
confidence_factors.append(('sample_size', sample_score))
# Factor 2: Model performance (0-30 points)
r2 = model_results['performance']['r2']
mae = model_results['performance']['mae']
if r2 > 0.7 and mae < 3:
model_score = 30
elif r2 > 0.5 and mae < 5:
model_score = 25
elif r2 > 0.3 and mae < 7:
model_score = 20
else:
model_score = 10
confidence_factors.append(('model_performance', model_score))
# Factor 3: Statistical significance of factors (0-25 points)
significant_factors = sum(
1 for factor in factor_analysis.values()
if isinstance(factor, dict) and factor.get('significant')
)
if significant_factors >= 3:
stats_score = 25
elif significant_factors >= 2:
stats_score = 20
elif significant_factors >= 1:
stats_score = 15
else:
stats_score = 10
confidence_factors.append(('significant_factors', stats_score))
# Factor 4: Data recency (0-15 points)
most_recent = production_history['started_at'].max()
days_old = (datetime.utcnow() - pd.to_datetime(most_recent)).days
if days_old <= 7:
recency_score = 15
elif days_old <= 30:
recency_score = 12
elif days_old <= 90:
recency_score = 8
else:
recency_score = 5
confidence_factors.append(('data_recency', recency_score))
total_confidence = sum(score for _, score in confidence_factors)
return min(100, max(0, total_confidence))
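# Worked example of the rubric above: 60 runs (25) + r2=0.6, MAE=4 (25)
# + two significant factors (20) + data 10 days old (12) = 82/100.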
async def analyze_recipe_yield_history(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Analyze historical yield performance for a recipe (no prediction).
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs
min_history_runs: Minimum production runs required
Returns:
Historical analysis with insights
"""
logger.info(
"Analyzing recipe yield history",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
if len(production_history) < min_history_runs:
return self._insufficient_data_response(
recipe_id, {}, len(production_history), min_history_runs
)
# Calculate statistics
baseline_stats = self._calculate_baseline_statistics(production_history)
# Feature engineering
feature_df = self._engineer_features(production_history)
# Analyze factors
factor_analysis = self._analyze_yield_factors(feature_df)
# Identify patterns
patterns = self._identify_yield_patterns(feature_df, factor_analysis)
# Generate insights (without prediction)
insights = []
# Add insights for patterns
for pattern in patterns:
if pattern.get('severity') in ['high', 'medium']:
insights.append({
'type': 'opportunity',
'priority': pattern['severity'],
'category': 'production',
'title': f'Yield Pattern Detected: {pattern["pattern"]}',
'description': pattern['description'],
'impact_type': 'yield_improvement',
'confidence': 80,
'metrics_json': {
'recipe_id': recipe_id,
'pattern': pattern
},
'actionable': True,
'recommendation': pattern['recommendation']
})
return {
'recipe_id': recipe_id,
'analyzed_at': datetime.utcnow().isoformat(),
'history_runs': len(production_history),
'baseline_stats': baseline_stats,
'factor_analysis': factor_analysis,
'patterns': patterns,
'insights': insights
}
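As a rough sketch of the input shape the predictor expects, the following builds a synthetic history with the documented columns and runs the history analysis locally (no services involved); all values are invented for illustration and assume YieldPredictor is importable:

import asyncio
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n = 40  # above the default min_history_runs of 30
history = pd.DataFrame({
    "production_run_id": [f"run-{i}" for i in range(n)],
    "recipe_id": ["sourdough-batard"] * n,
    "planned_quantity": [500] * n,
    "actual_quantity": (500 * rng.normal(0.93, 0.03, n)).round(),
    "yield_percentage": rng.normal(93, 3, n).clip(75, 100),
    "staff_assigned": [[str(rng.choice(["staff-1", "staff-2"]))] for _ in range(n)],
    "started_at": pd.date_range("2025-10-01 06:00", periods=n, freq="D"),
    "completed_at": pd.date_range("2025-10-01 10:00", periods=n, freq="D"),
    "batch_size": rng.integers(80, 160, n),
})

predictor = YieldPredictor()
report = asyncio.run(predictor.analyze_recipe_yield_history(
    tenant_id="tenant-demo",
    recipe_id="sourdough-batard",
    production_history=history,
))
print(report["baseline_stats"]["mean_yield"], len(report["insights"]))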