Improve AI logic

Urtzi Alfaro
2025-11-05 13:34:56 +01:00
parent 5c87fbcf48
commit 394ad3aea4
218 changed files with 30627 additions and 7658 deletions

View File

@@ -0,0 +1,288 @@
"""
ML Insights API Endpoints for Production Service
Provides endpoints to trigger ML insight generation for:
- Production yield predictions
- Quality optimization
- Process efficiency analysis
"""
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from typing import Optional, List
from uuid import UUID
from datetime import datetime, timedelta
import structlog
import pandas as pd
from app.core.database import get_db
from sqlalchemy.ext.asyncio import AsyncSession
logger = structlog.get_logger()
router = APIRouter(
prefix="/api/v1/tenants/{tenant_id}/production/ml/insights",
tags=["ML Insights"]
)
# ================================================================
# REQUEST/RESPONSE SCHEMAS
# ================================================================
class YieldPredictionRequest(BaseModel):
"""Request schema for yield prediction"""
recipe_ids: Optional[List[str]] = Field(
None,
description="Specific recipe IDs to analyze. If None, analyzes up to 10 recipes for the tenant"
)
lookback_days: int = Field(
90,
description="Days of historical production to analyze",
ge=30,
le=365
)
min_history_runs: int = Field(
30,
description="Minimum production runs required",
ge=10,
le=100
)
class YieldPredictionResponse(BaseModel):
"""Response schema for yield prediction"""
success: bool
message: str
tenant_id: str
recipes_analyzed: int
total_insights_generated: int
total_insights_posted: int
recipes_with_issues: int
insights_by_recipe: dict
errors: List[str] = []
# ================================================================
# API ENDPOINTS
# ================================================================
@router.post("/predict-yields", response_model=YieldPredictionResponse)
async def trigger_yield_prediction(
tenant_id: str,
request_data: YieldPredictionRequest,
db: AsyncSession = Depends(get_db)
):
"""
Trigger yield prediction for production recipes.
This endpoint:
1. Fetches historical production data for specified recipes
2. Runs the YieldInsightsOrchestrator to predict yields
3. Generates insights about yield optimization opportunities
4. Posts insights to AI Insights Service
Args:
tenant_id: Tenant UUID
request_data: Prediction parameters
db: Database session
Returns:
YieldPredictionResponse with prediction results
"""
logger.info(
"ML insights yield prediction requested",
tenant_id=tenant_id,
recipe_ids=request_data.recipe_ids,
lookback_days=request_data.lookback_days
)
try:
# Import ML orchestrator and clients
from app.ml.yield_insights_orchestrator import YieldInsightsOrchestrator
from shared.clients.recipes_client import RecipesServiceClient
from app.core.config import settings
# Initialize orchestrator and recipes client
orchestrator = YieldInsightsOrchestrator()
recipes_client = RecipesServiceClient(settings)
# Get recipes to analyze from recipes service via API
if request_data.recipe_ids:
# Fetch specific recipes
recipes = []
for recipe_id in request_data.recipe_ids:
recipe = await recipes_client.get_recipe_by_id(
recipe_id=recipe_id,
tenant_id=tenant_id
)
if recipe:
recipes.append(recipe)
else:
# Fetch all recipes for tenant (limit to 10)
all_recipes = await recipes_client.get_all_recipes(tenant_id=tenant_id)
recipes = all_recipes[:10] if all_recipes else [] # Limit to prevent timeout
if not recipes:
return YieldPredictionResponse(
success=False,
message="No recipes found for analysis",
tenant_id=tenant_id,
recipes_analyzed=0,
total_insights_generated=0,
total_insights_posted=0,
recipes_with_issues=0,
insights_by_recipe={},
errors=["No recipes found"]
)
# Calculate date range for production history
end_date = datetime.utcnow()
start_date = end_date - timedelta(days=request_data.lookback_days)
# Process each recipe
total_insights_generated = 0
total_insights_posted = 0
recipes_with_issues = 0
insights_by_recipe = {}
errors = []
for recipe in recipes:
try:
recipe_id = str(recipe['id'])
recipe_name = recipe.get('name', 'Unknown Recipe')
logger.info(f"Analyzing yield for {recipe_name} ({recipe_id})")
# Fetch real production batch history from database
from app.models.production import ProductionBatch, ProductionStatus
from sqlalchemy import select
batch_query = select(ProductionBatch).where(
ProductionBatch.tenant_id == UUID(tenant_id),
ProductionBatch.recipe_id == UUID(recipe_id), # Use the extracted UUID
ProductionBatch.actual_start_time >= start_date,
ProductionBatch.actual_start_time <= end_date,
ProductionBatch.status == ProductionStatus.COMPLETED,
ProductionBatch.actual_quantity.isnot(None)
).order_by(ProductionBatch.actual_start_time)
batch_result = await db.execute(batch_query)
batches = batch_result.scalars().all()
if len(batches) < request_data.min_history_runs:
logger.warning(
f"Insufficient production history for recipe {recipe_id}: "
f"{len(batches)} batches < {request_data.min_history_runs} required"
)
continue
# Create production history DataFrame from real batches
production_data = []
for batch in batches:
# Calculate yield percentage
if batch.planned_quantity and batch.actual_quantity:
yield_pct = (batch.actual_quantity / batch.planned_quantity) * 100
else:
continue # Skip batches without complete data
production_data.append({
'started_at': batch.actual_start_time, # Column name expected by YieldPredictor feature engineering
'planned_quantity': float(batch.planned_quantity),
'actual_quantity': float(batch.actual_quantity),
'yield_percentage': yield_pct,
'batch_size': float(batch.planned_quantity), # Planned quantity serves as the batch size proxy
'worker_id': batch.notes or 'unknown', # Use notes field or default
'batch_number': batch.batch_number
})
if not production_data:
logger.warning(
f"No valid production data for recipe {recipe_id}"
)
continue
production_history = pd.DataFrame(production_data)
# Run yield analysis
results = await orchestrator.analyze_and_post_insights(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
min_history_runs=request_data.min_history_runs
)
# Track results
total_insights_generated += results['insights_generated']
total_insights_posted += results['insights_posted']
baseline_stats = results.get('baseline_stats') or {} # Guard against None from the insufficient-data path
mean_yield = baseline_stats.get('mean_yield', 100)
if mean_yield < 90:
recipes_with_issues += 1
insights_by_recipe[recipe_id] = {
'recipe_name': recipe_name,
'insights_posted': results['insights_posted'],
'mean_yield': mean_yield,
'patterns': len(results.get('patterns', []))
}
logger.info(
f"Recipe {recipe_id} analysis complete",
insights_posted=results['insights_posted'],
mean_yield=mean_yield
)
except Exception as e:
error_msg = f"Error analyzing recipe {recipe_id}: {str(e)}"
logger.error(error_msg, exc_info=True)
errors.append(error_msg)
# Close orchestrator and clients
await orchestrator.close()
await recipes_client.close()
# Build response
response = YieldPredictionResponse(
success=total_insights_posted > 0,
message=f"Successfully analyzed {len([r for r in recipes if isinstance(r, dict)])} recipes, generated {total_insights_posted} insights",
tenant_id=tenant_id,
recipes_analyzed=len([r for r in recipes if isinstance(r, dict)]),
total_insights_generated=total_insights_generated,
total_insights_posted=total_insights_posted,
recipes_with_issues=recipes_with_issues,
insights_by_recipe=insights_by_recipe,
errors=errors
)
logger.info(
"ML insights yield prediction complete",
tenant_id=tenant_id,
total_insights=total_insights_posted,
recipes_with_issues=recipes_with_issues
)
return response
except Exception as e:
logger.error(
"ML insights yield prediction failed",
tenant_id=tenant_id,
error=str(e),
exc_info=True
)
raise HTTPException(
status_code=500,
detail=f"Yield prediction failed: {str(e)}"
)
@router.get("/health")
async def ml_insights_health():
"""Health check for ML insights endpoints"""
return {
"status": "healthy",
"service": "production-ml-insights",
"endpoints": [
"POST /ml/insights/predict-yields"
]
}
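For reference, a minimal client-side sketch of triggering the new endpoint. The base URL, tenant UUID, and timeout are placeholders, and any auth headers the gateway may require are omitted; this is an illustration, not part of the commit.

import asyncio
import httpx

async def trigger_yield_prediction_example():
    # Hypothetical values; substitute the real service URL and tenant UUID
    base_url = "http://localhost:8000"
    tenant_id = "00000000-0000-0000-0000-000000000000"
    payload = {
        "recipe_ids": None,      # None -> analyze up to 10 recipes for the tenant
        "lookback_days": 90,     # 30-365 days of production history
        "min_history_runs": 30   # 10-100 completed batches required per recipe
    }
    async with httpx.AsyncClient(base_url=base_url, timeout=120.0) as client:
        resp = await client.post(
            f"/api/v1/tenants/{tenant_id}/production/ml/insights/predict-yields",
            json=payload,
        )
        resp.raise_for_status()
        result = resp.json()
        print(result["recipes_analyzed"], result["total_insights_posted"])

# asyncio.run(trigger_yield_prediction_example())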

View File

@@ -101,7 +101,7 @@ class GenerateScheduleResponse(BaseModel):
# ================================================================
@router.post(
route_builder.build_nested_resource_route("", None, "generate-schedule"),
route_builder.build_operations_route("generate-schedule"),
response_model=GenerateScheduleResponse
)
async def generate_production_schedule(

View File

@@ -305,6 +305,31 @@ async def reserve_capacity(
raise HTTPException(status_code=500, detail="Failed to reserve capacity")
@router.get(
"/api/v1/tenants/{tenant_id}/production/capacity/date/{date}",
response_model=list
)
async def get_capacity_by_date(
tenant_id: UUID = Path(...),
date: date = Path(..., description="Date to retrieve capacity for (format: YYYY-MM-DD)"),
current_user: dict = Depends(get_current_user_dep),
production_service: ProductionService = Depends(get_production_service)
):
"""Get capacity by date (using direct route to support date path parameter)"""
try:
capacity_data = await production_service.get_capacity_by_date(tenant_id, date)
logger.info("Retrieved capacity by date",
tenant_id=str(tenant_id), date=date.isoformat())
return capacity_data
except Exception as e:
logger.error("Error getting capacity by date",
error=str(e), tenant_id=str(tenant_id), date=date.isoformat())
raise HTTPException(status_code=500, detail="Failed to get capacity by date")
@router.get(
route_builder.build_operations_route("capacity/bottlenecks"),
response_model=dict

View File

@@ -26,7 +26,8 @@ from app.api import (
internal_demo,
orchestrator, # NEW: Orchestrator integration endpoint
production_orders_operations, # Tenant deletion endpoints
audit
audit,
ml_insights # ML insights endpoint
)
@@ -164,6 +165,7 @@ service.add_router(production_operations.router)
service.add_router(production_dashboard.router)
service.add_router(analytics.router)
service.add_router(internal_demo.router)
service.add_router(ml_insights.router) # ML insights endpoint
# REMOVED: test_production_scheduler endpoint
# Production scheduling is now triggered by the Orchestrator Service

View File

@@ -0,0 +1,415 @@
"""
Yield Insights Orchestrator
Coordinates yield prediction and insight posting
"""
import pandas as pd
from typing import Dict, List, Any, Optional
import structlog
from datetime import datetime
from uuid import UUID
import sys
import os
# Add shared clients to path
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../..'))
from shared.clients.ai_insights_client import AIInsightsClient
from app.ml.yield_predictor import YieldPredictor
logger = structlog.get_logger()
class YieldInsightsOrchestrator:
"""
Orchestrates yield prediction and insight generation workflow.
Workflow:
1. Predict yield for upcoming production run or analyze historical performance
2. Generate insights for yield optimization opportunities
3. Post insights to AI Insights Service
4. Provide yield predictions for production planning
"""
def __init__(
self,
ai_insights_base_url: str = "http://ai-insights-service:8000"
):
self.predictor = YieldPredictor()
self.ai_insights_client = AIInsightsClient(ai_insights_base_url)
async def predict_and_post_insights(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
production_context: Dict[str, Any],
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Complete workflow: Predict yield and post insights.
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs
production_context: Upcoming production context:
- worker_id
- planned_start_time
- batch_size
- planned_quantity
- unit_cost (optional)
- equipment_id (optional)
min_history_runs: Minimum production runs required
Returns:
Workflow results with prediction and posted insights
"""
logger.info(
"Starting yield prediction workflow",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
# Step 1: Predict yield
prediction_results = await self.predictor.predict_yield(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
production_context=production_context,
min_history_runs=min_history_runs
)
logger.info(
"Yield prediction complete",
recipe_id=recipe_id,
predicted_yield=prediction_results.get('predicted_yield'),
insights_generated=len(prediction_results.get('insights', []))
)
# Step 2: Enrich insights with tenant_id and recipe context
enriched_insights = self._enrich_insights(
prediction_results.get('insights', []),
tenant_id,
recipe_id
)
# Step 3: Post insights to AI Insights Service
if enriched_insights:
post_results = await self.ai_insights_client.create_insights_bulk(
tenant_id=UUID(tenant_id),
insights=enriched_insights
)
logger.info(
"Yield insights posted to AI Insights Service",
recipe_id=recipe_id,
total=post_results['total'],
successful=post_results['successful'],
failed=post_results['failed']
)
else:
post_results = {'total': 0, 'successful': 0, 'failed': 0}
logger.info("No insights to post for recipe", recipe_id=recipe_id)
# Step 4: Return comprehensive results
return {
'tenant_id': tenant_id,
'recipe_id': recipe_id,
'predicted_at': prediction_results['predicted_at'],
'history_runs': prediction_results['history_runs'],
'baseline_yield': prediction_results.get('baseline_yield'),
'predicted_yield': prediction_results.get('predicted_yield'),
'prediction_range': prediction_results.get('prediction_range'),
'expected_waste': prediction_results.get('expected_waste'),
'confidence': prediction_results['confidence'],
'factor_analysis': prediction_results.get('factor_analysis'),
'patterns': prediction_results.get('patterns', []),
'insights_generated': len(enriched_insights),
'insights_posted': post_results['successful'],
'insights_failed': post_results['failed'],
'created_insights': post_results.get('created_insights', [])
}
async def analyze_and_post_insights(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Analyze historical yield performance and post insights (no prediction).
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs
min_history_runs: Minimum production runs required
Returns:
Workflow results with analysis and posted insights
"""
logger.info(
"Starting yield analysis workflow",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
# Step 1: Analyze historical yield
analysis_results = await self.predictor.analyze_recipe_yield_history(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
min_history_runs=min_history_runs
)
logger.info(
"Yield analysis complete",
recipe_id=recipe_id,
baseline_yield=analysis_results.get('baseline_stats', {}).get('mean_yield'),
insights_generated=len(analysis_results.get('insights', []))
)
# Step 2: Enrich insights
enriched_insights = self._enrich_insights(
analysis_results.get('insights', []),
tenant_id,
recipe_id
)
# Step 3: Post insights
if enriched_insights:
post_results = await self.ai_insights_client.create_insights_bulk(
tenant_id=UUID(tenant_id),
insights=enriched_insights
)
logger.info(
"Yield analysis insights posted",
recipe_id=recipe_id,
total=post_results['total'],
successful=post_results['successful']
)
else:
post_results = {'total': 0, 'successful': 0, 'failed': 0}
return {
'tenant_id': tenant_id,
'recipe_id': recipe_id,
'analyzed_at': analysis_results['analyzed_at'],
'history_runs': analysis_results['history_runs'],
'baseline_stats': analysis_results.get('baseline_stats'),
'factor_analysis': analysis_results.get('factor_analysis'),
'patterns': analysis_results.get('patterns', []),
'insights_generated': len(enriched_insights),
'insights_posted': post_results['successful'],
'created_insights': post_results.get('created_insights', [])
}
def _enrich_insights(
self,
insights: List[Dict[str, Any]],
tenant_id: str,
recipe_id: str
) -> List[Dict[str, Any]]:
"""
Enrich insights with required fields for AI Insights Service.
Args:
insights: Raw insights from predictor
tenant_id: Tenant identifier
recipe_id: Recipe identifier
Returns:
Enriched insights ready for posting
"""
enriched = []
for insight in insights:
# Add required tenant_id
enriched_insight = insight.copy()
enriched_insight['tenant_id'] = tenant_id
# Add recipe context to metrics
if 'metrics_json' not in enriched_insight:
enriched_insight['metrics_json'] = {}
enriched_insight['metrics_json']['recipe_id'] = recipe_id
# Add source metadata
enriched_insight['source_service'] = 'production'
enriched_insight['source_model'] = 'yield_predictor'
enriched_insight['detected_at'] = datetime.utcnow().isoformat()
enriched.append(enriched_insight)
return enriched
async def analyze_all_recipes(
self,
tenant_id: str,
recipes_data: Dict[str, pd.DataFrame],
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Analyze yield performance for all recipes for a tenant.
Args:
tenant_id: Tenant identifier
recipes_data: Dict of {recipe_id: production_history_df}
min_history_runs: Minimum production runs required
Returns:
Comprehensive analysis results
"""
logger.info(
"Analyzing yield for all recipes",
tenant_id=tenant_id,
recipes=len(recipes_data)
)
all_results = []
total_insights_posted = 0
recipes_with_issues = []
# Analyze each recipe
for recipe_id, production_history in recipes_data.items():
try:
results = await self.analyze_and_post_insights(
tenant_id=tenant_id,
recipe_id=recipe_id,
production_history=production_history,
min_history_runs=min_history_runs
)
all_results.append(results)
total_insights_posted += results['insights_posted']
# Check for low baseline yield
baseline_stats = results.get('baseline_stats')
if baseline_stats and baseline_stats.get('mean_yield', 100) < 90:
recipes_with_issues.append({
'recipe_id': recipe_id,
'mean_yield': baseline_stats['mean_yield'],
'std_yield': baseline_stats['std_yield']
})
except Exception as e:
logger.error(
"Error analyzing recipe",
recipe_id=recipe_id,
error=str(e)
)
# Generate portfolio summary insight if there are yield issues
if len(recipes_with_issues) > 0:
summary_insight = self._generate_portfolio_summary_insight(
tenant_id, recipes_with_issues, all_results
)
if summary_insight:
enriched_summary = self._enrich_insights(
[summary_insight], tenant_id, 'all_recipes'
)
post_results = await self.ai_insights_client.create_insights_bulk(
tenant_id=UUID(tenant_id),
insights=enriched_summary
)
total_insights_posted += post_results['successful']
logger.info(
"All recipes yield analysis complete",
tenant_id=tenant_id,
recipes_analyzed=len(all_results),
total_insights_posted=total_insights_posted,
recipes_with_issues=len(recipes_with_issues)
)
return {
'tenant_id': tenant_id,
'analyzed_at': datetime.utcnow().isoformat(),
'recipes_analyzed': len(all_results),
'recipe_results': all_results,
'total_insights_posted': total_insights_posted,
'recipes_with_issues': recipes_with_issues
}
def _generate_portfolio_summary_insight(
self,
tenant_id: str,
recipes_with_issues: List[Dict[str, Any]],
all_results: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
"""
Generate portfolio-level summary insight.
Args:
tenant_id: Tenant identifier
recipes_with_issues: Recipes with low yield
all_results: All recipe analysis results
Returns:
Summary insight or None
"""
if len(recipes_with_issues) == 0:
return None
# Calculate average yield and potential improvement
total_recipes = len(all_results)
issues_count = len(recipes_with_issues)
avg_low_yield = sum(r['mean_yield'] for r in recipes_with_issues) / issues_count
# Estimate waste reduction potential
# Assuming each recipe produces 1000 units/month, €5/unit cost
monthly_production = 1000 * issues_count
current_waste_pct = 100 - avg_low_yield
target_waste_pct = 5 # Target 95% yield
if current_waste_pct > target_waste_pct:
waste_reduction_units = monthly_production * ((current_waste_pct - target_waste_pct) / 100)
annual_savings = waste_reduction_units * 12 * 5 # €5 per unit
return {
'type': 'opportunity',
'priority': 'high' if issues_count > 3 else 'medium',
'category': 'production',
'title': f'Production Yield Optimization: {issues_count} Recipes Below 90%',
'description': f'{issues_count} of {total_recipes} recipes have average yield below 90% (average {avg_low_yield:.1f}%). Improving to 95% target would reduce waste by {waste_reduction_units:.0f} units/month, saving €{annual_savings:.0f}/year.',
'impact_type': 'cost_savings',
'impact_value': annual_savings,
'impact_unit': 'euros_per_year',
'confidence': 75,
'metrics_json': {
'recipes_analyzed': total_recipes,
'recipes_with_issues': issues_count,
'avg_low_yield': round(avg_low_yield, 2),
'potential_annual_savings': round(annual_savings, 2),
'waste_reduction_units_monthly': round(waste_reduction_units, 2)
},
'actionable': True,
'recommendation_actions': [
{
'label': 'Review Low-Yield Recipes',
'action': 'review_yield_insights',
'params': {'tenant_id': tenant_id}
},
{
'label': 'Implement Yield Improvements',
'action': 'apply_yield_recommendations',
'params': {'tenant_id': tenant_id}
}
],
'source_service': 'production',
'source_model': 'yield_predictor'
}
return None
async def close(self):
"""Close HTTP client connections."""
await self.ai_insights_client.close()
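A minimal sketch of driving the orchestrator directly, outside the API endpoint, assuming the AI Insights Service is reachable at its default URL; the tenant/recipe UUIDs and the synthetic history are placeholders, and the DataFrame columns mirror what YieldPredictor's feature engineering expects (started_at, planned_quantity, actual_quantity, yield_percentage, batch_size, worker_id).

import asyncio
from datetime import datetime, timedelta

import pandas as pd

from app.ml.yield_insights_orchestrator import YieldInsightsOrchestrator

async def run_yield_analysis_example():
    # Synthetic 30-run history with the columns the predictor expects
    start = datetime.utcnow() - timedelta(days=90)
    rows = [{
        'started_at': start + timedelta(days=i * 3),
        'planned_quantity': 100.0,
        'actual_quantity': 90.0 + (i % 8),
        'yield_percentage': 90.0 + (i % 8),
        'batch_size': 80.0 + (i % 5) * 10,
        'worker_id': f'worker_{i % 2}',
    } for i in range(30)]
    history = pd.DataFrame(rows)

    orchestrator = YieldInsightsOrchestrator()  # defaults to http://ai-insights-service:8000
    try:
        results = await orchestrator.analyze_and_post_insights(
            tenant_id='00000000-0000-0000-0000-000000000000',   # placeholder tenant UUID
            recipe_id='00000000-0000-0000-0000-000000000001',   # placeholder recipe UUID
            production_history=history,
            min_history_runs=30,
        )
        print(results['baseline_stats']['mean_yield'], results['insights_posted'])
    finally:
        await orchestrator.close()

# asyncio.run(run_yield_analysis_example())

Note that the portfolio summary insight hard-codes its economics (1000 units/month per low-yield recipe, €5/unit, 95% target yield). Under those assumptions, 4 recipes averaging 88% yield imply 4000 units/month, a 12% - 5% = 7% recoverable waste rate, i.e. 280 units/month and roughly €16,800/year in estimated savings.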

View File

@@ -0,0 +1,799 @@
"""
Production Yield Predictor
Predicts actual vs planned yield and identifies waste reduction opportunities
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime, timedelta
import structlog
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
logger = structlog.get_logger()
class YieldPredictor:
"""
Predicts production yield based on historical data and production factors.
Key Features:
- Multi-factor yield prediction (recipe, worker, time-of-day, equipment, batch size)
- Identifies low-yield patterns and root causes
- Waste categorization (spoilage, measurement error, process inefficiency)
- Actionable recommendations for yield improvement
- Statistical validation of learned patterns
Methodology:
1. Feature Engineering: Extract worker skill, time factors, batch size effects
2. Statistical Analysis: Identify significant yield loss factors
3. ML Prediction: Ensemble of Random Forest + Gradient Boosting
4. Pattern Detection: Find recurring low-yield situations
5. Insight Generation: Actionable recommendations with confidence scores
"""
def __init__(self):
self.model_cache = {} # Cache trained models per recipe
self.baseline_yields = {} # Cache baseline yields per recipe
async def predict_yield(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
production_context: Dict[str, Any],
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Predict yield for upcoming production run and generate insights.
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs with columns:
- production_run_id
- recipe_id
- planned_quantity
- actual_quantity
- yield_percentage
- worker_id
- started_at
- completed_at
- batch_size
- equipment_id (optional)
- notes (optional)
production_context: Upcoming production context:
- worker_id
- planned_start_time
- batch_size
- equipment_id (optional)
min_history_runs: Minimum production runs required for learning
Returns:
Prediction results with yield forecast, confidence, and insights
"""
logger.info(
"Predicting production yield",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
# Validate production history
if len(production_history) < min_history_runs:
return self._insufficient_data_response(
recipe_id, production_context, len(production_history), min_history_runs
)
# Step 1: Calculate baseline statistics
baseline_stats = self._calculate_baseline_statistics(production_history)
# Step 2: Feature engineering
feature_df = self._engineer_features(production_history)
# Step 3: Analyze yield factors
factor_analysis = self._analyze_yield_factors(feature_df)
# Step 4: Train predictive model
model_results = self._train_yield_model(feature_df)
# Step 5: Make prediction for upcoming run
prediction = self._predict_upcoming_run(
production_context, model_results, baseline_stats, feature_df
)
# Step 6: Identify low-yield patterns
patterns = self._identify_yield_patterns(feature_df, factor_analysis)
# Step 7: Generate insights
insights = self._generate_yield_insights(
tenant_id, recipe_id, baseline_stats, factor_analysis,
patterns, prediction, production_context
)
# Step 8: Calculate confidence
confidence = self._calculate_prediction_confidence(
production_history, model_results, factor_analysis
)
return {
'recipe_id': recipe_id,
'predicted_at': datetime.utcnow().isoformat(),
'history_runs': len(production_history),
'baseline_yield': baseline_stats['mean_yield'],
'baseline_std': baseline_stats['std_yield'],
'predicted_yield': prediction['predicted_yield'],
'prediction_range': prediction['prediction_range'],
'expected_waste': prediction['expected_waste_units'], # _predict_upcoming_run exposes 'expected_waste_units'
'confidence': confidence,
'factor_analysis': factor_analysis,
'patterns': patterns,
'model_performance': model_results['performance'],
'insights': insights
}
def _insufficient_data_response(
self, recipe_id: str, production_context: Dict[str, Any],
current_runs: int, required_runs: int
) -> Dict[str, Any]:
"""Return response when insufficient historical data."""
return {
'recipe_id': recipe_id,
'predicted_at': datetime.utcnow().isoformat(),
'analyzed_at': datetime.utcnow().isoformat(), # Also set so the analysis-only workflow can read it
'history_runs': current_runs,
'status': 'insufficient_data',
'required_runs': required_runs,
'baseline_yield': None,
'predicted_yield': None,
'confidence': 0,
'insights': [{
'type': 'warning',
'priority': 'low',
'category': 'production',
'title': 'Insufficient Production History for Yield Prediction',
'description': f'Only {current_runs} production runs available. Need at least {required_runs} runs to build reliable yield predictions. Continue tracking production data to enable yield optimization.',
'impact_type': 'data_quality',
'confidence': 100,
'actionable': True,
'recommendation_actions': [{
'label': 'Track Production Data',
'action': 'continue_production_tracking',
'params': {'recipe_id': recipe_id}
}]
}]
}
def _calculate_baseline_statistics(
self, production_history: pd.DataFrame
) -> Dict[str, Any]:
"""Calculate baseline yield statistics."""
yields = production_history['yield_percentage'].values
return {
'mean_yield': float(np.mean(yields)),
'median_yield': float(np.median(yields)),
'std_yield': float(np.std(yields)),
'min_yield': float(np.min(yields)),
'max_yield': float(np.max(yields)),
'cv_yield': float(np.std(yields) / np.mean(yields)), # Coefficient of variation
'percentile_25': float(np.percentile(yields, 25)),
'percentile_75': float(np.percentile(yields, 75)),
'runs_below_90': int(np.sum(yields < 90)),
'runs_above_95': int(np.sum(yields > 95))
}
def _engineer_features(self, production_history: pd.DataFrame) -> pd.DataFrame:
"""Engineer features from production history."""
df = production_history.copy()
# Time-based features
df['started_at'] = pd.to_datetime(df['started_at'])
df['hour_of_day'] = df['started_at'].dt.hour
df['day_of_week'] = df['started_at'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
df['is_early_morning'] = (df['hour_of_day'] < 6).astype(int)
df['is_late_night'] = (df['hour_of_day'] >= 22).astype(int)
# Duration features
if 'completed_at' in df.columns:
df['completed_at'] = pd.to_datetime(df['completed_at'])
df['duration_hours'] = (df['completed_at'] - df['started_at']).dt.total_seconds() / 3600
df['is_rushed'] = (df['duration_hours'] < df['duration_hours'].quantile(0.25)).astype(int)
# Batch size features
df['batch_size_normalized'] = df['batch_size'] / df['batch_size'].mean()
df['is_large_batch'] = (df['batch_size'] > df['batch_size'].quantile(0.75)).astype(int)
df['is_small_batch'] = (df['batch_size'] < df['batch_size'].quantile(0.25)).astype(int)
# Worker experience features (proxy: number of previous runs)
df = df.sort_values('started_at')
df['worker_run_count'] = df.groupby('worker_id').cumcount() + 1
df['worker_experience_level'] = pd.cut(
df['worker_run_count'],
bins=[0, 5, 15, 100],
labels=['novice', 'intermediate', 'expert']
)
# Recent yield trend for worker
df['worker_recent_avg_yield'] = df.groupby('worker_id')['yield_percentage'].transform(
lambda x: x.rolling(window=5, min_periods=1).mean()
)
return df
def _analyze_yield_factors(self, feature_df: pd.DataFrame) -> Dict[str, Any]:
"""Analyze factors affecting yield using statistical tests."""
factors = {}
# Worker impact
worker_yields = feature_df.groupby('worker_id')['yield_percentage'].agg(['mean', 'std', 'count'])
worker_yields = worker_yields[worker_yields['count'] >= 3] # Min 3 runs per worker
if len(worker_yields) > 1:
# ANOVA test: Does worker significantly affect yield?
worker_groups = [
feature_df[feature_df['worker_id'] == worker]['yield_percentage'].values
for worker in worker_yields.index
]
f_stat, p_value = stats.f_oneway(*worker_groups)
factors['worker'] = {
'significant': p_value < 0.05,
'p_value': float(p_value),
'f_statistic': float(f_stat),
'best_worker': worker_yields['mean'].idxmax(),
'best_worker_yield': float(worker_yields['mean'].max()),
'worst_worker': worker_yields['mean'].idxmin(),
'worst_worker_yield': float(worker_yields['mean'].min()),
'yield_range': float(worker_yields['mean'].max() - worker_yields['mean'].min())
}
else:
factors['worker'] = {'significant': False, 'reason': 'insufficient_workers'}
# Time of day impact
time_groups = {
'early_morning': feature_df[feature_df['hour_of_day'] < 6]['yield_percentage'].values,
'morning': feature_df[(feature_df['hour_of_day'] >= 6) & (feature_df['hour_of_day'] < 12)]['yield_percentage'].values,
'afternoon': feature_df[(feature_df['hour_of_day'] >= 12) & (feature_df['hour_of_day'] < 18)]['yield_percentage'].values,
'evening': feature_df[feature_df['hour_of_day'] >= 18]['yield_percentage'].values
}
time_groups = {k: v for k, v in time_groups.items() if len(v) >= 3}
if len(time_groups) > 1:
f_stat, p_value = stats.f_oneway(*time_groups.values())
time_means = {k: np.mean(v) for k, v in time_groups.items()}
factors['time_of_day'] = {
'significant': p_value < 0.05,
'p_value': float(p_value),
'best_time': max(time_means, key=time_means.get),
'best_time_yield': float(max(time_means.values())),
'worst_time': min(time_means, key=time_means.get),
'worst_time_yield': float(min(time_means.values())),
'yield_range': float(max(time_means.values()) - min(time_means.values()))
}
else:
factors['time_of_day'] = {'significant': False, 'reason': 'insufficient_data'}
# Batch size impact (correlation)
if len(feature_df) >= 10:
correlation, p_value = stats.pearsonr(
feature_df['batch_size'],
feature_df['yield_percentage']
)
factors['batch_size'] = {
'significant': abs(correlation) > 0.3 and p_value < 0.05,
'correlation': float(correlation),
'p_value': float(p_value),
'direction': 'positive' if correlation > 0 else 'negative',
'interpretation': self._interpret_batch_size_effect(correlation)
}
else:
factors['batch_size'] = {'significant': False, 'reason': 'insufficient_data'}
# Weekend vs weekday
weekend_yields = feature_df[feature_df['is_weekend'] == 1]['yield_percentage'].values
weekday_yields = feature_df[feature_df['is_weekend'] == 0]['yield_percentage'].values
if len(weekend_yields) >= 3 and len(weekday_yields) >= 3:
t_stat, p_value = stats.ttest_ind(weekend_yields, weekday_yields)
factors['weekend_effect'] = {
'significant': p_value < 0.05,
'p_value': float(p_value),
't_statistic': float(t_stat),
'weekend_yield': float(np.mean(weekend_yields)),
'weekday_yield': float(np.mean(weekday_yields)),
'difference': float(np.mean(weekend_yields) - np.mean(weekday_yields))
}
else:
factors['weekend_effect'] = {'significant': False, 'reason': 'insufficient_weekend_data'}
return factors
def _interpret_batch_size_effect(self, correlation: float) -> str:
"""Interpret batch size correlation."""
if abs(correlation) < 0.3:
return "Batch size has minimal impact on yield"
elif correlation > 0:
return "Larger batches tend to have higher yield (economies of scale)"
else:
return "Larger batches tend to have lower yield (difficulty handling large volumes)"
def _train_yield_model(self, feature_df: pd.DataFrame) -> Dict[str, Any]:
"""Train ML model to predict yield."""
# Prepare features
feature_columns = [
'hour_of_day', 'day_of_week', 'is_weekend',
'batch_size_normalized', 'is_large_batch', 'is_small_batch',
'worker_run_count'
]
if 'duration_hours' in feature_df.columns:
feature_columns.append('duration_hours')
# Encode worker_id
worker_encoding = {worker: idx for idx, worker in enumerate(feature_df['worker_id'].unique())}
feature_df['worker_encoded'] = feature_df['worker_id'].map(worker_encoding)
feature_columns.append('worker_encoded')
X = feature_df[feature_columns].fillna(0).values
y = feature_df['yield_percentage'].values
# Split into train/test (temporal split)
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train ensemble of models
models = {
'random_forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
'gradient_boosting': GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42),
'linear': LinearRegression()
}
performances = {}
predictions = {}
for name, model in models.items():
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
mae = np.mean(np.abs(y_test - y_pred))
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
r2 = 1 - (np.sum((y_test - y_pred) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))
performances[name] = {
'mae': float(mae),
'rmse': float(rmse),
'r2': float(r2)
}
predictions[name] = y_pred
# Select best model based on MAE
best_model_name = min(performances, key=lambda k: performances[k]['mae'])
best_model = models[best_model_name]
# Feature importance (if available)
feature_importance = {}
if hasattr(best_model, 'feature_importances_'):
importances = best_model.feature_importances_
feature_importance = {
feature_columns[i]: float(importances[i])
for i in range(len(feature_columns))
}
feature_importance = dict(sorted(
feature_importance.items(),
key=lambda x: x[1],
reverse=True
))
return {
'best_model': best_model,
'best_model_name': best_model_name,
'scaler': scaler,
'feature_columns': feature_columns,
'worker_encoding': worker_encoding,
'performance': performances[best_model_name],
'all_performances': performances,
'feature_importance': feature_importance
}
def _predict_upcoming_run(
self,
production_context: Dict[str, Any],
model_results: Dict[str, Any],
baseline_stats: Dict[str, Any],
feature_df: pd.DataFrame
) -> Dict[str, Any]:
"""Predict yield for upcoming production run."""
# Extract context
worker_id = production_context.get('worker_id')
planned_start = pd.to_datetime(production_context.get('planned_start_time'))
batch_size = production_context.get('batch_size')
# Get worker experience
worker_runs = feature_df[feature_df['worker_id'] == worker_id]
worker_run_count = len(worker_runs) if len(worker_runs) > 0 else 1
# Build feature vector
mean_batch_size = feature_df['batch_size'].mean()
batch_size_normalized = batch_size / mean_batch_size
is_large_batch = 1 if batch_size > feature_df['batch_size'].quantile(0.75) else 0
is_small_batch = 1 if batch_size < feature_df['batch_size'].quantile(0.25) else 0
features = {
'hour_of_day': planned_start.hour,
'day_of_week': planned_start.dayofweek,
'is_weekend': 1 if planned_start.dayofweek in [5, 6] else 0,
'batch_size_normalized': batch_size_normalized,
'is_large_batch': is_large_batch,
'is_small_batch': is_small_batch,
'worker_run_count': worker_run_count,
'duration_hours': 0, # Not known yet
'worker_encoded': model_results['worker_encoding'].get(worker_id, 0)
}
# Create feature vector in correct order
X = np.array([[features.get(col, 0) for col in model_results['feature_columns']]])
X_scaled = model_results['scaler'].transform(X)
# Predict
predicted_yield = float(model_results['best_model'].predict(X_scaled)[0])
# Prediction range (based on model RMSE)
rmse = model_results['performance']['rmse']
prediction_range = {
'lower': max(0, predicted_yield - 1.96 * rmse),
'upper': min(100, predicted_yield + 1.96 * rmse)
}
# Expected waste
planned_quantity = production_context.get('planned_quantity', 100)
expected_waste_pct = max(0, 100 - predicted_yield)
expected_waste_units = planned_quantity * (expected_waste_pct / 100)
return {
'predicted_yield': round(predicted_yield, 2),
'prediction_range': prediction_range,
'expected_waste_pct': round(expected_waste_pct, 2),
'expected_waste_units': round(expected_waste_units, 2),
'baseline_comparison': round(predicted_yield - baseline_stats['mean_yield'], 2),
'features_used': features
}
def _identify_yield_patterns(
self, feature_df: pd.DataFrame, factor_analysis: Dict[str, Any]
) -> List[Dict[str, Any]]:
"""Identify recurring low-yield patterns."""
patterns = []
# Pattern 1: Specific worker consistently low
if factor_analysis.get('worker', {}).get('significant'):
worst_worker = factor_analysis['worker']['worst_worker']
worst_yield = factor_analysis['worker']['worst_worker_yield']
best_yield = factor_analysis['worker']['best_worker_yield']
if worst_yield < 90 and (best_yield - worst_yield) > 5:
patterns.append({
'pattern': 'low_yield_worker',
'description': f'Worker {worst_worker} consistently produces {worst_yield:.1f}% yield vs best worker {best_yield:.1f}%',
'severity': 'high' if worst_yield < 85 else 'medium',
'affected_runs': int(len(feature_df[feature_df['worker_id'] == worst_worker])),
'yield_impact': round(best_yield - worst_yield, 2),
'recommendation': 'Provide additional training or reassign to different recipes'
})
# Pattern 2: Time-of-day effect
if factor_analysis.get('time_of_day', {}).get('significant'):
worst_time = factor_analysis['time_of_day']['worst_time']
worst_yield = factor_analysis['time_of_day']['worst_time_yield']
if worst_yield < 90:
patterns.append({
'pattern': 'low_yield_time',
'description': f'{worst_time} shifts produce {worst_yield:.1f}% yield',
'severity': 'medium',
'affected_runs': 'varies',
'yield_impact': round(factor_analysis['time_of_day']['yield_range'], 2),
'recommendation': f'Avoid scheduling this recipe during {worst_time}'
})
# Pattern 3: Large batch issues
if factor_analysis.get('batch_size', {}).get('significant'):
if factor_analysis['batch_size']['direction'] == 'negative':
patterns.append({
'pattern': 'large_batch_yield_loss',
'description': 'Larger batches have lower yield - equipment or process capacity issues',
'severity': 'medium',
'correlation': round(factor_analysis['batch_size']['correlation'], 3),
'recommendation': 'Split large batches or upgrade equipment'
})
# Pattern 4: Weekend effect
if factor_analysis.get('weekend_effect', {}).get('significant'):
weekend_yield = factor_analysis['weekend_effect']['weekend_yield']
weekday_yield = factor_analysis['weekend_effect']['weekday_yield']
if abs(weekend_yield - weekday_yield) > 3:
if weekend_yield < weekday_yield:
patterns.append({
'pattern': 'weekend_yield_drop',
'description': f'Weekend production {weekend_yield:.1f}% vs weekday {weekday_yield:.1f}%',
'severity': 'low',
'yield_impact': round(weekday_yield - weekend_yield, 2),
'recommendation': 'Review weekend staffing or processes'
})
return patterns
def _generate_yield_insights(
self,
tenant_id: str,
recipe_id: str,
baseline_stats: Dict[str, Any],
factor_analysis: Dict[str, Any],
patterns: List[Dict[str, Any]],
prediction: Dict[str, Any],
production_context: Dict[str, Any]
) -> List[Dict[str, Any]]:
"""Generate actionable insights for yield improvement."""
insights = []
# Insight 1: Low predicted yield warning
if prediction['predicted_yield'] < 90:
waste_value = prediction['expected_waste_units'] * production_context.get('unit_cost', 5)
insights.append({
'type': 'warning',
'priority': 'high' if prediction['predicted_yield'] < 85 else 'medium',
'category': 'production',
'title': f'Low Yield Predicted: {prediction["predicted_yield"]:.1f}%',
'description': f'Upcoming production run predicted to yield {prediction["predicted_yield"]:.1f}%, below baseline {baseline_stats["mean_yield"]:.1f}%. Expected waste: {prediction["expected_waste_units"]:.1f} units (€{waste_value:.2f}).',
'impact_type': 'waste',
'impact_value': prediction['expected_waste_units'],
'impact_unit': 'units',
'confidence': 75,
'metrics_json': {
'recipe_id': recipe_id,
'predicted_yield': prediction['predicted_yield'],
'expected_waste': prediction['expected_waste_units'],
'waste_value': round(waste_value, 2)
},
'actionable': True,
'recommendation_actions': [{
'label': 'Review Production Setup',
'action': 'review_production_factors',
'params': {
'recipe_id': recipe_id,
'worker_id': production_context.get('worker_id')
}
}]
})
# Insight 2: High-severity patterns
for pattern in patterns:
if pattern.get('severity') == 'high':
if pattern['pattern'] == 'low_yield_worker':
insights.append({
'type': 'opportunity',
'priority': 'high',
'category': 'production',
'title': f'Worker Training Opportunity: {pattern["yield_impact"]:.1f}% Yield Gap',
'description': pattern['description'] + f'. Improving this worker to average performance would save significant waste.',
'impact_type': 'yield_improvement',
'impact_value': pattern['yield_impact'],
'impact_unit': 'percentage_points',
'confidence': 85,
'metrics_json': {
'recipe_id': recipe_id,
'pattern': pattern['pattern'],
'yield_impact': pattern['yield_impact']
},
'actionable': True,
'recommendation_actions': [{
'label': 'Schedule Training',
'action': 'schedule_worker_training',
'params': {'recipe_id': recipe_id}
}]
})
# Insight 3: Excellent yield
if prediction['predicted_yield'] > 98:
insights.append({
'type': 'positive',
'priority': 'low',
'category': 'production',
'title': f'Excellent Yield Expected: {prediction["predicted_yield"]:.1f}%',
'description': f'Optimal production conditions detected. Expected yield {prediction["predicted_yield"]:.1f}% exceeds baseline {baseline_stats["mean_yield"]:.1f}%.',
'impact_type': 'yield_improvement',
'impact_value': prediction['baseline_comparison'],
'impact_unit': 'percentage_points',
'confidence': 70,
'metrics_json': {
'recipe_id': recipe_id,
'predicted_yield': prediction['predicted_yield']
},
'actionable': False
})
# Insight 4: Yield variability issue
if baseline_stats['cv_yield'] > 0.05: # Coefficient of variation > 5%
insights.append({
'type': 'opportunity',
'priority': 'medium',
'category': 'production',
'title': f'High Yield Variability: {baseline_stats["cv_yield"]*100:.1f}% CV',
'description': f'Yield varies significantly across production runs (CV={baseline_stats["cv_yield"]*100:.1f}%, range {baseline_stats["min_yield"]:.1f}%-{baseline_stats["max_yield"]:.1f}%). Standardizing processes could reduce waste.',
'impact_type': 'process_improvement',
'confidence': 80,
'metrics_json': {
'recipe_id': recipe_id,
'cv_yield': round(baseline_stats['cv_yield'], 3),
'yield_range': round(baseline_stats['max_yield'] - baseline_stats['min_yield'], 2)
},
'actionable': True,
'recommendation_actions': [{
'label': 'Standardize Process',
'action': 'review_production_sop',
'params': {'recipe_id': recipe_id}
}]
})
return insights
def _calculate_prediction_confidence(
self,
production_history: pd.DataFrame,
model_results: Dict[str, Any],
factor_analysis: Dict[str, Any]
) -> int:
"""Calculate overall confidence score for predictions."""
confidence_factors = []
# Factor 1: Sample size (0-30 points)
n_runs = len(production_history)
if n_runs >= 100:
sample_score = 30
elif n_runs >= 50:
sample_score = 25
elif n_runs >= 30:
sample_score = 20
else:
sample_score = 10
confidence_factors.append(('sample_size', sample_score))
# Factor 2: Model performance (0-30 points)
r2 = model_results['performance']['r2']
mae = model_results['performance']['mae']
if r2 > 0.7 and mae < 3:
model_score = 30
elif r2 > 0.5 and mae < 5:
model_score = 25
elif r2 > 0.3 and mae < 7:
model_score = 20
else:
model_score = 10
confidence_factors.append(('model_performance', model_score))
# Factor 3: Statistical significance of factors (0-25 points)
significant_factors = sum(
1 for factor in factor_analysis.values()
if isinstance(factor, dict) and factor.get('significant')
)
if significant_factors >= 3:
stats_score = 25
elif significant_factors >= 2:
stats_score = 20
elif significant_factors >= 1:
stats_score = 15
else:
stats_score = 10
confidence_factors.append(('significant_factors', stats_score))
# Factor 4: Data recency (0-15 points)
most_recent = production_history['started_at'].max()
days_old = (datetime.utcnow() - pd.to_datetime(most_recent)).days
if days_old <= 7:
recency_score = 15
elif days_old <= 30:
recency_score = 12
elif days_old <= 90:
recency_score = 8
else:
recency_score = 5
confidence_factors.append(('data_recency', recency_score))
total_confidence = sum(score for _, score in confidence_factors)
return min(100, max(0, total_confidence))
async def analyze_recipe_yield_history(
self,
tenant_id: str,
recipe_id: str,
production_history: pd.DataFrame,
min_history_runs: int = 30
) -> Dict[str, Any]:
"""
Analyze historical yield performance for a recipe (no prediction).
Args:
tenant_id: Tenant identifier
recipe_id: Recipe identifier
production_history: Historical production runs
min_history_runs: Minimum production runs required
Returns:
Historical analysis with insights
"""
logger.info(
"Analyzing recipe yield history",
tenant_id=tenant_id,
recipe_id=recipe_id,
history_runs=len(production_history)
)
if len(production_history) < min_history_runs:
return self._insufficient_data_response(
recipe_id, {}, len(production_history), min_history_runs
)
# Calculate statistics
baseline_stats = self._calculate_baseline_statistics(production_history)
# Feature engineering
feature_df = self._engineer_features(production_history)
# Analyze factors
factor_analysis = self._analyze_yield_factors(feature_df)
# Identify patterns
patterns = self._identify_yield_patterns(feature_df, factor_analysis)
# Generate insights (without prediction)
insights = []
# Add insights for patterns
for pattern in patterns:
if pattern.get('severity') in ['high', 'medium']:
insights.append({
'type': 'opportunity',
'priority': pattern['severity'],
'category': 'production',
'title': f'Yield Pattern Detected: {pattern["pattern"]}',
'description': pattern['description'],
'impact_type': 'yield_improvement',
'confidence': 80,
'metrics_json': {
'recipe_id': recipe_id,
'pattern': pattern
},
'actionable': True,
'recommendation': pattern['recommendation']
})
return {
'recipe_id': recipe_id,
'analyzed_at': datetime.utcnow().isoformat(),
'history_runs': len(production_history),
'baseline_stats': baseline_stats,
'factor_analysis': factor_analysis,
'patterns': patterns,
'insights': insights
}
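As a worked example of the confidence scoring in _calculate_prediction_confidence (my arithmetic under assumed inputs, not a figure from the commit): a recipe with 50 historical runs scores 25 points for sample size, a best model with R² = 0.6 and MAE = 4 scores 25 for model performance, two statistically significant factors score 20, and data at most 30 days old scores 12, giving a total confidence of 25 + 25 + 20 + 12 = 82 (clamped to 0-100).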

View File

@@ -11,7 +11,7 @@ import structlog
from shared.database.repository import BaseRepository
from shared.database.exceptions import DatabaseError
from shared.database.transactions import transactional
logger = structlog.get_logger()
@@ -56,7 +56,6 @@ class ProductionBaseRepository(BaseRepository):
)
return await self.get_by_tenant_id(tenant_id, skip, limit)
@transactional
async def get_by_date_range(
self,
tenant_id: str,
@@ -89,7 +88,6 @@ class ProductionBaseRepository(BaseRepository):
error=str(e), tenant_id=tenant_id)
raise DatabaseError(f"Failed to fetch records by date range: {str(e)}")
@transactional
async def get_active_records(
self,
tenant_id: str,

View File

@@ -13,7 +13,7 @@ import structlog
from .base import ProductionBaseRepository
from app.models.production import ProductionCapacity
from shared.database.exceptions import DatabaseError, ValidationError
from shared.database.transactions import transactional
logger = structlog.get_logger()
@@ -25,7 +25,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
# Capacity data changes moderately, medium cache time (10 minutes)
super().__init__(ProductionCapacity, session, cache_ttl)
@transactional
async def create_capacity(self, capacity_data: Dict[str, Any]) -> ProductionCapacity:
"""Create a new production capacity entry with validation"""
try:
@@ -68,7 +67,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
logger.error("Error creating production capacity", error=str(e))
raise DatabaseError(f"Failed to create production capacity: {str(e)}")
@transactional
async def get_capacity_by_resource(
self,
tenant_id: str,
@@ -101,7 +99,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
logger.error("Error fetching capacity by resource", error=str(e))
raise DatabaseError(f"Failed to fetch capacity by resource: {str(e)}")
@transactional
async def get_available_capacity(
self,
tenant_id: str,
@@ -136,7 +133,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
logger.error("Error fetching available capacity", error=str(e))
raise DatabaseError(f"Failed to fetch available capacity: {str(e)}")
@transactional
async def allocate_capacity(
self,
capacity_id: UUID,
@@ -183,7 +179,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
logger.error("Error allocating capacity", error=str(e))
raise DatabaseError(f"Failed to allocate capacity: {str(e)}")
@transactional
async def release_capacity(
self,
capacity_id: UUID,
@@ -230,7 +225,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
logger.error("Error releasing capacity", error=str(e))
raise DatabaseError(f"Failed to release capacity: {str(e)}")
@transactional
async def get_capacity_utilization_summary(
self,
tenant_id: str,
@@ -299,7 +293,6 @@ class ProductionCapacityRepository(ProductionBaseRepository):
logger.error("Error calculating capacity utilization summary", error=str(e))
raise DatabaseError(f"Failed to calculate capacity utilization summary: {str(e)}")
@transactional
async def set_maintenance_mode(
self,
capacity_id: UUID,

View File

@@ -13,7 +13,7 @@ import structlog
from .base import ProductionBaseRepository
from app.models.production import ProductionSchedule
from shared.database.exceptions import DatabaseError, ValidationError
from shared.database.transactions import transactional
logger = structlog.get_logger()
@@ -25,7 +25,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
# Schedules are more stable, medium cache time (10 minutes)
super().__init__(ProductionSchedule, session, cache_ttl)
@transactional
async def create_schedule(self, schedule_data: Dict[str, Any]) -> ProductionSchedule:
"""Create a new production schedule with validation"""
try:
@@ -71,7 +70,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
logger.error("Error creating production schedule", error=str(e))
raise DatabaseError(f"Failed to create production schedule: {str(e)}")
@transactional
async def get_schedule_by_date(
self,
tenant_id: str,
@@ -101,7 +99,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
logger.error("Error fetching schedule by date", error=str(e))
raise DatabaseError(f"Failed to fetch schedule by date: {str(e)}")
@transactional
async def get_schedules_by_date_range(
self,
tenant_id: str,
@@ -131,7 +128,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
logger.error("Error fetching schedules by date range", error=str(e))
raise DatabaseError(f"Failed to fetch schedules by date range: {str(e)}")
@transactional
async def get_active_schedules(self, tenant_id: str) -> List[ProductionSchedule]:
"""Get active production schedules for a tenant"""
try:
@@ -153,7 +149,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
logger.error("Error fetching active schedules", error=str(e))
raise DatabaseError(f"Failed to fetch active schedules: {str(e)}")
@transactional
async def finalize_schedule(
self,
schedule_id: UUID,
@@ -188,7 +183,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
logger.error("Error finalizing schedule", error=str(e))
raise DatabaseError(f"Failed to finalize schedule: {str(e)}")
@transactional
async def update_schedule_metrics(
self,
schedule_id: UUID,
@@ -227,7 +221,6 @@ class ProductionScheduleRepository(ProductionBaseRepository):
logger.error("Error updating schedule metrics", error=str(e))
raise DatabaseError(f"Failed to update schedule metrics: {str(e)}")
@transactional
async def get_schedule_performance_summary(
self,
tenant_id: str,

View File

@@ -13,7 +13,7 @@ import structlog
from .base import ProductionBaseRepository
from app.models.production import QualityCheck
from shared.database.exceptions import DatabaseError, ValidationError
from shared.database.transactions import transactional
logger = structlog.get_logger()
@@ -25,7 +25,6 @@ class QualityCheckRepository(ProductionBaseRepository):
# Quality checks are dynamic, short cache time (5 minutes)
super().__init__(QualityCheck, session, cache_ttl)
@transactional
async def create_quality_check(self, check_data: Dict[str, Any]) -> QualityCheck:
"""Create a new quality check with validation"""
try:
@@ -69,7 +68,6 @@ class QualityCheckRepository(ProductionBaseRepository):
logger.error("Error creating quality check", error=str(e))
raise DatabaseError(f"Failed to create quality check: {str(e)}")
@transactional
async def get_checks_by_batch(
self,
tenant_id: str,
@@ -96,7 +94,6 @@ class QualityCheckRepository(ProductionBaseRepository):
logger.error("Error fetching quality checks by batch", error=str(e))
raise DatabaseError(f"Failed to fetch quality checks by batch: {str(e)}")
@transactional
async def get_checks_by_date_range(
self,
tenant_id: str,
@@ -136,7 +133,6 @@ class QualityCheckRepository(ProductionBaseRepository):
logger.error("Error fetching quality checks by date range", error=str(e))
raise DatabaseError(f"Failed to fetch quality checks by date range: {str(e)}")
@transactional
async def get_failed_checks(
self,
tenant_id: str,
@@ -167,7 +163,6 @@ class QualityCheckRepository(ProductionBaseRepository):
logger.error("Error fetching failed quality checks", error=str(e))
raise DatabaseError(f"Failed to fetch failed quality checks: {str(e)}")
@transactional
async def get_quality_metrics(
self,
tenant_id: str,
@@ -247,7 +242,6 @@ class QualityCheckRepository(ProductionBaseRepository):
logger.error("Error calculating quality metrics", error=str(e))
raise DatabaseError(f"Failed to calculate quality metrics: {str(e)}")
@transactional
async def get_quality_trends(
self,
tenant_id: str,

View File

@@ -952,6 +952,28 @@ class ProductionService:
raise
# Capacity Methods
async def get_capacity_by_date(
self,
tenant_id: UUID,
target_date: date
) -> List[Dict[str, Any]]:
"""Get capacity entries for a specific date"""
try:
async with self.database_manager.get_session() as session:
capacity_repo = ProductionCapacityRepository(session)
capacity_list = await capacity_repo.get_capacity_by_date(
str(tenant_id), target_date
)
# Convert to dictionaries for API response
return [capacity.to_dict() for capacity in capacity_list]
except Exception as e:
logger.error("Error getting capacity by date",
error=str(e), tenant_id=str(tenant_id), date=target_date.isoformat())
raise
async def get_capacity_list(
self,
tenant_id: UUID,

View File

@@ -29,6 +29,12 @@ APScheduler==3.10.4
python-dateutil==2.9.0.post0
pytz==2024.2
# Data processing for ML insights
pandas==2.2.3
numpy==2.2.1
scikit-learn==1.6.1
scipy==1.15.1
# Validation and utilities
email-validator==2.2.0

View File

@@ -0,0 +1,578 @@
"""
Tests for Production Yield Predictor
"""
import pytest
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from services.production.app.ml.yield_predictor import YieldPredictor
@pytest.fixture
def yield_predictor():
"""Create YieldPredictor instance."""
return YieldPredictor()
@pytest.fixture
def stable_yield_history():
"""Generate production history with stable high yield."""
np.random.seed(42)
base_date = datetime.utcnow() - timedelta(days=180)
history = []
for i in range(50):
run_date = base_date + timedelta(days=i * 3)
history.append({
'production_run_id': f'run_{i}',
'recipe_id': 'recipe_123',
'planned_quantity': 100,
'actual_quantity': np.random.normal(97, 1.5), # 97% avg, low variance
'yield_percentage': np.random.normal(97, 1.5),
'worker_id': f'worker_{i % 3}', # 3 workers
'started_at': run_date,
'completed_at': run_date + timedelta(hours=4),
'batch_size': np.random.randint(80, 120)
})
df = pd.DataFrame(history)
df['yield_percentage'] = df['yield_percentage'].clip(90, 100)
return df
@pytest.fixture
def variable_yield_history():
"""Generate production history with variable yield."""
np.random.seed(42)
base_date = datetime.utcnow() - timedelta(days=180)
history = []
workers = ['worker_expert', 'worker_intermediate', 'worker_novice']
worker_skills = {'worker_expert': 96, 'worker_intermediate': 90, 'worker_novice': 82}
for i in range(60):
run_date = base_date + timedelta(days=i * 3)
worker = workers[i % 3]
base_yield = worker_skills[worker]
# Time of day effect
hour = (6 + i * 2) % 24
time_penalty = 5 if hour < 6 or hour > 22 else 0
# Batch size effect
batch_size = np.random.randint(50, 150)
batch_penalty = 3 if batch_size > 120 else 0
final_yield = base_yield - time_penalty - batch_penalty + np.random.normal(0, 2)
history.append({
'production_run_id': f'run_{i}',
'recipe_id': 'recipe_456',
'planned_quantity': 100,
'actual_quantity': final_yield,
'yield_percentage': final_yield,
'worker_id': worker,
'started_at': run_date.replace(hour=hour),
'completed_at': run_date.replace(hour=hour) + timedelta(hours=4),
'batch_size': batch_size
})
df = pd.DataFrame(history)
df['yield_percentage'] = df['yield_percentage'].clip(70, 100)
return df
@pytest.fixture
def low_yield_history():
"""Generate production history with consistently low yield."""
np.random.seed(42)
base_date = datetime.utcnow() - timedelta(days=120)
history = []
for i in range(40):
run_date = base_date + timedelta(days=i * 3)
        yield_pct = np.random.normal(82, 5)  # ~82% average, high variance
        history.append({
            'production_run_id': f'run_{i}',
            'recipe_id': 'recipe_789',
            'planned_quantity': 100,
            'actual_quantity': yield_pct,  # planned 100 units, so actual == yield %
            'yield_percentage': yield_pct,
            'worker_id': f'worker_{i % 2}',
            'started_at': run_date,
            'completed_at': run_date + timedelta(hours=4),
            'batch_size': np.random.randint(80, 120)
        })
df = pd.DataFrame(history)
df['yield_percentage'] = df['yield_percentage'].clip(60, 95)
return df
@pytest.fixture
def production_context_optimal():
"""Production context for optimal conditions."""
return {
'worker_id': 'worker_expert',
'planned_start_time': (datetime.utcnow() + timedelta(days=1)).replace(hour=10),
'batch_size': 100,
'planned_quantity': 100,
'unit_cost': 5.0
}
@pytest.fixture
def production_context_suboptimal():
"""Production context for suboptimal conditions."""
return {
'worker_id': 'worker_novice',
'planned_start_time': (datetime.utcnow() + timedelta(days=1)).replace(hour=4),
'batch_size': 140,
'planned_quantity': 100,
'unit_cost': 5.0
}
class TestYieldPredictorBasics:
"""Test basic functionality."""
@pytest.mark.asyncio
async def test_insufficient_data(self, yield_predictor):
"""Test handling of insufficient production history."""
# Create minimal history (< 30 runs)
history = pd.DataFrame([{
'production_run_id': 'run_1',
'recipe_id': 'recipe_123',
'planned_quantity': 100,
'actual_quantity': 95,
'yield_percentage': 95,
'worker_id': 'worker_1',
'started_at': datetime.utcnow() - timedelta(days=1),
'completed_at': datetime.utcnow() - timedelta(hours=20),
'batch_size': 100
}])
context = {
'worker_id': 'worker_1',
'planned_start_time': datetime.utcnow() + timedelta(days=1),
'batch_size': 100,
'planned_quantity': 100
}
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_123',
production_history=history,
production_context=context,
min_history_runs=30
)
assert result['status'] == 'insufficient_data'
assert result['history_runs'] == 1
assert result['required_runs'] == 30
assert len(result['insights']) == 1
assert result['insights'][0]['type'] == 'warning'
@pytest.mark.asyncio
async def test_baseline_statistics_stable_yield(self, yield_predictor, stable_yield_history):
"""Test baseline statistics calculation for stable yield."""
stats = yield_predictor._calculate_baseline_statistics(stable_yield_history)
assert 95 < stats['mean_yield'] < 99
assert stats['std_yield'] < 3 # Low variance
assert stats['cv_yield'] < 0.05 # Low coefficient of variation
assert stats['min_yield'] >= 90
assert stats['max_yield'] <= 100
@pytest.mark.asyncio
async def test_baseline_statistics_variable_yield(self, yield_predictor, variable_yield_history):
"""Test baseline statistics for variable yield."""
stats = yield_predictor._calculate_baseline_statistics(variable_yield_history)
assert 85 < stats['mean_yield'] < 93
assert stats['std_yield'] > 3 # Higher variance
assert stats['cv_yield'] > 0.03
assert stats['runs_below_90'] > 0
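# Note on the cv_yield thresholds above: they assume the coefficient of
# variation is reported as std / mean (an assumption about YieldPredictor, not
# verified here). Roughly: the stable fixture has std ~1.5 on a ~97% mean
# (cv ~0.015 < 0.05), while the variable fixture has std ~4-5 on a ~90% mean
# (cv ~0.05 > 0.03).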
class TestFeatureEngineering:
"""Test feature engineering."""
@pytest.mark.asyncio
async def test_time_features(self, yield_predictor, stable_yield_history):
"""Test time-based feature extraction."""
feature_df = yield_predictor._engineer_features(stable_yield_history)
assert 'hour_of_day' in feature_df.columns
assert 'day_of_week' in feature_df.columns
assert 'is_weekend' in feature_df.columns
assert 'is_early_morning' in feature_df.columns
assert 'is_late_night' in feature_df.columns
assert feature_df['hour_of_day'].min() >= 0
assert feature_df['hour_of_day'].max() <= 23
assert feature_df['day_of_week'].min() >= 0
assert feature_df['day_of_week'].max() <= 6
@pytest.mark.asyncio
async def test_batch_size_features(self, yield_predictor, stable_yield_history):
"""Test batch size feature engineering."""
feature_df = yield_predictor._engineer_features(stable_yield_history)
assert 'batch_size_normalized' in feature_df.columns
assert 'is_large_batch' in feature_df.columns
assert 'is_small_batch' in feature_df.columns
# Normalized batch size should be around 1.0 on average
assert 0.5 < feature_df['batch_size_normalized'].mean() < 1.5
@pytest.mark.asyncio
async def test_worker_experience_features(self, yield_predictor, variable_yield_history):
"""Test worker experience feature engineering."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
assert 'worker_run_count' in feature_df.columns
assert 'worker_experience_level' in feature_df.columns
# Worker run count should increase for each worker
for worker in feature_df['worker_id'].unique():
worker_runs = feature_df[feature_df['worker_id'] == worker]['worker_run_count']
assert worker_runs.is_monotonic_increasing
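# The monotonicity check above matches the standard pandas idiom for a
# cumulative per-worker experience counter. A sketch of one plausible
# implementation (column names are assumptions about YieldPredictor):
#
#     df = df.sort_values('started_at')
#     df['worker_run_count'] = df.groupby('worker_id').cumcount() + 1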
class TestFactorAnalysis:
"""Test yield factor analysis."""
@pytest.mark.asyncio
async def test_worker_impact_detection(self, yield_predictor, variable_yield_history):
"""Test detection of worker impact on yield."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
factor_analysis = yield_predictor._analyze_yield_factors(feature_df)
assert 'worker' in factor_analysis
# Should detect worker skill differences
if factor_analysis['worker'].get('significant'):
assert 'best_worker' in factor_analysis['worker']
assert 'worst_worker' in factor_analysis['worker']
assert factor_analysis['worker']['yield_range'] > 0
@pytest.mark.asyncio
async def test_batch_size_correlation(self, yield_predictor, variable_yield_history):
"""Test batch size correlation analysis."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
factor_analysis = yield_predictor._analyze_yield_factors(feature_df)
assert 'batch_size' in factor_analysis
if factor_analysis['batch_size'].get('significant'):
assert 'correlation' in factor_analysis['batch_size']
assert 'direction' in factor_analysis['batch_size']
assert factor_analysis['batch_size']['direction'] in ['positive', 'negative']
@pytest.mark.asyncio
async def test_time_of_day_effect(self, yield_predictor, variable_yield_history):
"""Test time of day effect analysis."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
factor_analysis = yield_predictor._analyze_yield_factors(feature_df)
assert 'time_of_day' in factor_analysis
class TestYieldPrediction:
"""Test yield prediction."""
@pytest.mark.asyncio
async def test_predict_stable_yield(self, yield_predictor, stable_yield_history, production_context_optimal):
"""Test prediction for stable yield recipe."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_123',
production_history=stable_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
assert result['status'] != 'insufficient_data'
assert result['predicted_yield'] is not None
assert 90 < result['predicted_yield'] < 100
assert result['confidence'] > 0
assert 'prediction_range' in result
assert result['prediction_range']['lower'] < result['predicted_yield']
assert result['prediction_range']['upper'] > result['predicted_yield']
@pytest.mark.asyncio
async def test_predict_variable_yield_optimal_context(
self, yield_predictor, variable_yield_history, production_context_optimal
):
"""Test prediction with optimal production context."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_456',
production_history=variable_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
assert result['predicted_yield'] is not None
# Optimal context should predict higher yield
assert result['predicted_yield'] > result['baseline_yield'] - 5
@pytest.mark.asyncio
async def test_predict_variable_yield_suboptimal_context(
self, yield_predictor, variable_yield_history, production_context_suboptimal
):
"""Test prediction with suboptimal production context."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_456',
production_history=variable_yield_history,
production_context=production_context_suboptimal,
min_history_runs=30
)
assert result['predicted_yield'] is not None
        # Suboptimal context (novice worker, early-morning start, large batch)
        # is expected to push the prediction below baseline; no hard threshold is
        # asserted because the size of the effect depends on the trained model.
@pytest.mark.asyncio
async def test_expected_waste_calculation(
self, yield_predictor, low_yield_history, production_context_optimal
):
"""Test expected waste calculation."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_789',
production_history=low_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
assert 'expected_waste' in result
assert result['expected_waste'] > 0
# For low yield (82%), waste should be significant
expected_waste_pct = 100 - result['predicted_yield']
assert expected_waste_pct > 5
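# Waste arithmetic behind the assertions above (a sketch; the exact formula is
# internal to YieldPredictor): with 100 units planned and a predicted yield of
# roughly 82%, expected waste is about 100 * (1 - 0.82) = 18 units, comfortably
# above the 5-percentage-point threshold checked here.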
class TestPatternDetection:
"""Test yield pattern identification."""
@pytest.mark.asyncio
async def test_low_yield_worker_pattern(self, yield_predictor, variable_yield_history):
"""Test detection of low-yield worker pattern."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
factor_analysis = yield_predictor._analyze_yield_factors(feature_df)
patterns = yield_predictor._identify_yield_patterns(feature_df, factor_analysis)
# Should detect novice worker pattern
low_worker_patterns = [p for p in patterns if p['pattern'] == 'low_yield_worker']
if factor_analysis.get('worker', {}).get('significant'):
assert len(low_worker_patterns) > 0
pattern = low_worker_patterns[0]
assert pattern['severity'] in ['high', 'medium', 'low']
assert 'recommendation' in pattern
@pytest.mark.asyncio
async def test_time_of_day_pattern(self, yield_predictor, variable_yield_history):
"""Test detection of time-of-day pattern."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
factor_analysis = yield_predictor._analyze_yield_factors(feature_df)
patterns = yield_predictor._identify_yield_patterns(feature_df, factor_analysis)
        # May detect an early-morning low-yield pattern; detection is conditional
        # on statistical significance, so only the structure is asserted here
        time_patterns = [p for p in patterns if p['pattern'] == 'low_yield_time']
        assert isinstance(time_patterns, list)
class TestInsightGeneration:
"""Test insight generation."""
@pytest.mark.asyncio
async def test_low_yield_warning_insight(
self, yield_predictor, low_yield_history, production_context_optimal
):
"""Test generation of low yield warning insight."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_789',
production_history=low_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
# Should generate low yield warning
warning_insights = [i for i in result['insights'] if i['type'] == 'warning']
assert len(warning_insights) > 0
warning = warning_insights[0]
assert warning['priority'] in ['high', 'medium']
assert warning['category'] == 'production'
assert 'impact_value' in warning
assert warning['actionable'] is True
@pytest.mark.asyncio
async def test_excellent_yield_insight(
self, yield_predictor, stable_yield_history, production_context_optimal
):
"""Test generation of excellent yield insight."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_123',
production_history=stable_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
# May generate positive insight for excellent yield
positive_insights = [i for i in result['insights'] if i['type'] == 'positive']
if result['predicted_yield'] > 98:
assert len(positive_insights) > 0
@pytest.mark.asyncio
async def test_yield_variability_insight(
self, yield_predictor, variable_yield_history, production_context_optimal
):
"""Test generation of yield variability insight."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_456',
production_history=variable_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
# Should detect high variability
if result['baseline_std'] / result['baseline_yield'] > 0.05:
variability_insights = [
i for i in result['insights']
if 'variability' in i['title'].lower() or 'variability' in i['description'].lower()
]
assert len(variability_insights) > 0
class TestConfidenceScoring:
"""Test confidence score calculation."""
@pytest.mark.asyncio
async def test_high_confidence_large_sample(
self, yield_predictor, stable_yield_history, production_context_optimal
):
"""Test high confidence with large stable sample."""
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_123',
production_history=stable_yield_history,
production_context=production_context_optimal,
min_history_runs=30
)
# Large sample + stable data should give high confidence
assert result['confidence'] > 60
@pytest.mark.asyncio
async def test_lower_confidence_small_sample(self, yield_predictor, production_context_optimal):
"""Test lower confidence with small sample."""
# Create small history (exactly 30 runs)
        np.random.seed(42)  # keep the small-sample history deterministic
        small_history = pd.DataFrame([{
            'production_run_id': f'run_{i}',
            'recipe_id': 'recipe_123',
            'planned_quantity': 100,
            'actual_quantity': (yield_pct := 95 + np.random.normal(0, 2)),
            'yield_percentage': yield_pct,
            'worker_id': 'worker_1',
            'started_at': datetime.utcnow() - timedelta(days=90 - i),
            # each run completes 4 hours after it starts
            'completed_at': datetime.utcnow() - timedelta(days=90 - i) + timedelta(hours=4),
            'batch_size': 100
        } for i in range(30)])
result = await yield_predictor.predict_yield(
tenant_id='tenant_123',
recipe_id='recipe_123',
production_history=small_history,
production_context=production_context_optimal,
min_history_runs=30
)
# Small sample should give moderate confidence
assert result['confidence'] < 85
class TestHistoricalAnalysis:
"""Test historical analysis (no prediction)."""
@pytest.mark.asyncio
async def test_analyze_recipe_history(self, yield_predictor, variable_yield_history):
"""Test historical analysis without prediction."""
result = await yield_predictor.analyze_recipe_yield_history(
tenant_id='tenant_123',
recipe_id='recipe_456',
production_history=variable_yield_history,
min_history_runs=30
)
assert result['recipe_id'] == 'recipe_456'
assert 'baseline_stats' in result
assert 'factor_analysis' in result
assert 'patterns' in result
assert 'insights' in result
@pytest.mark.asyncio
async def test_analyze_insufficient_history(self, yield_predictor):
"""Test analysis with insufficient history."""
small_history = pd.DataFrame([{
'production_run_id': 'run_1',
'recipe_id': 'recipe_123',
'planned_quantity': 100,
'actual_quantity': 95,
'yield_percentage': 95,
'worker_id': 'worker_1',
'started_at': datetime.utcnow() - timedelta(days=1),
'completed_at': datetime.utcnow() - timedelta(hours=20),
'batch_size': 100
}])
result = await yield_predictor.analyze_recipe_yield_history(
tenant_id='tenant_123',
recipe_id='recipe_123',
production_history=small_history,
min_history_runs=30
)
assert result['status'] == 'insufficient_data'
class TestModelPerformance:
"""Test ML model performance."""
@pytest.mark.asyncio
async def test_model_training(self, yield_predictor, variable_yield_history):
"""Test model training and performance metrics."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
model_results = yield_predictor._train_yield_model(feature_df)
assert 'best_model' in model_results
assert 'best_model_name' in model_results
assert 'performance' in model_results
assert 'feature_importance' in model_results
performance = model_results['performance']
assert 'mae' in performance
assert 'rmse' in performance
assert 'r2' in performance
# MAE should be reasonable (< 15 percentage points)
assert performance['mae'] < 15
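    # The performance keys asserted above map onto the usual scikit-learn
    # helpers; a hedged sketch of how they are typically computed (the
    # predictor's internals may differ):
    #
    #     from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    #     mae = mean_absolute_error(y_true, y_pred)
    #     rmse = mean_squared_error(y_true, y_pred) ** 0.5
    #     r2 = r2_score(y_true, y_pred)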
@pytest.mark.asyncio
async def test_feature_importance(self, yield_predictor, variable_yield_history):
"""Test feature importance extraction."""
feature_df = yield_predictor._engineer_features(variable_yield_history)
model_results = yield_predictor._train_yield_model(feature_df)
feature_importance = model_results['feature_importance']
        # When importances are exposed, the worker encoding should be among
        # them, since skill differences drive the yield variation in this fixture
        if len(feature_importance) > 0:
            assert 'worker_encoded' in feature_importance