# Source file: bakery-ia/services/production/tests/test_yield_predictor.py
"""
Tests for Production Yield Predictor
"""
import pytest
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from services.production.app.ml.yield_predictor import YieldPredictor
@pytest.fixture
def yield_predictor():
    """Provide a newly constructed ``YieldPredictor`` to the requesting test."""
    predictor = YieldPredictor()
    return predictor
@pytest.fixture
def stable_yield_history():
    """Generate production history with stable high yield.

    Builds 50 runs for ``recipe_123``, spaced three days apart and starting
    180 days before the current UTC time. ``actual_quantity`` and
    ``yield_percentage`` are each drawn from ``normal(97, 1.5)`` (two
    independent draws per run), and ``yield_percentage`` is clipped to the
    range [90, 100] afterwards.

    Returns:
        pandas.DataFrame with one row per production run.
    """
    # Fixed seed so the generated history is identical on every run.
    np.random.seed(42)
    base_date = datetime.utcnow() - timedelta(days=180)

    history = []
    for i in range(50):
        run_date = base_date + timedelta(days=i * 3)
        history.append({
            'production_run_id': f'run_{i}',
            'recipe_id': 'recipe_123',
            'planned_quantity': 100,
            'actual_quantity': np.random.normal(97, 1.5),  # 97% avg, low variance
            'yield_percentage': np.random.normal(97, 1.5),
            # NOTE(review): this fixture stores a list under 'staff_assigned'
            # while the other history fixtures in this file use a scalar
            # 'worker_id' -- confirm which schema YieldPredictor expects.
            'staff_assigned': [f'worker_{i % 3}'],  # 3 workers
            'started_at': run_date,
            'completed_at': run_date + timedelta(hours=4),
            'batch_size': np.random.randint(80, 120)
        })

    df = pd.DataFrame(history)
    # Keep the sampled yields inside the "stable, high" band.
    df['yield_percentage'] = df['yield_percentage'].clip(90, 100)
    return df
@pytest.fixture
def variable_yield_history():
    """Build a 60-run history whose yield depends on worker, hour and batch size.

    Runs for ``recipe_456`` are spaced three days apart starting 180 days
    before the current UTC time. Each run's yield starts from the assigned
    worker's base value, loses 5 points outside the 06:00-22:00 window,
    loses 3 points for batches above 120, and gets ``normal(0, 2)`` noise.
    The ``yield_percentage`` column is clipped to [70, 100].

    Returns:
        pandas.DataFrame with one row per production run.
    """
    np.random.seed(42)
    start = datetime.utcnow() - timedelta(days=180)

    skill_by_worker = {'worker_expert': 96, 'worker_intermediate': 90, 'worker_novice': 82}
    roster = ['worker_expert', 'worker_intermediate', 'worker_novice']

    rows = []
    for run_idx in range(60):
        # Workers rotate round-robin; start hour advances two hours per run.
        worker = roster[run_idx % 3]
        hour = (6 + run_idx * 2) % 24
        started = (start + timedelta(days=run_idx * 3)).replace(hour=hour)

        # Time of day effect
        if hour < 6 or hour > 22:
            time_penalty = 5
        else:
            time_penalty = 0

        # Batch size effect (batch size is drawn before the yield noise)
        batch_size = np.random.randint(50, 150)
        if batch_size > 120:
            batch_penalty = 3
        else:
            batch_penalty = 0

        noise = np.random.normal(0, 2)
        final_yield = skill_by_worker[worker] - time_penalty - batch_penalty + noise

        rows.append({
            'production_run_id': f'run_{run_idx}',
            'recipe_id': 'recipe_456',
            'planned_quantity': 100,
            'actual_quantity': final_yield,
            'yield_percentage': final_yield,
            'worker_id': worker,
            'started_at': started,
            'completed_at': started + timedelta(hours=4),
            'batch_size': batch_size
        })

    frame = pd.DataFrame(rows)
    frame['yield_percentage'] = frame['yield_percentage'].clip(70, 100)
    return frame
@pytest.fixture
def low_yield_history():
    """Build a 40-run history with consistently low, noisy yield.

    Runs for ``recipe_789`` are spaced three days apart starting 120 days
    before the current UTC time and alternate between two workers.
    ``actual_quantity`` and ``yield_percentage`` are independent
    ``normal(82, 5)`` draws; ``yield_percentage`` is clipped to [60, 95].

    Returns:
        pandas.DataFrame with one row per production run.
    """
    np.random.seed(42)
    start = datetime.utcnow() - timedelta(days=120)

    rows = []
    for run_idx in range(40):
        started = start + timedelta(days=run_idx * 3)
        # Draw order matters for reproducibility: quantity, yield, batch size.
        actual = np.random.normal(82, 5)  # 82% avg, high variance
        yield_pct = np.random.normal(82, 5)
        batch = np.random.randint(80, 120)
        rows.append({
            'production_run_id': f'run_{run_idx}',
            'recipe_id': 'recipe_789',
            'planned_quantity': 100,
            'actual_quantity': actual,
            'yield_percentage': yield_pct,
            'worker_id': f'worker_{run_idx % 2}',
            'started_at': started,
            'completed_at': started + timedelta(hours=4),
            'batch_size': batch
        })

    frame = pd.DataFrame(rows)
    frame['yield_percentage'] = frame['yield_percentage'].clip(60, 95)
    return frame
@pytest.fixture
def production_context_optimal():
    """Return a context dict: expert worker, 10:00 start tomorrow, batch of 100."""
    tomorrow = datetime.utcnow() + timedelta(days=1)
    context = {
        'worker_id': 'worker_expert',
        'planned_start_time': tomorrow.replace(hour=10),
        'batch_size': 100,
        'planned_quantity': 100,
        'unit_cost': 5.0
    }
    return context
@pytest.fixture
def production_context_suboptimal():
    """Return a context dict: novice worker, 04:00 start tomorrow, batch of 140."""
    tomorrow = datetime.utcnow() + timedelta(days=1)
    context = {
        'worker_id': 'worker_novice',
        'planned_start_time': tomorrow.replace(hour=4),
        'batch_size': 140,
        'planned_quantity': 100,
        'unit_cost': 5.0
    }
    return context
class TestYieldPredictorBasics:
    """Insufficient-data handling and baseline statistics checks."""

    @pytest.mark.asyncio
    async def test_insufficient_data(self, yield_predictor):
        """A one-run history must be reported as insufficient for prediction."""
        # Only one run, well below the 30 runs requested via min_history_runs.
        single_run = {
            'production_run_id': 'run_1',
            'recipe_id': 'recipe_123',
            'planned_quantity': 100,
            'actual_quantity': 95,
            'yield_percentage': 95,
            'worker_id': 'worker_1',
            'started_at': datetime.utcnow() - timedelta(days=1),
            'completed_at': datetime.utcnow() - timedelta(hours=20),
            'batch_size': 100
        }
        history = pd.DataFrame([single_run])

        context = {
            'worker_id': 'worker_1',
            'planned_start_time': datetime.utcnow() + timedelta(days=1),
            'batch_size': 100,
            'planned_quantity': 100
        }

        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_123',
            production_history=history,
            production_context=context,
            min_history_runs=30
        )

        assert result['status'] == 'insufficient_data'
        assert result['history_runs'] == 1
        assert result['required_runs'] == 30
        # Exactly one insight is expected, and it must be a warning.
        assert len(result['insights']) == 1
        first_insight = result['insights'][0]
        assert first_insight['type'] == 'warning'

    @pytest.mark.asyncio
    async def test_baseline_statistics_stable_yield(self, yield_predictor, stable_yield_history):
        """Baseline statistics of the stable history fall in tight bounds."""
        baseline = yield_predictor._calculate_baseline_statistics(stable_yield_history)

        assert 95 < baseline['mean_yield'] < 99
        assert baseline['std_yield'] < 3  # Low variance
        assert baseline['cv_yield'] < 0.05  # Low coefficient of variation
        # The fixture clips yields to [90, 100], so the extremes must respect that.
        assert baseline['min_yield'] >= 90
        assert baseline['max_yield'] <= 100

    @pytest.mark.asyncio
    async def test_baseline_statistics_variable_yield(self, yield_predictor, variable_yield_history):
        """Baseline statistics of the variable history show wider spread."""
        baseline = yield_predictor._calculate_baseline_statistics(variable_yield_history)

        assert 85 < baseline['mean_yield'] < 93
        assert baseline['std_yield'] > 3  # Higher variance
        assert baseline['cv_yield'] > 0.03
        assert baseline['runs_below_90'] > 0
class TestFeatureEngineering:
    """Checks on the columns produced by ``_engineer_features``."""

    @pytest.mark.asyncio
    async def test_time_features(self, yield_predictor, stable_yield_history):
        """Time-derived columns exist and stay within calendar bounds."""
        features = yield_predictor._engineer_features(stable_yield_history)

        expected_columns = (
            'hour_of_day',
            'day_of_week',
            'is_weekend',
            'is_early_morning',
            'is_late_night',
        )
        for column in expected_columns:
            assert column in features.columns

        hours = features['hour_of_day']
        assert hours.min() >= 0
        assert hours.max() <= 23

        weekdays = features['day_of_week']
        assert weekdays.min() >= 0
        assert weekdays.max() <= 6

    @pytest.mark.asyncio
    async def test_batch_size_features(self, yield_predictor, stable_yield_history):
        """Batch-size columns exist and the normalized size averages near 1."""
        features = yield_predictor._engineer_features(stable_yield_history)

        for column in ('batch_size_normalized', 'is_large_batch', 'is_small_batch'):
            assert column in features.columns

        # Normalized batch size should be around 1.0 on average
        mean_normalized = features['batch_size_normalized'].mean()
        assert 0.5 < mean_normalized < 1.5

    @pytest.mark.asyncio
    async def test_worker_experience_features(self, yield_predictor, variable_yield_history):
        """Worker experience columns exist and run counts never decrease."""
        features = yield_predictor._engineer_features(variable_yield_history)

        for column in ('worker_run_count', 'worker_experience_level'):
            assert column in features.columns

        # Worker run count should increase for each worker
        for worker in features['worker_id'].unique():
            is_this_worker = features['worker_id'] == worker
            run_counts = features[is_this_worker]['worker_run_count']
            assert run_counts.is_monotonic_increasing
class TestFactorAnalysis:
    """Checks on the result of ``_analyze_yield_factors``."""

    @pytest.mark.asyncio
    async def test_worker_impact_detection(self, yield_predictor, variable_yield_history):
        """The analysis reports a worker factor, with details when significant."""
        features = yield_predictor._engineer_features(variable_yield_history)
        analysis = yield_predictor._analyze_yield_factors(features)

        assert 'worker' in analysis

        # Should detect worker skill differences; the detailed keys are only
        # required when the factor is flagged as significant.
        worker_factor = analysis['worker']
        if worker_factor.get('significant'):
            assert 'best_worker' in worker_factor
            assert 'worst_worker' in worker_factor
            assert worker_factor['yield_range'] > 0

    @pytest.mark.asyncio
    async def test_batch_size_correlation(self, yield_predictor, variable_yield_history):
        """The analysis reports a batch-size factor with a signed direction."""
        features = yield_predictor._engineer_features(variable_yield_history)
        analysis = yield_predictor._analyze_yield_factors(features)

        assert 'batch_size' in analysis

        batch_factor = analysis['batch_size']
        if batch_factor.get('significant'):
            assert 'correlation' in batch_factor
            assert 'direction' in batch_factor
            assert batch_factor['direction'] in ['positive', 'negative']

    @pytest.mark.asyncio
    async def test_time_of_day_effect(self, yield_predictor, variable_yield_history):
        """The analysis always includes a time-of-day entry."""
        features = yield_predictor._engineer_features(variable_yield_history)
        analysis = yield_predictor._analyze_yield_factors(features)

        assert 'time_of_day' in analysis
class TestYieldPrediction:
    """End-to-end checks on ``predict_yield`` output."""

    @pytest.mark.asyncio
    async def test_predict_stable_yield(self, yield_predictor, stable_yield_history, production_context_optimal):
        """A stable history yields a prediction in (90, 100) inside its range."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_123',
            production_history=stable_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        assert result['status'] != 'insufficient_data'
        assert result['predicted_yield'] is not None
        assert 90 < result['predicted_yield'] < 100
        assert result['confidence'] > 0

        # The point prediction must sit strictly inside the reported range.
        assert 'prediction_range' in result
        assert result['prediction_range']['lower'] < result['predicted_yield']
        assert result['prediction_range']['upper'] > result['predicted_yield']

    @pytest.mark.asyncio
    async def test_predict_variable_yield_optimal_context(
        self, yield_predictor, variable_yield_history, production_context_optimal
    ):
        """With an optimal context the prediction is not far below baseline."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_456',
            production_history=variable_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        assert result['predicted_yield'] is not None
        # Optimal context should predict higher yield; a 5-point tolerance
        # below the baseline is allowed.
        assert result['predicted_yield'] > result['baseline_yield'] - 5

    @pytest.mark.asyncio
    async def test_predict_variable_yield_suboptimal_context(
        self, yield_predictor, variable_yield_history, production_context_suboptimal
    ):
        """With a suboptimal context a prediction is still produced."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_456',
            production_history=variable_yield_history,
            production_context=production_context_suboptimal,
            min_history_runs=30
        )

        assert result['predicted_yield'] is not None
        # Suboptimal context (novice worker, early morning, large batch)
        # should predict lower yield
        # TODO(review): nothing here asserts the "lower yield" expectation;
        # only the presence of a prediction is checked.

    @pytest.mark.asyncio
    async def test_expected_waste_calculation(
        self, yield_predictor, low_yield_history, production_context_optimal
    ):
        """A low-yield history produces positive expected waste above 5%."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_789',
            production_history=low_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        assert 'expected_waste' in result
        assert result['expected_waste'] > 0

        # For low yield (82%), waste should be significant
        waste_pct = 100 - result['predicted_yield']
        assert waste_pct > 5
class TestPatternDetection:
    """Checks on the patterns returned by ``_identify_yield_patterns``."""

    @pytest.mark.asyncio
    async def test_low_yield_worker_pattern(self, yield_predictor, variable_yield_history):
        """A significant worker factor must come with a low-yield-worker pattern."""
        features = yield_predictor._engineer_features(variable_yield_history)
        analysis = yield_predictor._analyze_yield_factors(features)
        patterns = yield_predictor._identify_yield_patterns(features, analysis)

        # Should detect novice worker pattern
        worker_patterns = [
            entry for entry in patterns if entry['pattern'] == 'low_yield_worker'
        ]

        worker_is_significant = analysis.get('worker', {}).get('significant')
        if worker_is_significant:
            assert len(worker_patterns) > 0
            first = worker_patterns[0]
            assert first['severity'] in ['high', 'medium', 'low']
            assert 'recommendation' in first

    @pytest.mark.asyncio
    async def test_time_of_day_pattern(self, yield_predictor, variable_yield_history):
        """Pattern identification runs and every pattern exposes a 'pattern' key."""
        features = yield_predictor._engineer_features(variable_yield_history)
        analysis = yield_predictor._analyze_yield_factors(features)
        patterns = yield_predictor._identify_yield_patterns(features, analysis)

        # May detect early morning low yield pattern
        # Patterns are conditional on statistical significance, so the
        # filtered list is built (touching each entry's 'pattern' key) but
        # nothing is asserted about its contents.
        time_patterns = [
            entry for entry in patterns if entry['pattern'] == 'low_yield_time'
        ]
class TestInsightGeneration:
    """Checks on the ``insights`` list attached to prediction results."""

    @pytest.mark.asyncio
    async def test_low_yield_warning_insight(
        self, yield_predictor, low_yield_history, production_context_optimal
    ):
        """A low-yield history produces an actionable production warning."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_789',
            production_history=low_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        # Should generate low yield warning
        warnings = [
            insight for insight in result['insights'] if insight['type'] == 'warning'
        ]
        assert len(warnings) > 0

        first_warning = warnings[0]
        assert first_warning['priority'] in ['high', 'medium']
        assert first_warning['category'] == 'production'
        assert 'impact_value' in first_warning
        assert first_warning['actionable'] is True

    @pytest.mark.asyncio
    async def test_excellent_yield_insight(
        self, yield_predictor, stable_yield_history, production_context_optimal
    ):
        """A predicted yield above 98 must come with a positive insight."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_123',
            production_history=stable_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        # May generate positive insight for excellent yield
        positives = [
            insight for insight in result['insights'] if insight['type'] == 'positive'
        ]
        if result['predicted_yield'] > 98:
            assert len(positives) > 0

    @pytest.mark.asyncio
    async def test_yield_variability_insight(
        self, yield_predictor, variable_yield_history, production_context_optimal
    ):
        """High relative spread must be mentioned in at least one insight."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_456',
            production_history=variable_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        # Should detect high variability
        if result['baseline_std'] / result['baseline_yield'] > 0.05:
            matching = []
            for insight in result['insights']:
                in_title = 'variability' in insight['title'].lower()
                if in_title or 'variability' in insight['description'].lower():
                    matching.append(insight)
            assert len(matching) > 0
class TestConfidenceScoring:
    """Test confidence score calculation."""

    @pytest.mark.asyncio
    async def test_high_confidence_large_sample(
        self, yield_predictor, stable_yield_history, production_context_optimal
    ):
        """Test high confidence with large stable sample."""
        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_123',
            production_history=stable_yield_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        # Large sample + stable data should give high confidence
        assert result['confidence'] > 60

    @pytest.mark.asyncio
    async def test_lower_confidence_small_sample(self, yield_predictor, production_context_optimal):
        """Test lower confidence with small sample."""
        # Seed the global NumPy RNG, as the history fixtures do, so the
        # sampled yields (and therefore this test's outcome) are reproducible.
        np.random.seed(42)

        # Create small history (exactly 30 runs, the minimum accepted below)
        small_history = pd.DataFrame([{
            'production_run_id': f'run_{i}',
            'recipe_id': 'recipe_123',
            'planned_quantity': 100,
            'actual_quantity': 95 + np.random.normal(0, 2),
            'yield_percentage': 95 + np.random.normal(0, 2),
            'worker_id': 'worker_1',
            'started_at': datetime.utcnow() - timedelta(days=90-i),
            # Negative hours inside the subtracted delta place completion
            # four hours after the start.
            'completed_at': datetime.utcnow() - timedelta(days=90-i, hours=-4),
            'batch_size': 100
        } for i in range(30)])

        result = await yield_predictor.predict_yield(
            tenant_id='tenant_123',
            recipe_id='recipe_123',
            production_history=small_history,
            production_context=production_context_optimal,
            min_history_runs=30
        )

        # Small sample should give moderate confidence
        assert result['confidence'] < 85
class TestHistoricalAnalysis:
    """Checks on ``analyze_recipe_yield_history`` (analysis without prediction)."""

    @pytest.mark.asyncio
    async def test_analyze_recipe_history(self, yield_predictor, variable_yield_history):
        """A sufficient history returns the recipe id and all analysis sections."""
        result = await yield_predictor.analyze_recipe_yield_history(
            tenant_id='tenant_123',
            recipe_id='recipe_456',
            production_history=variable_yield_history,
            min_history_runs=30
        )

        assert result['recipe_id'] == 'recipe_456'
        for section in ('baseline_stats', 'factor_analysis', 'patterns', 'insights'):
            assert section in result

    @pytest.mark.asyncio
    async def test_analyze_insufficient_history(self, yield_predictor):
        """A one-run history is reported as insufficient data."""
        single_run = {
            'production_run_id': 'run_1',
            'recipe_id': 'recipe_123',
            'planned_quantity': 100,
            'actual_quantity': 95,
            'yield_percentage': 95,
            'worker_id': 'worker_1',
            'started_at': datetime.utcnow() - timedelta(days=1),
            'completed_at': datetime.utcnow() - timedelta(hours=20),
            'batch_size': 100
        }
        small_history = pd.DataFrame([single_run])

        result = await yield_predictor.analyze_recipe_yield_history(
            tenant_id='tenant_123',
            recipe_id='recipe_123',
            production_history=small_history,
            min_history_runs=30
        )

        assert result['status'] == 'insufficient_data'
class TestModelPerformance:
    """Checks on the output of ``_train_yield_model``."""

    @pytest.mark.asyncio
    async def test_model_training(self, yield_predictor, variable_yield_history):
        """Training returns the model, its name, metrics and importances."""
        features = yield_predictor._engineer_features(variable_yield_history)
        trained = yield_predictor._train_yield_model(features)

        for key in ('best_model', 'best_model_name', 'performance', 'feature_importance'):
            assert key in trained

        metrics = trained['performance']
        for metric_name in ('mae', 'rmse', 'r2'):
            assert metric_name in metrics

        # MAE should be reasonable (< 15 percentage points)
        assert metrics['mae'] < 15

    @pytest.mark.asyncio
    async def test_feature_importance(self, yield_predictor, variable_yield_history):
        """Feature importances can be read from the training result."""
        features = yield_predictor._engineer_features(variable_yield_history)
        trained = yield_predictor._train_yield_model(features)

        importances = trained['feature_importance']

        # Should have feature importances
        if len(importances) > 0:
            # Worker encoding should be important (due to skill differences)
            # NOTE(review): inside this guard the right-hand side of the `or`
            # is always true, so this assertion cannot fail as written.
            assert 'worker_encoded' in importances or len(importances) > 0