# File: bakery-ia/services/ai_insights/tests/test_feedback_learning_system.py
"""
Tests for Feedback Loop & Learning System
"""
import pytest
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from services.ai_insights.app.ml.feedback_learning_system import FeedbackLearningSystem
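
# NOTE: the async tests below use @pytest.mark.asyncio, which assumes the
# pytest-asyncio plugin is installed and enabled for this test suite.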


@pytest.fixture
def learning_system():
    """Create FeedbackLearningSystem instance."""
    return FeedbackLearningSystem(
        performance_threshold=0.85,
        degradation_threshold=0.10,
        min_feedback_samples=30
    )


@pytest.fixture
def good_feedback_data():
    """Generate feedback data for a well-performing model."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        predicted = 100 + np.random.normal(0, 10)
        actual = predicted + np.random.normal(0, 5)  # Small error
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': 85
        })
    return pd.DataFrame(feedback)


@pytest.fixture
def degraded_feedback_data():
    """Generate feedback data for a degrading model."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        # Introduce increasing error over time: the noise scale roughly triples by the end
        error_multiplier = 1 + (i / 50) * 2
        predicted = 100 + np.random.normal(0, 10)
        actual = predicted + np.random.normal(0, 10 * error_multiplier)
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': 85
        })
    return pd.DataFrame(feedback)


@pytest.fixture
def biased_feedback_data():
    """Generate feedback data with systematic bias."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        predicted = 100 + np.random.normal(0, 10)
        # Systematic over-prediction by 15%
        actual = predicted * 0.85 + np.random.normal(0, 3)
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': 80
        })
    return pd.DataFrame(feedback)


@pytest.fixture
def poorly_calibrated_feedback_data():
    """Generate feedback with poor confidence calibration."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        predicted = 100 + np.random.normal(0, 10)
        # High confidence but low accuracy
        if i < 25:
            confidence = 90
            actual = predicted + np.random.normal(0, 20)  # Large error
        else:
            confidence = 60
            actual = predicted + np.random.normal(0, 5)  # Small error
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': confidence
        })
    return pd.DataFrame(feedback)


class TestPerformanceMetrics:
    """Test performance metric calculation."""

    @pytest.mark.asyncio
    async def test_calculate_metrics_good_performance(self, learning_system, good_feedback_data):
        """Test metric calculation for good performance."""
        metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        assert 'accuracy' in metrics
        assert 'mae' in metrics
        assert 'rmse' in metrics
        assert 'mape' in metrics
        assert 'bias' in metrics
        assert 'r_squared' in metrics
        # Good model should have high accuracy
        assert metrics['accuracy'] > 80
        assert metrics['mae'] < 10
        assert abs(metrics['bias']) < 5

    @pytest.mark.asyncio
    async def test_calculate_metrics_degraded_performance(self, learning_system, degraded_feedback_data):
        """Test metric calculation for degraded performance."""
        metrics = learning_system._calculate_performance_metrics(degraded_feedback_data)
        # Degraded model should have lower accuracy
        assert metrics['accuracy'] < 80
        assert metrics['mae'] > 5


class TestPerformanceTrend:
    """Test performance trend analysis."""

    @pytest.mark.asyncio
    async def test_stable_trend(self, learning_system, good_feedback_data):
        """Test detection of stable performance trend."""
        trend = learning_system._analyze_performance_trend(good_feedback_data)
        assert trend['trend'] in ['stable', 'improving']

    @pytest.mark.asyncio
    async def test_degrading_trend(self, learning_system, degraded_feedback_data):
        """Test detection of degrading performance trend."""
        trend = learning_system._analyze_performance_trend(degraded_feedback_data)
        # May detect degrading trend depending on data
        assert trend['trend'] in ['degrading', 'stable']
        if trend['significant']:
            assert 'slope' in trend

    @pytest.mark.asyncio
    async def test_insufficient_data_trend(self, learning_system):
        """Test trend analysis with insufficient data."""
        small_data = pd.DataFrame([{
            'insight_id': 'test',
            'outcome_date': datetime.utcnow(),
            'accuracy': 90
        }])
        trend = learning_system._analyze_performance_trend(small_data)
        assert trend['trend'] == 'insufficient_data'


class TestDegradationDetection:
    """Test performance degradation detection."""

    @pytest.mark.asyncio
    async def test_no_degradation_detected(self, learning_system, good_feedback_data):
        """Test no degradation for good performance."""
        current_metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        trend = learning_system._analyze_performance_trend(good_feedback_data)
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance={'accuracy': 85},
            trend_analysis=trend
        )
        assert degradation['detected'] is False
        assert degradation['severity'] == 'none'

    @pytest.mark.asyncio
    async def test_degradation_below_threshold(self, learning_system):
        """Test degradation detection when below absolute threshold."""
        current_metrics = {'accuracy': 70}  # Below 85% threshold
        trend = {'trend': 'stable', 'significant': False}
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance=None,
            trend_analysis=trend
        )
        assert degradation['detected'] is True
        assert degradation['severity'] == 'high'
        assert len(degradation['reasons']) > 0

    @pytest.mark.asyncio
    async def test_degradation_vs_baseline(self, learning_system):
        """Test degradation detection vs baseline."""
        current_metrics = {'accuracy': 80}
        baseline = {'accuracy': 95}  # 15.8% drop
        trend = {'trend': 'stable', 'significant': False}
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance=baseline,
            trend_analysis=trend
        )
        assert degradation['detected'] is True
        assert 'dropped' in degradation['reasons'][0].lower()

    @pytest.mark.asyncio
    async def test_degradation_trending_down(self, learning_system, degraded_feedback_data):
        """Test degradation detection from trending down."""
        current_metrics = learning_system._calculate_performance_metrics(degraded_feedback_data)
        trend = learning_system._analyze_performance_trend(degraded_feedback_data)
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance={'accuracy': 90},
            trend_analysis=trend
        )
        # Should detect some form of degradation
        assert degradation['detected'] is True


class TestRetrainingRecommendation:
    """Test retraining recommendation generation."""

    @pytest.mark.asyncio
    async def test_urgent_retraining_recommendation(self, learning_system):
        """Test urgent retraining recommendation."""
        current_metrics = {'accuracy': 70}
        degradation = {
            'detected': True,
            'severity': 'high',
            'reasons': ['Accuracy below threshold'],
            'current_accuracy': 70,
            'baseline_accuracy': 90
        }
        trend = {'trend': 'degrading', 'significant': True}
        recommendation = learning_system._generate_retraining_recommendation(
            'test_model',
            current_metrics,
            degradation,
            trend
        )
        assert recommendation['recommended'] is True
        assert recommendation['priority'] == 'urgent'
        assert 'immediately' in recommendation['recommendation'].lower()

    @pytest.mark.asyncio
    async def test_no_retraining_needed(self, learning_system, good_feedback_data):
        """Test no retraining recommendation for good performance."""
        current_metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        degradation = {'detected': False, 'severity': 'none'}
        trend = learning_system._analyze_performance_trend(good_feedback_data)
        recommendation = learning_system._generate_retraining_recommendation(
            'test_model',
            current_metrics,
            degradation,
            trend
        )
        assert recommendation['recommended'] is False
        assert recommendation['priority'] == 'none'


class TestErrorPatternDetection:
    """Test error pattern identification."""

    @pytest.mark.asyncio
    async def test_systematic_bias_detection(self, learning_system, biased_feedback_data):
        """Test detection of systematic bias."""
        patterns = learning_system._identify_error_patterns(biased_feedback_data)
        # Should detect over-prediction bias
        bias_patterns = [p for p in patterns if p['pattern'] == 'systematic_bias']
        assert len(bias_patterns) > 0
        bias = bias_patterns[0]
        assert 'over-prediction' in bias['description']
        assert bias['severity'] in ['high', 'medium']

    @pytest.mark.asyncio
    async def test_no_patterns_for_good_data(self, learning_system, good_feedback_data):
        """Test no significant patterns for good data."""
        patterns = learning_system._identify_error_patterns(good_feedback_data)
        # May have some minor patterns, but no high severity
        high_severity = [p for p in patterns if p.get('severity') == 'high']
        assert len(high_severity) == 0


class TestConfidenceCalibration:
    """Test confidence calibration analysis."""

    @pytest.mark.asyncio
    async def test_well_calibrated_confidence(self, learning_system, good_feedback_data):
        """Test well-calibrated confidence scores."""
        calibration = learning_system._calculate_confidence_calibration(good_feedback_data)
        # Good data with consistent confidence should be well calibrated
        if 'overall_calibration_error' in calibration:
            # Small calibration error indicates good calibration
            assert calibration['overall_calibration_error'] < 20

    @pytest.mark.asyncio
    async def test_poorly_calibrated_confidence(self, learning_system, poorly_calibrated_feedback_data):
        """Test poorly calibrated confidence scores."""
        calibration = learning_system._calculate_confidence_calibration(poorly_calibrated_feedback_data)
        # Should detect poor calibration
        assert calibration['calibrated'] is False
        if 'by_confidence_range' in calibration:
            assert len(calibration['by_confidence_range']) > 0

    @pytest.mark.asyncio
    async def test_no_confidence_data(self, learning_system):
        """Test calibration when no confidence scores available."""
        no_conf_data = pd.DataFrame([{
            'predicted_value': 100,
            'actual_value': 95,
            'accuracy': 95
        }])
        calibration = learning_system._calculate_confidence_calibration(no_conf_data)
        assert calibration['calibrated'] is False
        assert 'reason' in calibration


class TestCompletePerformanceAnalysis:
    """Test complete performance analysis workflow."""

    @pytest.mark.asyncio
    async def test_analyze_good_performance(self, learning_system, good_feedback_data):
        """Test complete analysis of good performance."""
        result = await learning_system.analyze_model_performance(
            model_name='test_model',
            feedback_data=good_feedback_data,
            baseline_performance={'accuracy': 85}
        )
        assert result['model_name'] == 'test_model'
        assert result['status'] != 'insufficient_feedback'
        assert 'current_performance' in result
        assert 'trend_analysis' in result
        assert 'degradation_detected' in result
        assert 'retraining_recommendation' in result
        # Good performance should not recommend retraining
        assert result['retraining_recommendation']['recommended'] is False

    @pytest.mark.asyncio
    async def test_analyze_degraded_performance(self, learning_system, degraded_feedback_data):
        """Test complete analysis of degraded performance."""
        result = await learning_system.analyze_model_performance(
            model_name='degraded_model',
            feedback_data=degraded_feedback_data,
            baseline_performance={'accuracy': 90}
        )
        assert result['degradation_detected']['detected'] is True
        assert result['retraining_recommendation']['recommended'] is True

    @pytest.mark.asyncio
    async def test_insufficient_feedback(self, learning_system):
        """Test analysis with insufficient feedback samples."""
        small_data = pd.DataFrame([{
            'insight_id': 'test',
            'outcome_date': datetime.utcnow(),
            'predicted_value': 100,
            'actual_value': 95,
            'error': 5,
            'error_pct': 5,
            'accuracy': 95,
            'confidence': 85
        }])
        result = await learning_system.analyze_model_performance(
            model_name='test_model',
            feedback_data=small_data
        )
        assert result['status'] == 'insufficient_feedback'
        assert result['feedback_samples'] == 1
        assert result['required_samples'] == 30


class TestLearningInsights:
    """Test learning insight generation."""

    @pytest.mark.asyncio
    async def test_generate_urgent_retraining_insight(self, learning_system):
        """Test generation of urgent retraining insight."""
        analyses = [{
            'model_name': 'urgent_model',
            'retraining_recommendation': {
                'priority': 'urgent',
                'recommended': True
            },
            'degradation_detected': {
                'detected': True
            }
        }]
        insights = await learning_system.generate_learning_insights(
            analyses,
            tenant_id='tenant_123'
        )
        # Should generate urgent warning
        urgent_insights = [i for i in insights if i['priority'] == 'urgent']
        assert len(urgent_insights) > 0
        insight = urgent_insights[0]
        assert insight['type'] == 'warning'
        assert 'urgent_model' in insight['description'].lower()

    @pytest.mark.asyncio
    async def test_generate_system_health_insight(self, learning_system):
        """Test generation of system health insight."""
        # 3 models, 1 degraded
        analyses = [
            {
                'model_name': 'model_1',
                'degradation_detected': {'detected': False},
                'retraining_recommendation': {'priority': 'none'}
            },
            {
                'model_name': 'model_2',
                'degradation_detected': {'detected': False},
                'retraining_recommendation': {'priority': 'none'}
            },
            {
                'model_name': 'model_3',
                'degradation_detected': {'detected': True},
                'retraining_recommendation': {'priority': 'high'}
            }
        ]
        insights = await learning_system.generate_learning_insights(
            analyses,
            tenant_id='tenant_123'
        )
        # A system health insight may be generated (~67% healthy, below 80%),
        # though whether it triggers depends on the configured threshold.
        # At minimum the call should not raise.
        assert isinstance(insights, list)

    @pytest.mark.asyncio
    async def test_generate_calibration_insight(self, learning_system):
        """Test generation of calibration insight."""
        analyses = [{
            'model_name': 'model_1',
            'degradation_detected': {'detected': False},
            'retraining_recommendation': {'priority': 'none'},
            'confidence_calibration': {
                'calibrated': False,
                'overall_calibration_error': 15
            }
        }]
        insights = await learning_system.generate_learning_insights(
            analyses,
            tenant_id='tenant_123'
        )
        # Should generate calibration insight
        calibration_insights = [
            i for i in insights
            if 'calibration' in i['title'].lower()
        ]
        assert len(calibration_insights) > 0


class TestROICalculation:
    """Test ROI calculation."""

    @pytest.mark.asyncio
    async def test_calculate_roi_with_impact_values(self, learning_system):
        """Test ROI calculation with impact values."""
        feedback_data = pd.DataFrame([
            {
                'accuracy': 90,
                'impact_value': 1000
            },
            {
                'accuracy': 85,
                'impact_value': 1500
            },
            {
                'accuracy': 95,
                'impact_value': 800
            }
        ])
        roi = await learning_system.calculate_roi(
            feedback_data,
            insight_type='demand_forecast'
        )
        assert roi['insight_type'] == 'demand_forecast'
        assert roi['samples'] == 3
        assert roi['avg_accuracy'] == 90.0
        assert roi['total_impact_value'] == 3300
        assert roi['roi_validated'] is True

    @pytest.mark.asyncio
    async def test_calculate_roi_without_impact_values(self, learning_system, good_feedback_data):
        """Test ROI calculation without impact values."""
        roi = await learning_system.calculate_roi(
            good_feedback_data,
            insight_type='yield_prediction'
        )
        assert roi['insight_type'] == 'yield_prediction'
        assert roi['samples'] > 0
        assert 'avg_accuracy' in roi
        assert roi['roi_validated'] is False
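

# A minimal way to run just this suite locally (a sketch, assuming pytest and
# pytest-asyncio are installed and the repository root is on PYTHONPATH):
#
#     pytest services/ai_insights/tests/test_feedback_learning_system.py -v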