""" Tests for Feedback Loop & Learning System """ import pytest import pandas as pd import numpy as np from datetime import datetime, timedelta from services.ai_insights.app.ml.feedback_learning_system import FeedbackLearningSystem @pytest.fixture def learning_system(): """Create FeedbackLearningSystem instance.""" return FeedbackLearningSystem( performance_threshold=0.85, degradation_threshold=0.10, min_feedback_samples=30 ) @pytest.fixture def good_feedback_data(): """Generate feedback data for well-performing model.""" np.random.seed(42) dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D') feedback = [] for i, date in enumerate(dates): predicted = 100 + np.random.normal(0, 10) actual = predicted + np.random.normal(0, 5) # Small error error = predicted - actual error_pct = abs(error / actual * 100) if actual != 0 else 0 accuracy = max(0, 100 - error_pct) feedback.append({ 'insight_id': f'insight_{i}', 'applied_at': date - timedelta(days=1), 'outcome_date': date, 'predicted_value': predicted, 'actual_value': actual, 'error': error, 'error_pct': error_pct, 'accuracy': accuracy, 'confidence': 85 }) return pd.DataFrame(feedback) @pytest.fixture def degraded_feedback_data(): """Generate feedback data for degrading model.""" np.random.seed(42) dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D') feedback = [] for i, date in enumerate(dates): # Introduce increasing error over time error_multiplier = 1 + (i / 50) * 2 # Errors double by end predicted = 100 + np.random.normal(0, 10) actual = predicted + np.random.normal(0, 10 * error_multiplier) error = predicted - actual error_pct = abs(error / actual * 100) if actual != 0 else 0 accuracy = max(0, 100 - error_pct) feedback.append({ 'insight_id': f'insight_{i}', 'applied_at': date - timedelta(days=1), 'outcome_date': date, 'predicted_value': predicted, 'actual_value': actual, 'error': error, 'error_pct': error_pct, 'accuracy': accuracy, 'confidence': 85 }) return pd.DataFrame(feedback) @pytest.fixture def biased_feedback_data(): """Generate feedback data with systematic bias.""" np.random.seed(42) dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D') feedback = [] for i, date in enumerate(dates): predicted = 100 + np.random.normal(0, 10) # Systematic over-prediction by 15% actual = predicted * 0.85 + np.random.normal(0, 3) error = predicted - actual error_pct = abs(error / actual * 100) if actual != 0 else 0 accuracy = max(0, 100 - error_pct) feedback.append({ 'insight_id': f'insight_{i}', 'applied_at': date - timedelta(days=1), 'outcome_date': date, 'predicted_value': predicted, 'actual_value': actual, 'error': error, 'error_pct': error_pct, 'accuracy': accuracy, 'confidence': 80 }) return pd.DataFrame(feedback) @pytest.fixture def poorly_calibrated_feedback_data(): """Generate feedback with poor confidence calibration.""" np.random.seed(42) dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D') feedback = [] for i, date in enumerate(dates): predicted = 100 + np.random.normal(0, 10) # High confidence but low accuracy if i < 25: confidence = 90 actual = predicted + np.random.normal(0, 20) # Large error else: confidence = 60 actual = predicted + np.random.normal(0, 5) # Small error error = predicted - actual error_pct = abs(error / actual * 100) if actual != 0 else 0 accuracy = max(0, 100 - error_pct) feedback.append({ 'insight_id': f'insight_{i}', 'applied_at': date - timedelta(days=1), 'outcome_date': 
class TestPerformanceMetrics:
    """Test performance metric calculation."""

    @pytest.mark.asyncio
    async def test_calculate_metrics_good_performance(self, learning_system, good_feedback_data):
        """Test metric calculation for good performance."""
        metrics = learning_system._calculate_performance_metrics(good_feedback_data)

        assert 'accuracy' in metrics
        assert 'mae' in metrics
        assert 'rmse' in metrics
        assert 'mape' in metrics
        assert 'bias' in metrics
        assert 'r_squared' in metrics

        # Good model should have high accuracy
        assert metrics['accuracy'] > 80
        assert metrics['mae'] < 10
        assert abs(metrics['bias']) < 5

    @pytest.mark.asyncio
    async def test_calculate_metrics_degraded_performance(self, learning_system, degraded_feedback_data):
        """Test metric calculation for degraded performance."""
        metrics = learning_system._calculate_performance_metrics(degraded_feedback_data)

        # Degraded model should have lower accuracy
        assert metrics['accuracy'] < 80
        assert metrics['mae'] > 5
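

# For reference, the assertions above presuppose the standard error-metric
# definitions over the feedback frame's `error`, `error_pct`, `accuracy`,
# `predicted_value`, and `actual_value` columns. The sketch below is an
# illustrative re-derivation only (an assumption about the implementation,
# and `_reference_metrics` is a hypothetical name); the authoritative
# computation lives in FeedbackLearningSystem._calculate_performance_metrics.
def _reference_metrics(feedback: pd.DataFrame) -> dict:
    errors = feedback['error']
    residual_ss = ((feedback['actual_value'] - feedback['predicted_value']) ** 2).sum()
    total_ss = ((feedback['actual_value'] - feedback['actual_value'].mean()) ** 2).sum()
    return {
        'accuracy': feedback['accuracy'].mean(),        # mean per-sample accuracy (%)
        'mae': errors.abs().mean(),                     # mean absolute error
        'rmse': float(np.sqrt((errors ** 2).mean())),   # root mean squared error
        'mape': feedback['error_pct'].mean(),           # mean absolute percentage error
        'bias': errors.mean(),                          # signed mean error (over/under-prediction)
        'r_squared': 1 - residual_ss / total_ss if total_ss else 0.0,
    }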
class TestPerformanceTrend:
    """Test performance trend analysis."""

    @pytest.mark.asyncio
    async def test_stable_trend(self, learning_system, good_feedback_data):
        """Test detection of a stable performance trend."""
        trend = learning_system._analyze_performance_trend(good_feedback_data)

        assert trend['trend'] in ['stable', 'improving']

    @pytest.mark.asyncio
    async def test_degrading_trend(self, learning_system, degraded_feedback_data):
        """Test detection of a degrading performance trend."""
        trend = learning_system._analyze_performance_trend(degraded_feedback_data)

        # May detect a degrading trend depending on the data
        assert trend['trend'] in ['degrading', 'stable']
        if trend['significant']:
            assert 'slope' in trend

    @pytest.mark.asyncio
    async def test_insufficient_data_trend(self, learning_system):
        """Test trend analysis with insufficient data."""
        small_data = pd.DataFrame([{
            'insight_id': 'test',
            'outcome_date': datetime.utcnow(),
            'accuracy': 90
        }])

        trend = learning_system._analyze_performance_trend(small_data)

        assert trend['trend'] == 'insufficient_data'


class TestDegradationDetection:
    """Test performance degradation detection."""

    @pytest.mark.asyncio
    async def test_no_degradation_detected(self, learning_system, good_feedback_data):
        """Test no degradation for good performance."""
        current_metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        trend = learning_system._analyze_performance_trend(good_feedback_data)

        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance={'accuracy': 85},
            trend_analysis=trend
        )

        assert degradation['detected'] is False
        assert degradation['severity'] == 'none'

    @pytest.mark.asyncio
    async def test_degradation_below_threshold(self, learning_system):
        """Test degradation detection when below the absolute threshold."""
        current_metrics = {'accuracy': 70}  # Below 85% threshold
        trend = {'trend': 'stable', 'significant': False}

        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance=None,
            trend_analysis=trend
        )

        assert degradation['detected'] is True
        assert degradation['severity'] == 'high'
        assert len(degradation['reasons']) > 0

    @pytest.mark.asyncio
    async def test_degradation_vs_baseline(self, learning_system):
        """Test degradation detection vs baseline."""
        current_metrics = {'accuracy': 80}
        baseline = {'accuracy': 95}  # 15.8% drop relative to baseline
        trend = {'trend': 'stable', 'significant': False}

        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance=baseline,
            trend_analysis=trend
        )

        assert degradation['detected'] is True
        assert 'dropped' in degradation['reasons'][0].lower()

    @pytest.mark.asyncio
    async def test_degradation_trending_down(self, learning_system, degraded_feedback_data):
        """Test degradation detection from a downward trend."""
        current_metrics = learning_system._calculate_performance_metrics(degraded_feedback_data)
        trend = learning_system._analyze_performance_trend(degraded_feedback_data)

        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance={'accuracy': 90},
            trend_analysis=trend
        )

        # Should detect some form of degradation
        assert degradation['detected'] is True


class TestRetrainingRecommendation:
    """Test retraining recommendation generation."""

    @pytest.mark.asyncio
    async def test_urgent_retraining_recommendation(self, learning_system):
        """Test urgent retraining recommendation."""
        current_metrics = {'accuracy': 70}
        degradation = {
            'detected': True,
            'severity': 'high',
            'reasons': ['Accuracy below threshold'],
            'current_accuracy': 70,
            'baseline_accuracy': 90
        }
        trend = {'trend': 'degrading', 'significant': True}

        recommendation = learning_system._generate_retraining_recommendation(
            'test_model', current_metrics, degradation, trend
        )

        assert recommendation['recommended'] is True
        assert recommendation['priority'] == 'urgent'
        assert 'immediately' in recommendation['recommendation'].lower()

    @pytest.mark.asyncio
    async def test_no_retraining_needed(self, learning_system, good_feedback_data):
        """Test no retraining recommendation for good performance."""
        current_metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        degradation = {'detected': False, 'severity': 'none'}
        trend = learning_system._analyze_performance_trend(good_feedback_data)

        recommendation = learning_system._generate_retraining_recommendation(
            'test_model', current_metrics, degradation, trend
        )

        assert recommendation['recommended'] is False
        assert recommendation['priority'] == 'none'


class TestErrorPatternDetection:
    """Test error pattern identification."""

    @pytest.mark.asyncio
    async def test_systematic_bias_detection(self, learning_system, biased_feedback_data):
        """Test detection of systematic bias."""
        patterns = learning_system._identify_error_patterns(biased_feedback_data)

        # Should detect over-prediction bias
        bias_patterns = [p for p in patterns if p['pattern'] == 'systematic_bias']
        assert len(bias_patterns) > 0

        bias = bias_patterns[0]
        assert 'over-prediction' in bias['description']
        assert bias['severity'] in ['high', 'medium']

    @pytest.mark.asyncio
    async def test_no_patterns_for_good_data(self, learning_system, good_feedback_data):
        """Test that no significant patterns are found for good data."""
        patterns = learning_system._identify_error_patterns(good_feedback_data)

        # May have some minor patterns, but none of high severity
        high_severity = [p for p in patterns if p.get('severity') == 'high']
        assert len(high_severity) == 0
class TestConfidenceCalibration:
    """Test confidence calibration analysis."""

    @pytest.mark.asyncio
    async def test_well_calibrated_confidence(self, learning_system, good_feedback_data):
        """Test well-calibrated confidence scores."""
        calibration = learning_system._calculate_confidence_calibration(good_feedback_data)

        # Good data with consistent confidence should be well calibrated
        if 'overall_calibration_error' in calibration:
            # Small calibration error indicates good calibration
            assert calibration['overall_calibration_error'] < 20

    @pytest.mark.asyncio
    async def test_poorly_calibrated_confidence(self, learning_system, poorly_calibrated_feedback_data):
        """Test poorly calibrated confidence scores."""
        calibration = learning_system._calculate_confidence_calibration(poorly_calibrated_feedback_data)

        # Should detect poor calibration
        assert calibration['calibrated'] is False
        if 'by_confidence_range' in calibration:
            assert len(calibration['by_confidence_range']) > 0

    @pytest.mark.asyncio
    async def test_no_confidence_data(self, learning_system):
        """Test calibration when no confidence scores are available."""
        no_conf_data = pd.DataFrame([{
            'predicted_value': 100,
            'actual_value': 95,
            'accuracy': 95
        }])

        calibration = learning_system._calculate_confidence_calibration(no_conf_data)

        assert calibration['calibrated'] is False
        assert 'reason' in calibration


class TestCompletePerformanceAnalysis:
    """Test the complete performance analysis workflow."""

    @pytest.mark.asyncio
    async def test_analyze_good_performance(self, learning_system, good_feedback_data):
        """Test complete analysis of good performance."""
        result = await learning_system.analyze_model_performance(
            model_name='test_model',
            feedback_data=good_feedback_data,
            baseline_performance={'accuracy': 85}
        )

        assert result['model_name'] == 'test_model'
        assert result['status'] != 'insufficient_feedback'
        assert 'current_performance' in result
        assert 'trend_analysis' in result
        assert 'degradation_detected' in result
        assert 'retraining_recommendation' in result

        # Good performance should not recommend retraining
        assert result['retraining_recommendation']['recommended'] is False

    @pytest.mark.asyncio
    async def test_analyze_degraded_performance(self, learning_system, degraded_feedback_data):
        """Test complete analysis of degraded performance."""
        result = await learning_system.analyze_model_performance(
            model_name='degraded_model',
            feedback_data=degraded_feedback_data,
            baseline_performance={'accuracy': 90}
        )

        assert result['degradation_detected']['detected'] is True
        assert result['retraining_recommendation']['recommended'] is True

    @pytest.mark.asyncio
    async def test_insufficient_feedback(self, learning_system):
        """Test analysis with insufficient feedback samples."""
        small_data = pd.DataFrame([{
            'insight_id': 'test',
            'outcome_date': datetime.utcnow(),
            'predicted_value': 100,
            'actual_value': 95,
            'error': 5,
            'error_pct': 5,
            'accuracy': 95,
            'confidence': 85
        }])

        result = await learning_system.analyze_model_performance(
            model_name='test_model',
            feedback_data=small_data
        )

        assert result['status'] == 'insufficient_feedback'
        assert result['feedback_samples'] == 1
        assert result['required_samples'] == 30
class TestLearningInsights:
    """Test learning insight generation."""

    @pytest.mark.asyncio
    async def test_generate_urgent_retraining_insight(self, learning_system):
        """Test generation of urgent retraining insight."""
        analyses = [{
            'model_name': 'urgent_model',
            'retraining_recommendation': {
                'priority': 'urgent',
                'recommended': True
            },
            'degradation_detected': {
                'detected': True
            }
        }]

        insights = await learning_system.generate_learning_insights(
            analyses, tenant_id='tenant_123'
        )

        # Should generate urgent warning
        urgent_insights = [i for i in insights if i['priority'] == 'urgent']
        assert len(urgent_insights) > 0

        insight = urgent_insights[0]
        assert insight['type'] == 'warning'
        assert 'urgent_model' in insight['description'].lower()

    @pytest.mark.asyncio
    async def test_generate_system_health_insight(self, learning_system):
        """Test generation of system health insight."""
        # 3 models, 1 degraded
        analyses = [
            {
                'model_name': 'model_1',
                'degradation_detected': {'detected': False},
                'retraining_recommendation': {'priority': 'none'}
            },
            {
                'model_name': 'model_2',
                'degradation_detected': {'detected': False},
                'retraining_recommendation': {'priority': 'none'}
            },
            {
                'model_name': 'model_3',
                'degradation_detected': {'detected': True},
                'retraining_recommendation': {'priority': 'high'}
            }
        ]

        insights = await learning_system.generate_learning_insights(
            analyses, tenant_id='tenant_123'
        )

        # Should generate system health insight (~67% healthy < 80%).
        # Note: may or may not trigger depending on the threshold;
        # at minimum the call should not crash.
        assert isinstance(insights, list)

    @pytest.mark.asyncio
    async def test_generate_calibration_insight(self, learning_system):
        """Test generation of calibration insight."""
        analyses = [{
            'model_name': 'model_1',
            'degradation_detected': {'detected': False},
            'retraining_recommendation': {'priority': 'none'},
            'confidence_calibration': {
                'calibrated': False,
                'overall_calibration_error': 15
            }
        }]

        insights = await learning_system.generate_learning_insights(
            analyses, tenant_id='tenant_123'
        )

        # Should generate calibration insight
        calibration_insights = [
            i for i in insights if 'calibration' in i['title'].lower()
        ]
        assert len(calibration_insights) > 0


class TestROICalculation:
    """Test ROI calculation."""

    @pytest.mark.asyncio
    async def test_calculate_roi_with_impact_values(self, learning_system):
        """Test ROI calculation with impact values."""
        feedback_data = pd.DataFrame([
            {'accuracy': 90, 'impact_value': 1000},
            {'accuracy': 85, 'impact_value': 1500},
            {'accuracy': 95, 'impact_value': 800}
        ])

        roi = await learning_system.calculate_roi(
            feedback_data, insight_type='demand_forecast'
        )

        assert roi['insight_type'] == 'demand_forecast'
        assert roi['samples'] == 3
        assert roi['avg_accuracy'] == 90.0
        assert roi['total_impact_value'] == 3300
        assert roi['roi_validated'] is True

    @pytest.mark.asyncio
    async def test_calculate_roi_without_impact_values(self, learning_system, good_feedback_data):
        """Test ROI calculation without impact values."""
        roi = await learning_system.calculate_roi(
            good_feedback_data, insight_type='yield_prediction'
        )

        assert roi['insight_type'] == 'yield_prediction'
        assert roi['samples'] > 0
        assert 'avg_accuracy' in roi
        assert roi['roi_validated'] is False
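

# The async test cases above use the `pytest.mark.asyncio` marker, which is
# provided by the pytest-asyncio plugin. A minimal configuration sketch
# (assuming pytest-asyncio is the plugin this project uses; adjust to the
# project's actual pytest setup):
#
#   # pyproject.toml
#   [tool.pytest.ini_options]
#   asyncio_mode = "strict"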