# File: bakery-ia/services/ai_insights/tests/test_feedback_learning_system.py
"""
Tests for Feedback Loop & Learning System
"""
import pytest
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from services.ai_insights.app.ml.feedback_learning_system import FeedbackLearningSystem
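
# NOTE: the async tests below use @pytest.mark.asyncio, which assumes the
# pytest-asyncio plugin is installed and enabled for this test suite.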


@pytest.fixture
def learning_system():
    """Create FeedbackLearningSystem instance."""
    return FeedbackLearningSystem(
        performance_threshold=0.85,
        degradation_threshold=0.10,
        min_feedback_samples=30
    )


@pytest.fixture
def good_feedback_data():
    """Generate feedback data for a well-performing model."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        predicted = 100 + np.random.normal(0, 10)
        actual = predicted + np.random.normal(0, 5)  # Small error
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': 85
        })
    return pd.DataFrame(feedback)


@pytest.fixture
def degraded_feedback_data():
    """Generate feedback data for a degrading model."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        # Introduce increasing error over time: the noise scale roughly triples by the end
        error_multiplier = 1 + (i / 50) * 2
        predicted = 100 + np.random.normal(0, 10)
        actual = predicted + np.random.normal(0, 10 * error_multiplier)
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': 85
        })
    return pd.DataFrame(feedback)


@pytest.fixture
def biased_feedback_data():
    """Generate feedback data with systematic bias."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        predicted = 100 + np.random.normal(0, 10)
        # Systematic over-prediction by 15%
        actual = predicted * 0.85 + np.random.normal(0, 3)
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': 80
        })
    return pd.DataFrame(feedback)


@pytest.fixture
def poorly_calibrated_feedback_data():
    """Generate feedback with poor confidence calibration."""
    np.random.seed(42)
    dates = pd.date_range(start=datetime.utcnow() - timedelta(days=60), periods=50, freq='D')
    feedback = []
    for i, date in enumerate(dates):
        predicted = 100 + np.random.normal(0, 10)
        # High confidence but low accuracy
        if i < 25:
            confidence = 90
            actual = predicted + np.random.normal(0, 20)  # Large error
        else:
            confidence = 60
            actual = predicted + np.random.normal(0, 5)  # Small error
        error = predicted - actual
        error_pct = abs(error / actual * 100) if actual != 0 else 0
        accuracy = max(0, 100 - error_pct)
        feedback.append({
            'insight_id': f'insight_{i}',
            'applied_at': date - timedelta(days=1),
            'outcome_date': date,
            'predicted_value': predicted,
            'actual_value': actual,
            'error': error,
            'error_pct': error_pct,
            'accuracy': accuracy,
            'confidence': confidence
        })
    return pd.DataFrame(feedback)


class TestPerformanceMetrics:
    """Test performance metric calculation."""

    @pytest.mark.asyncio
    async def test_calculate_metrics_good_performance(self, learning_system, good_feedback_data):
        """Test metric calculation for good performance."""
        metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        assert 'accuracy' in metrics
        assert 'mae' in metrics
        assert 'rmse' in metrics
        assert 'mape' in metrics
        assert 'bias' in metrics
        assert 'r_squared' in metrics
        # Good model should have high accuracy
        assert metrics['accuracy'] > 80
        assert metrics['mae'] < 10
        assert abs(metrics['bias']) < 5

    @pytest.mark.asyncio
    async def test_calculate_metrics_degraded_performance(self, learning_system, degraded_feedback_data):
        """Test metric calculation for degraded performance."""
        metrics = learning_system._calculate_performance_metrics(degraded_feedback_data)
        # Degraded model should have lower accuracy
        assert metrics['accuracy'] < 80
        assert metrics['mae'] > 5


class TestPerformanceTrend:
    """Test performance trend analysis."""

    @pytest.mark.asyncio
    async def test_stable_trend(self, learning_system, good_feedback_data):
        """Test detection of stable performance trend."""
        trend = learning_system._analyze_performance_trend(good_feedback_data)
        assert trend['trend'] in ['stable', 'improving']

    @pytest.mark.asyncio
    async def test_degrading_trend(self, learning_system, degraded_feedback_data):
        """Test detection of degrading performance trend."""
        trend = learning_system._analyze_performance_trend(degraded_feedback_data)
        # May detect degrading trend depending on data
        assert trend['trend'] in ['degrading', 'stable']
        if trend['significant']:
            assert 'slope' in trend

    @pytest.mark.asyncio
    async def test_insufficient_data_trend(self, learning_system):
        """Test trend analysis with insufficient data."""
        small_data = pd.DataFrame([{
            'insight_id': 'test',
            'outcome_date': datetime.utcnow(),
            'accuracy': 90
        }])
        trend = learning_system._analyze_performance_trend(small_data)
        assert trend['trend'] == 'insufficient_data'


class TestDegradationDetection:
    """Test performance degradation detection."""

    @pytest.mark.asyncio
    async def test_no_degradation_detected(self, learning_system, good_feedback_data):
        """Test no degradation for good performance."""
        current_metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        trend = learning_system._analyze_performance_trend(good_feedback_data)
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance={'accuracy': 85},
            trend_analysis=trend
        )
        assert degradation['detected'] is False
        assert degradation['severity'] == 'none'

    @pytest.mark.asyncio
    async def test_degradation_below_threshold(self, learning_system):
        """Test degradation detection when below absolute threshold."""
        current_metrics = {'accuracy': 70}  # Below 85% threshold
        trend = {'trend': 'stable', 'significant': False}
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance=None,
            trend_analysis=trend
        )
        assert degradation['detected'] is True
        assert degradation['severity'] == 'high'
        assert len(degradation['reasons']) > 0

    @pytest.mark.asyncio
    async def test_degradation_vs_baseline(self, learning_system):
        """Test degradation detection vs baseline."""
        current_metrics = {'accuracy': 80}
        baseline = {'accuracy': 95}  # 15.8% drop
        trend = {'trend': 'stable', 'significant': False}
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance=baseline,
            trend_analysis=trend
        )
        assert degradation['detected'] is True
        assert 'dropped' in degradation['reasons'][0].lower()

    @pytest.mark.asyncio
    async def test_degradation_trending_down(self, learning_system, degraded_feedback_data):
        """Test degradation detection from trending down."""
        current_metrics = learning_system._calculate_performance_metrics(degraded_feedback_data)
        trend = learning_system._analyze_performance_trend(degraded_feedback_data)
        degradation = learning_system._detect_performance_degradation(
            current_metrics,
            baseline_performance={'accuracy': 90},
            trend_analysis=trend
        )
        # Should detect some form of degradation
        assert degradation['detected'] is True


class TestRetrainingRecommendation:
    """Test retraining recommendation generation."""

    @pytest.mark.asyncio
    async def test_urgent_retraining_recommendation(self, learning_system):
        """Test urgent retraining recommendation."""
        current_metrics = {'accuracy': 70}
        degradation = {
            'detected': True,
            'severity': 'high',
            'reasons': ['Accuracy below threshold'],
            'current_accuracy': 70,
            'baseline_accuracy': 90
        }
        trend = {'trend': 'degrading', 'significant': True}
        recommendation = learning_system._generate_retraining_recommendation(
            'test_model',
            current_metrics,
            degradation,
            trend
        )
        assert recommendation['recommended'] is True
        assert recommendation['priority'] == 'urgent'
        assert 'immediately' in recommendation['recommendation'].lower()

    @pytest.mark.asyncio
    async def test_no_retraining_needed(self, learning_system, good_feedback_data):
        """Test no retraining recommendation for good performance."""
        current_metrics = learning_system._calculate_performance_metrics(good_feedback_data)
        degradation = {'detected': False, 'severity': 'none'}
        trend = learning_system._analyze_performance_trend(good_feedback_data)
        recommendation = learning_system._generate_retraining_recommendation(
            'test_model',
            current_metrics,
            degradation,
            trend
        )
        assert recommendation['recommended'] is False
        assert recommendation['priority'] == 'none'


class TestErrorPatternDetection:
    """Test error pattern identification."""

    @pytest.mark.asyncio
    async def test_systematic_bias_detection(self, learning_system, biased_feedback_data):
        """Test detection of systematic bias."""
        patterns = learning_system._identify_error_patterns(biased_feedback_data)
        # Should detect over-prediction bias
        bias_patterns = [p for p in patterns if p['pattern'] == 'systematic_bias']
        assert len(bias_patterns) > 0
        bias = bias_patterns[0]
        assert 'over-prediction' in bias['description']
        assert bias['severity'] in ['high', 'medium']

    @pytest.mark.asyncio
    async def test_no_patterns_for_good_data(self, learning_system, good_feedback_data):
        """Test no significant patterns for good data."""
        patterns = learning_system._identify_error_patterns(good_feedback_data)
        # May have some minor patterns, but no high severity
        high_severity = [p for p in patterns if p.get('severity') == 'high']
        assert len(high_severity) == 0


class TestConfidenceCalibration:
    """Test confidence calibration analysis."""

    @pytest.mark.asyncio
    async def test_well_calibrated_confidence(self, learning_system, good_feedback_data):
        """Test well-calibrated confidence scores."""
        calibration = learning_system._calculate_confidence_calibration(good_feedback_data)
        # Good data with consistent confidence should be well calibrated
        if 'overall_calibration_error' in calibration:
            # Small calibration error indicates good calibration
            assert calibration['overall_calibration_error'] < 20

    @pytest.mark.asyncio
    async def test_poorly_calibrated_confidence(self, learning_system, poorly_calibrated_feedback_data):
        """Test poorly calibrated confidence scores."""
        calibration = learning_system._calculate_confidence_calibration(poorly_calibrated_feedback_data)
        # Should detect poor calibration
        assert calibration['calibrated'] is False
        if 'by_confidence_range' in calibration:
            assert len(calibration['by_confidence_range']) > 0

    @pytest.mark.asyncio
    async def test_no_confidence_data(self, learning_system):
        """Test calibration when no confidence scores available."""
        no_conf_data = pd.DataFrame([{
            'predicted_value': 100,
            'actual_value': 95,
            'accuracy': 95
        }])
        calibration = learning_system._calculate_confidence_calibration(no_conf_data)
        assert calibration['calibrated'] is False
        assert 'reason' in calibration


class TestCompletePerformanceAnalysis:
    """Test complete performance analysis workflow."""

    @pytest.mark.asyncio
    async def test_analyze_good_performance(self, learning_system, good_feedback_data):
        """Test complete analysis of good performance."""
        result = await learning_system.analyze_model_performance(
            model_name='test_model',
            feedback_data=good_feedback_data,
            baseline_performance={'accuracy': 85}
        )
        assert result['model_name'] == 'test_model'
        assert result['status'] != 'insufficient_feedback'
        assert 'current_performance' in result
        assert 'trend_analysis' in result
        assert 'degradation_detected' in result
        assert 'retraining_recommendation' in result
        # Good performance should not recommend retraining
        assert result['retraining_recommendation']['recommended'] is False

    @pytest.mark.asyncio
    async def test_analyze_degraded_performance(self, learning_system, degraded_feedback_data):
        """Test complete analysis of degraded performance."""
        result = await learning_system.analyze_model_performance(
            model_name='degraded_model',
            feedback_data=degraded_feedback_data,
            baseline_performance={'accuracy': 90}
        )
        assert result['degradation_detected']['detected'] is True
        assert result['retraining_recommendation']['recommended'] is True

    @pytest.mark.asyncio
    async def test_insufficient_feedback(self, learning_system):
        """Test analysis with insufficient feedback samples."""
        small_data = pd.DataFrame([{
            'insight_id': 'test',
            'outcome_date': datetime.utcnow(),
            'predicted_value': 100,
            'actual_value': 95,
            'error': 5,
            'error_pct': 5,
            'accuracy': 95,
            'confidence': 85
        }])
        result = await learning_system.analyze_model_performance(
            model_name='test_model',
            feedback_data=small_data
        )
        assert result['status'] == 'insufficient_feedback'
        assert result['feedback_samples'] == 1
        assert result['required_samples'] == 30


class TestLearningInsights:
    """Test learning insight generation."""

    @pytest.mark.asyncio
    async def test_generate_urgent_retraining_insight(self, learning_system):
        """Test generation of urgent retraining insight."""
        analyses = [{
            'model_name': 'urgent_model',
            'retraining_recommendation': {
                'priority': 'urgent',
                'recommended': True
            },
            'degradation_detected': {
                'detected': True
            }
        }]
        insights = await learning_system.generate_learning_insights(
            analyses,
            tenant_id='tenant_123'
        )
        # Should generate urgent warning
        urgent_insights = [i for i in insights if i['priority'] == 'urgent']
        assert len(urgent_insights) > 0
        insight = urgent_insights[0]
        assert insight['type'] == 'warning'
        assert 'urgent_model' in insight['description'].lower()

    @pytest.mark.asyncio
    async def test_generate_system_health_insight(self, learning_system):
        """Test generation of system health insight."""
        # 3 models, 1 degraded
        analyses = [
            {
                'model_name': 'model_1',
                'degradation_detected': {'detected': False},
                'retraining_recommendation': {'priority': 'none'}
            },
            {
                'model_name': 'model_2',
                'degradation_detected': {'detected': False},
                'retraining_recommendation': {'priority': 'none'}
            },
            {
                'model_name': 'model_3',
                'degradation_detected': {'detected': True},
                'retraining_recommendation': {'priority': 'high'}
            }
        ]
        insights = await learning_system.generate_learning_insights(
            analyses,
            tenant_id='tenant_123'
        )
        # A system health insight may be generated (~67% healthy, below 80%),
        # though whether it triggers depends on the configured threshold.
        # At minimum the call should not raise.
        assert isinstance(insights, list)

    @pytest.mark.asyncio
    async def test_generate_calibration_insight(self, learning_system):
        """Test generation of calibration insight."""
        analyses = [{
            'model_name': 'model_1',
            'degradation_detected': {'detected': False},
            'retraining_recommendation': {'priority': 'none'},
            'confidence_calibration': {
                'calibrated': False,
                'overall_calibration_error': 15
            }
        }]
        insights = await learning_system.generate_learning_insights(
            analyses,
            tenant_id='tenant_123'
        )
        # Should generate calibration insight
        calibration_insights = [
            i for i in insights
            if 'calibration' in i['title'].lower()
        ]
        assert len(calibration_insights) > 0


class TestROICalculation:
    """Test ROI calculation."""

    @pytest.mark.asyncio
    async def test_calculate_roi_with_impact_values(self, learning_system):
        """Test ROI calculation with impact values."""
        feedback_data = pd.DataFrame([
            {
                'accuracy': 90,
                'impact_value': 1000
            },
            {
                'accuracy': 85,
                'impact_value': 1500
            },
            {
                'accuracy': 95,
                'impact_value': 800
            }
        ])
        roi = await learning_system.calculate_roi(
            feedback_data,
            insight_type='demand_forecast'
        )
        assert roi['insight_type'] == 'demand_forecast'
        assert roi['samples'] == 3
        assert roi['avg_accuracy'] == 90.0
        assert roi['total_impact_value'] == 3300
        assert roi['roi_validated'] is True

    @pytest.mark.asyncio
    async def test_calculate_roi_without_impact_values(self, learning_system, good_feedback_data):
        """Test ROI calculation without impact values."""
        roi = await learning_system.calculate_roi(
            good_feedback_data,
            insight_type='yield_prediction'
        )
        assert roi['insight_type'] == 'yield_prediction'
        assert roi['samples'] > 0
        assert 'avg_accuracy' in roi
        assert roi['roi_validated'] is False
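

# A minimal way to run just this suite locally (a sketch, assuming pytest and
# pytest-asyncio are installed and the repository root is on PYTHONPATH):
#
#     pytest services/ai_insights/tests/test_feedback_learning_system.py -v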