""" Feedback Loop & Learning System Enables continuous improvement through outcome tracking and model retraining """ import pandas as pd import numpy as np from typing import Dict, List, Any, Optional, Tuple from datetime import datetime, timedelta from uuid import UUID import structlog from scipy import stats from collections import defaultdict logger = structlog.get_logger() class FeedbackLearningSystem: """ Manages feedback collection, model performance tracking, and retraining triggers. Key Responsibilities: 1. Aggregate feedback from applied insights 2. Calculate model performance metrics (accuracy, precision, recall) 3. Detect performance degradation 4. Trigger automatic retraining when needed 5. Calibrate confidence scores based on actual accuracy 6. Generate learning insights for model improvement Workflow: - Feedback continuously recorded via AIInsightsClient - Periodic performance analysis (daily/weekly) - Automatic alerts when performance degrades - Retraining recommendations with priority """ def __init__( self, performance_threshold: float = 0.85, # Minimum acceptable accuracy degradation_threshold: float = 0.10, # 10% drop triggers alert min_feedback_samples: int = 30, # Minimum samples for analysis retraining_window_days: int = 90 # Consider last 90 days ): self.performance_threshold = performance_threshold self.degradation_threshold = degradation_threshold self.min_feedback_samples = min_feedback_samples self.retraining_window_days = retraining_window_days async def analyze_model_performance( self, model_name: str, feedback_data: pd.DataFrame, baseline_performance: Optional[Dict[str, float]] = None ) -> Dict[str, Any]: """ Analyze model performance based on feedback data. Args: model_name: Name of the model (e.g., 'hybrid_forecaster', 'yield_predictor') feedback_data: DataFrame with columns: - insight_id - applied_at - outcome_date - predicted_value - actual_value - error - error_pct - accuracy baseline_performance: Optional baseline metrics for comparison Returns: Performance analysis with metrics, trends, and recommendations """ logger.info( "Analyzing model performance", model_name=model_name, feedback_samples=len(feedback_data) ) if len(feedback_data) < self.min_feedback_samples: return self._insufficient_feedback_response( model_name, len(feedback_data), self.min_feedback_samples ) # Step 1: Calculate current performance metrics current_metrics = self._calculate_performance_metrics(feedback_data) # Step 2: Analyze performance trend over time trend_analysis = self._analyze_performance_trend(feedback_data) # Step 3: Detect performance degradation degradation_detected = self._detect_performance_degradation( current_metrics, baseline_performance, trend_analysis ) # Step 4: Generate retraining recommendation retraining_recommendation = self._generate_retraining_recommendation( model_name, current_metrics, degradation_detected, trend_analysis ) # Step 5: Identify error patterns error_patterns = self._identify_error_patterns(feedback_data) # Step 6: Calculate confidence calibration confidence_calibration = self._calculate_confidence_calibration(feedback_data) logger.info( "Model performance analysis complete", model_name=model_name, current_accuracy=current_metrics['accuracy'], degradation_detected=degradation_detected['detected'], retraining_recommended=retraining_recommendation['recommended'] ) return { 'model_name': model_name, 'analyzed_at': datetime.utcnow().isoformat(), 'feedback_samples': len(feedback_data), 'date_range': { 'start': feedback_data['outcome_date'].min().isoformat(), 'end': feedback_data['outcome_date'].max().isoformat() }, 'current_performance': current_metrics, 'baseline_performance': baseline_performance, 'trend_analysis': trend_analysis, 'degradation_detected': degradation_detected, 'retraining_recommendation': retraining_recommendation, 'error_patterns': error_patterns, 'confidence_calibration': confidence_calibration } def _insufficient_feedback_response( self, model_name: str, current_samples: int, required_samples: int ) -> Dict[str, Any]: """Return response when insufficient feedback data.""" return { 'model_name': model_name, 'analyzed_at': datetime.utcnow().isoformat(), 'status': 'insufficient_feedback', 'feedback_samples': current_samples, 'required_samples': required_samples, 'current_performance': None, 'recommendation': f'Need {required_samples - current_samples} more feedback samples for reliable analysis' } def _calculate_performance_metrics( self, feedback_data: pd.DataFrame ) -> Dict[str, float]: """ Calculate comprehensive performance metrics. Metrics: - Accuracy: % of predictions within acceptable error - MAE: Mean Absolute Error - RMSE: Root Mean Squared Error - MAPE: Mean Absolute Percentage Error - Bias: Systematic over/under prediction - R²: Correlation between predicted and actual """ predicted = feedback_data['predicted_value'].values actual = feedback_data['actual_value'].values # Filter out invalid values valid_mask = ~(np.isnan(predicted) | np.isnan(actual)) predicted = predicted[valid_mask] actual = actual[valid_mask] if len(predicted) == 0: return { 'accuracy': 0, 'mae': 0, 'rmse': 0, 'mape': 0, 'bias': 0, 'r_squared': 0 } # Calculate errors errors = predicted - actual abs_errors = np.abs(errors) pct_errors = np.abs(errors / actual) * 100 if np.all(actual != 0) else np.zeros_like(errors) # MAE and RMSE mae = float(np.mean(abs_errors)) rmse = float(np.sqrt(np.mean(errors ** 2))) # MAPE (excluding cases where actual = 0) valid_pct_mask = actual != 0 mape = float(np.mean(pct_errors[valid_pct_mask])) if np.any(valid_pct_mask) else 0 # Accuracy (% within 10% error) within_10pct = np.sum(pct_errors <= 10) / len(pct_errors) * 100 # Bias (mean error - positive = over-prediction) bias = float(np.mean(errors)) # R² (correlation) if len(predicted) > 1 and np.std(actual) > 0: correlation = np.corrcoef(predicted, actual)[0, 1] r_squared = correlation ** 2 else: r_squared = 0 return { 'accuracy': round(within_10pct, 2), # % within 10% error 'mae': round(mae, 2), 'rmse': round(rmse, 2), 'mape': round(mape, 2), 'bias': round(bias, 2), 'r_squared': round(r_squared, 3), 'sample_size': len(predicted) } def _analyze_performance_trend( self, feedback_data: pd.DataFrame ) -> Dict[str, Any]: """ Analyze performance trend over time. Returns trend direction (improving/stable/degrading) and slope. """ # Sort by date df = feedback_data.sort_values('outcome_date').copy() # Calculate rolling accuracy (7-day window) df['rolling_accuracy'] = df['accuracy'].rolling(window=7, min_periods=3).mean() # Linear trend if len(df) >= 10: # Use day index as x df['day_index'] = (df['outcome_date'] - df['outcome_date'].min()).dt.days # Fit linear regression valid_mask = ~np.isnan(df['rolling_accuracy']) if valid_mask.sum() >= 10: x = df.loc[valid_mask, 'day_index'].values y = df.loc[valid_mask, 'rolling_accuracy'].values slope, intercept, r_value, p_value, std_err = stats.linregress(x, y) # Determine trend if p_value < 0.05: if slope > 0.1: trend = 'improving' elif slope < -0.1: trend = 'degrading' else: trend = 'stable' else: trend = 'stable' return { 'trend': trend, 'slope': round(float(slope), 4), 'p_value': round(float(p_value), 4), 'significant': p_value < 0.05, 'recent_performance': round(float(df['rolling_accuracy'].iloc[-1]), 2), 'initial_performance': round(float(df['rolling_accuracy'].dropna().iloc[0]), 2) } # Not enough data for trend return { 'trend': 'insufficient_data', 'slope': 0, 'p_value': 1.0, 'significant': False } def _detect_performance_degradation( self, current_metrics: Dict[str, float], baseline_performance: Optional[Dict[str, float]], trend_analysis: Dict[str, Any] ) -> Dict[str, Any]: """ Detect if model performance has degraded. Degradation triggers: 1. Current accuracy below threshold (85%) 2. Significant drop from baseline (>10%) 3. Degrading trend detected """ degradation_reasons = [] severity = 'none' # Check absolute performance if current_metrics['accuracy'] < self.performance_threshold * 100: degradation_reasons.append( f"Accuracy {current_metrics['accuracy']:.1f}% below threshold {self.performance_threshold*100}%" ) severity = 'high' # Check vs baseline if baseline_performance and 'accuracy' in baseline_performance: baseline_acc = baseline_performance['accuracy'] current_acc = current_metrics['accuracy'] drop_pct = (baseline_acc - current_acc) / baseline_acc if drop_pct > self.degradation_threshold: degradation_reasons.append( f"Accuracy dropped {drop_pct*100:.1f}% from baseline {baseline_acc:.1f}%" ) severity = 'high' if severity != 'high' else severity # Check trend if trend_analysis.get('trend') == 'degrading' and trend_analysis.get('significant'): degradation_reasons.append( f"Degrading trend detected (slope: {trend_analysis['slope']:.4f})" ) severity = 'medium' if severity == 'none' else severity detected = len(degradation_reasons) > 0 return { 'detected': detected, 'severity': severity, 'reasons': degradation_reasons, 'current_accuracy': current_metrics['accuracy'], 'baseline_accuracy': baseline_performance.get('accuracy') if baseline_performance else None } def _generate_retraining_recommendation( self, model_name: str, current_metrics: Dict[str, float], degradation_detected: Dict[str, Any], trend_analysis: Dict[str, Any] ) -> Dict[str, Any]: """ Generate retraining recommendation based on performance analysis. Priority Levels: - urgent: Severe degradation, retrain immediately - high: Performance below threshold, retrain soon - medium: Trending down, schedule retraining - low: Stable, routine retraining - none: No retraining needed """ if degradation_detected['detected']: severity = degradation_detected['severity'] if severity == 'high': priority = 'urgent' recommendation = f"Retrain {model_name} immediately - severe performance degradation" elif severity == 'medium': priority = 'high' recommendation = f"Schedule {model_name} retraining within 7 days" else: priority = 'medium' recommendation = f"Schedule routine {model_name} retraining" return { 'recommended': True, 'priority': priority, 'recommendation': recommendation, 'reasons': degradation_detected['reasons'], 'estimated_improvement': self._estimate_retraining_benefit( current_metrics, degradation_detected ) } # Check if routine retraining is due (e.g., every 90 days) # This would require tracking last_retrained_at else: return { 'recommended': False, 'priority': 'none', 'recommendation': f"{model_name} performance is acceptable, no immediate retraining needed", 'next_review_date': (datetime.utcnow() + timedelta(days=30)).isoformat() } def _estimate_retraining_benefit( self, current_metrics: Dict[str, float], degradation_detected: Dict[str, Any] ) -> Dict[str, Any]: """Estimate expected improvement from retraining.""" baseline_acc = degradation_detected.get('baseline_accuracy') current_acc = current_metrics['accuracy'] if baseline_acc: # Expect to recover 70-80% of lost performance expected_improvement = (baseline_acc - current_acc) * 0.75 expected_new_acc = current_acc + expected_improvement return { 'expected_accuracy_improvement': round(expected_improvement, 2), 'expected_new_accuracy': round(expected_new_acc, 2), 'confidence': 'medium' } return { 'expected_accuracy_improvement': 'unknown', 'confidence': 'low' } def _identify_error_patterns( self, feedback_data: pd.DataFrame ) -> List[Dict[str, Any]]: """ Identify systematic error patterns. Patterns: - Consistent over/under prediction - Higher errors for specific ranges - Day-of-week effects - Seasonal effects """ patterns = [] # Pattern 1: Systematic bias mean_error = feedback_data['error'].mean() if abs(mean_error) > feedback_data['error'].std() * 0.5: direction = 'over-prediction' if mean_error > 0 else 'under-prediction' patterns.append({ 'pattern': 'systematic_bias', 'description': f'Consistent {direction} by {abs(mean_error):.1f} units', 'severity': 'high' if abs(mean_error) > 10 else 'medium', 'recommendation': 'Recalibrate model bias term' }) # Pattern 2: High error for large values if 'predicted_value' in feedback_data.columns: # Split into quartiles feedback_data['value_quartile'] = pd.qcut( feedback_data['predicted_value'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'], duplicates='drop' ) quartile_errors = feedback_data.groupby('value_quartile')['error_pct'].mean() if len(quartile_errors) == 4 and quartile_errors['Q4'] > quartile_errors['Q1'] * 1.5: patterns.append({ 'pattern': 'high_value_error', 'description': f'Higher errors for large predictions (Q4: {quartile_errors["Q4"]:.1f}% vs Q1: {quartile_errors["Q1"]:.1f}%)', 'severity': 'medium', 'recommendation': 'Add log transformation or separate model for high values' }) # Pattern 3: Day-of-week effect if 'outcome_date' in feedback_data.columns: feedback_data['day_of_week'] = pd.to_datetime(feedback_data['outcome_date']).dt.dayofweek dow_errors = feedback_data.groupby('day_of_week')['error_pct'].mean() if len(dow_errors) >= 5 and dow_errors.max() > dow_errors.min() * 1.5: worst_day = dow_errors.idxmax() day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] patterns.append({ 'pattern': 'day_of_week_effect', 'description': f'Higher errors on {day_names[worst_day]} ({dow_errors[worst_day]:.1f}%)', 'severity': 'low', 'recommendation': 'Add day-of-week features to model' }) return patterns def _calculate_confidence_calibration( self, feedback_data: pd.DataFrame ) -> Dict[str, Any]: """ Calculate how well confidence scores match actual accuracy. Well-calibrated model: 80% confidence → 80% accuracy """ if 'confidence' not in feedback_data.columns: return {'calibrated': False, 'reason': 'No confidence scores available'} # Bin by confidence ranges feedback_data['confidence_bin'] = pd.cut( feedback_data['confidence'], bins=[0, 60, 70, 80, 90, 100], labels=['<60', '60-70', '70-80', '80-90', '90+'] ) calibration_results = [] for conf_bin in feedback_data['confidence_bin'].unique(): if pd.isna(conf_bin): continue bin_data = feedback_data[feedback_data['confidence_bin'] == conf_bin] if len(bin_data) >= 5: avg_confidence = bin_data['confidence'].mean() avg_accuracy = bin_data['accuracy'].mean() calibration_error = abs(avg_confidence - avg_accuracy) calibration_results.append({ 'confidence_range': str(conf_bin), 'avg_confidence': round(avg_confidence, 1), 'avg_accuracy': round(avg_accuracy, 1), 'calibration_error': round(calibration_error, 1), 'sample_size': len(bin_data), 'well_calibrated': calibration_error < 10 }) # Overall calibration if calibration_results: overall_calibration_error = np.mean([r['calibration_error'] for r in calibration_results]) well_calibrated = overall_calibration_error < 10 return { 'calibrated': well_calibrated, 'overall_calibration_error': round(overall_calibration_error, 2), 'by_confidence_range': calibration_results, 'recommendation': 'Confidence scores are well-calibrated' if well_calibrated else 'Recalibrate confidence scoring algorithm' } return {'calibrated': False, 'reason': 'Insufficient data for calibration analysis'} async def generate_learning_insights( self, performance_analyses: List[Dict[str, Any]], tenant_id: str ) -> List[Dict[str, Any]]: """ Generate high-level insights about learning system performance. Args: performance_analyses: List of model performance analyses tenant_id: Tenant identifier Returns: Learning insights for system improvement """ insights = [] # Insight 1: Models needing urgent retraining urgent_models = [ a for a in performance_analyses if a.get('retraining_recommendation', {}).get('priority') == 'urgent' ] if urgent_models: model_names = ', '.join([a['model_name'] for a in urgent_models]) insights.append({ 'type': 'warning', 'priority': 'urgent', 'category': 'system', 'title': f'Urgent Model Retraining Required: {len(urgent_models)} Models', 'description': f'Models requiring immediate retraining: {model_names}. Performance has degraded significantly.', 'impact_type': 'system_health', 'confidence': 95, 'metrics_json': { 'tenant_id': tenant_id, 'urgent_models': [a['model_name'] for a in urgent_models], 'affected_count': len(urgent_models) }, 'actionable': True, 'recommendation_actions': [{ 'label': 'Retrain Models', 'action': 'trigger_model_retraining', 'params': {'models': [a['model_name'] for a in urgent_models]} }], 'source_service': 'ai_insights', 'source_model': 'feedback_learning_system' }) # Insight 2: Overall system health total_models = len(performance_analyses) healthy_models = [ a for a in performance_analyses if not a.get('degradation_detected', {}).get('detected', False) ] health_pct = (len(healthy_models) / total_models * 100) if total_models > 0 else 0 if health_pct < 80: insights.append({ 'type': 'warning', 'priority': 'high', 'category': 'system', 'title': f'Learning System Health: {health_pct:.0f}%', 'description': f'{len(healthy_models)} of {total_models} models are performing well. System-wide performance review recommended.', 'impact_type': 'system_health', 'confidence': 90, 'metrics_json': { 'tenant_id': tenant_id, 'total_models': total_models, 'healthy_models': len(healthy_models), 'health_percentage': round(health_pct, 1) }, 'actionable': True, 'recommendation_actions': [{ 'label': 'Review System Health', 'action': 'review_learning_system', 'params': {'tenant_id': tenant_id} }], 'source_service': 'ai_insights', 'source_model': 'feedback_learning_system' }) # Insight 3: Confidence calibration issues poorly_calibrated = [ a for a in performance_analyses if not a.get('confidence_calibration', {}).get('calibrated', True) ] if poorly_calibrated: insights.append({ 'type': 'opportunity', 'priority': 'medium', 'category': 'system', 'title': f'Confidence Calibration Needed: {len(poorly_calibrated)} Models', 'description': 'Confidence scores do not match actual accuracy. Recalibration recommended.', 'impact_type': 'system_improvement', 'confidence': 85, 'metrics_json': { 'tenant_id': tenant_id, 'models_needing_calibration': [a['model_name'] for a in poorly_calibrated] }, 'actionable': True, 'recommendation_actions': [{ 'label': 'Recalibrate Confidence Scores', 'action': 'recalibrate_confidence', 'params': {'models': [a['model_name'] for a in poorly_calibrated]} }], 'source_service': 'ai_insights', 'source_model': 'feedback_learning_system' }) return insights async def calculate_roi( self, feedback_data: pd.DataFrame, insight_type: str ) -> Dict[str, Any]: """ Calculate ROI for applied insights. Args: feedback_data: Feedback data with business impact metrics insight_type: Type of insight (e.g., 'demand_forecast', 'safety_stock') Returns: ROI calculation with cost savings and accuracy metrics """ if len(feedback_data) == 0: return {'status': 'insufficient_data', 'samples': 0} # Calculate accuracy avg_accuracy = feedback_data['accuracy'].mean() # Estimate cost savings (would be more sophisticated in production) # For now, use impact_value from insights if available if 'impact_value' in feedback_data.columns: total_impact = feedback_data['impact_value'].sum() avg_impact = feedback_data['impact_value'].mean() return { 'insight_type': insight_type, 'samples': len(feedback_data), 'avg_accuracy': round(avg_accuracy, 2), 'total_impact_value': round(total_impact, 2), 'avg_impact_per_insight': round(avg_impact, 2), 'roi_validated': True } return { 'insight_type': insight_type, 'samples': len(feedback_data), 'avg_accuracy': round(avg_accuracy, 2), 'roi_validated': False, 'note': 'Impact values not tracked in feedback' }