Files
bakery-ia/services/forecasting/app/api/performance_monitoring.py

288 lines
8.8 KiB
Python
Raw Normal View History

2026-01-21 17:17:16 +01:00
# ================================================================
# services/forecasting/app/api/performance_monitoring.py
# ================================================================
"""
Performance Monitoring API - Track and analyze forecast accuracy over time
"""
from fastapi import APIRouter, Depends, HTTPException, Path, Query, status
from typing import Dict, Any
from uuid import UUID
import structlog
from pydantic import BaseModel, Field
from app.services.performance_monitoring_service import PerformanceMonitoringService
from shared.auth.decorators import get_current_user_dep
from shared.auth.access_control import require_user_role
from shared.routing import RouteBuilder
from app.core.database import get_db
from sqlalchemy.ext.asyncio import AsyncSession
# Route helper bound to the 'forecasting' service name; every endpoint below
# derives its URL path from it via build_base_route(...).
route_builder = RouteBuilder('forecasting')
# All endpoints in this module are registered on this router and grouped under
# the "performance-monitoring" tag.
router = APIRouter(tags=["performance-monitoring"])
# Module-wide structlog logger; handlers pass context as keyword arguments.
logger = structlog.get_logger()
# ================================================================
# Request/Response Schemas
# ================================================================
class AccuracySummaryRequest(BaseModel):
    """Request model for accuracy summary"""

    # Size of the analysis window in days; validated to the range 1..365,
    # defaulting to 30 when the field is omitted.
    days: int = Field(
        30,
        description="Analysis period in days",
        ge=1,
        le=365,
    )
class DegradationAnalysisRequest(BaseModel):
    """Request model for degradation analysis"""

    # Number of days to look back; validated to the range 7..365,
    # defaulting to 30 when the field is omitted.
    lookback_days: int = Field(
        30,
        description="Days to analyze",
        ge=7,
        le=365,
    )
class ModelAgeCheckRequest(BaseModel):
    """Request model for model age check"""

    # Age threshold in days; validated to the range 1..90,
    # defaulting to 30 when the field is omitted.
    max_age_days: int = Field(
        30,
        description="Max acceptable model age",
        ge=1,
        le=90,
    )
class PerformanceReportRequest(BaseModel):
    """Request model for comprehensive performance report"""

    # Size of the reporting window in days; validated to the range 1..365,
    # defaulting to 30 when the field is omitted.
    days: int = Field(
        30,
        description="Analysis period in days",
        ge=1,
        le=365,
    )
# ================================================================
# Endpoints
# ================================================================
@router.get(
    route_builder.build_base_route("monitoring/accuracy-summary"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def get_accuracy_summary(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    days: int = Query(30, ge=1, le=365, description="Analysis period in days"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Get forecast accuracy summary for recent period

    Returns overall metrics, validation coverage, and health status.

    Args:
        tenant_id: Tenant whose forecasts are summarised (path parameter).
        days: Analysis period in days, 1-365 (default 30).
        current_user: Authenticated user context; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of ``PerformanceMonitoringService.get_accuracy_summary``,
        passed through unchanged.

    Raises:
        HTTPException: 500 when the service call fails unexpectedly. An
            ``HTTPException`` raised inside the call is propagated as-is.
    """
    try:
        logger.info(
            "Getting accuracy summary",
            tenant_id=tenant_id,
            days=days,
            user_id=current_user.get("user_id")
        )
        service = PerformanceMonitoringService(db)
        summary = await service.get_accuracy_summary(
            tenant_id=tenant_id,
            days=days
        )
        return summary
    except HTTPException:
        # Keep the status code/detail of deliberate HTTP errors instead of
        # masking them as a generic 500 in the handler below.
        raise
    except Exception as e:
        logger.error(
            "Failed to get accuracy summary",
            tenant_id=tenant_id,
            error=str(e),
            exc_info=True
        )
        # NOTE(review): the detail echoes the raw exception text to the client;
        # kept for compatibility, but consider a generic message.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get accuracy summary: {str(e)}"
        ) from e
@router.get(
    route_builder.build_base_route("monitoring/degradation-analysis"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def analyze_performance_degradation(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    lookback_days: int = Query(30, ge=7, le=365, description="Days to analyze"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Detect if forecast performance is degrading over time

    Compares first half vs second half of period and identifies poor performers.

    Args:
        tenant_id: Tenant whose forecasts are analysed (path parameter).
        lookback_days: Days to analyze, 7-365 (default 30).
        current_user: Authenticated user context; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of
        ``PerformanceMonitoringService.detect_performance_degradation``,
        passed through unchanged.

    Raises:
        HTTPException: 500 when the service call fails unexpectedly. An
            ``HTTPException`` raised inside the call is propagated as-is.
    """
    try:
        logger.info(
            "Analyzing performance degradation",
            tenant_id=tenant_id,
            lookback_days=lookback_days,
            user_id=current_user.get("user_id")
        )
        service = PerformanceMonitoringService(db)
        analysis = await service.detect_performance_degradation(
            tenant_id=tenant_id,
            lookback_days=lookback_days
        )
        return analysis
    except HTTPException:
        # Keep the status code/detail of deliberate HTTP errors instead of
        # masking them as a generic 500 in the handler below.
        raise
    except Exception as e:
        logger.error(
            "Failed to analyze degradation",
            tenant_id=tenant_id,
            error=str(e),
            exc_info=True
        )
        # NOTE(review): the detail echoes the raw exception text to the client;
        # kept for compatibility, but consider a generic message.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to analyze degradation: {str(e)}"
        ) from e
@router.get(
    route_builder.build_base_route("monitoring/model-age"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def check_model_age(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    max_age_days: int = Query(30, ge=1, le=90, description="Max acceptable model age"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Check if models are outdated and need retraining

    Returns models in use and identifies those needing updates.

    Args:
        tenant_id: Tenant whose models are checked (path parameter).
        max_age_days: Max acceptable model age in days, 1-90 (default 30).
        current_user: Authenticated user context; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of ``PerformanceMonitoringService.check_model_age``,
        passed through unchanged.

    Raises:
        HTTPException: 500 when the service call fails unexpectedly. An
            ``HTTPException`` raised inside the call is propagated as-is.
    """
    try:
        logger.info(
            "Checking model age",
            tenant_id=tenant_id,
            max_age_days=max_age_days,
            user_id=current_user.get("user_id")
        )
        service = PerformanceMonitoringService(db)
        analysis = await service.check_model_age(
            tenant_id=tenant_id,
            max_age_days=max_age_days
        )
        return analysis
    except HTTPException:
        # Keep the status code/detail of deliberate HTTP errors instead of
        # masking them as a generic 500 in the handler below.
        raise
    except Exception as e:
        logger.error(
            "Failed to check model age",
            tenant_id=tenant_id,
            error=str(e),
            exc_info=True
        )
        # NOTE(review): the detail echoes the raw exception text to the client;
        # kept for compatibility, but consider a generic message.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to check model age: {str(e)}"
        ) from e
@router.post(
    route_builder.build_base_route("monitoring/performance-report"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def generate_performance_report(
    request: PerformanceReportRequest,
    tenant_id: UUID = Path(..., description="Tenant ID"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Generate comprehensive performance report

    Combines accuracy summary, degradation analysis, and model age check
    with actionable recommendations.

    Args:
        request: Request body; only its ``days`` field (analysis period)
            is used.
        tenant_id: Tenant the report is generated for (path parameter).
        current_user: Authenticated user context; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of
        ``PerformanceMonitoringService.generate_performance_report``,
        passed through unchanged.

    Raises:
        HTTPException: 500 when the service call fails unexpectedly. An
            ``HTTPException`` raised inside the call is propagated as-is.
    """
    try:
        logger.info(
            "Generating performance report",
            tenant_id=tenant_id,
            days=request.days,
            user_id=current_user.get("user_id")
        )
        service = PerformanceMonitoringService(db)
        report = await service.generate_performance_report(
            tenant_id=tenant_id,
            days=request.days
        )
        return report
    except HTTPException:
        # Keep the status code/detail of deliberate HTTP errors instead of
        # masking them as a generic 500 in the handler below.
        raise
    except Exception as e:
        logger.error(
            "Failed to generate performance report",
            tenant_id=tenant_id,
            error=str(e),
            exc_info=True
        )
        # NOTE(review): the detail echoes the raw exception text to the client;
        # kept for compatibility, but consider a generic message.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate performance report: {str(e)}"
        ) from e
@router.get(
    route_builder.build_base_route("monitoring/health"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def get_health_status(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Get quick health status for dashboards

    Returns simplified health metrics for UI display.

    Args:
        tenant_id: Tenant whose forecast health is reported (path parameter).
        current_user: Authenticated user context (not read in the body).
        db: Async database session handed to the monitoring service.

    Returns:
        A dict with ``status`` "unknown" plus a message when the 7-day
        summary is empty or flagged ``no_data``; otherwise ``status`` "ok"
        with ``health_status``, ``current_mape``, ``accuracy_percentage``,
        ``validation_coverage`` and a ``last_7_days`` sub-dict. Metrics the
        summary does not provide are reported as ``None``.

    Raises:
        HTTPException: 500 when the service call fails unexpectedly. An
            ``HTTPException`` raised inside the call is propagated as-is.
    """
    try:
        service = PerformanceMonitoringService(db)
        # Get 7-day summary for quick health check
        summary = await service.get_accuracy_summary(
            tenant_id=tenant_id,
            days=7
        )
        # An empty summary carries no metrics either, so it is reported the
        # same way as an explicit "no_data" status.
        if not summary or summary.get("status") == "no_data":
            return {
                "status": "unknown",
                "message": "No recent validation data available",
                "health_status": "unknown"
            }
        # Tolerate a summary with a missing or null "average_metrics" entry:
        # the dashboard then gets None for those metrics rather than a 500.
        average_metrics = summary.get("average_metrics") or {}
        return {
            "status": "ok",
            "health_status": summary.get("health_status"),
            "current_mape": average_metrics.get("mape"),
            "accuracy_percentage": average_metrics.get("accuracy_percentage"),
            "validation_coverage": summary.get("coverage_percentage"),
            "last_7_days": {
                "validation_runs": summary.get("validation_runs"),
                "forecasts_evaluated": summary.get("total_forecasts_evaluated")
            }
        }
    except HTTPException:
        # Keep the status code/detail of deliberate HTTP errors instead of
        # masking them as a generic 500 in the handler below.
        raise
    except Exception as e:
        logger.error(
            "Failed to get health status",
            tenant_id=tenant_id,
            error=str(e),
            exc_info=True
        )
        # NOTE(review): the detail echoes the raw exception text to the client;
        # kept for compatibility, but consider a generic message.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get health status: {str(e)}"
        ) from e