288 lines
8.8 KiB
Python
288 lines
8.8 KiB
Python
|
|
# ================================================================
# services/forecasting/app/api/performance_monitoring.py
# ================================================================
"""
Performance Monitoring API - Track and analyze forecast accuracy over time
"""
|
||
|
|
|
||
|
|
from fastapi import APIRouter, Depends, HTTPException, Path, Query, status
|
||
|
|
from typing import Dict, Any
|
||
|
|
from uuid import UUID
|
||
|
|
import structlog
|
||
|
|
|
||
|
|
from pydantic import BaseModel, Field
|
||
|
|
from app.services.performance_monitoring_service import PerformanceMonitoringService
|
||
|
|
from shared.auth.decorators import get_current_user_dep
|
||
|
|
from shared.auth.access_control import require_user_role
|
||
|
|
from shared.routing import RouteBuilder
|
||
|
|
from app.core.database import get_db
|
||
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
|
|
||
|
|
# Module-wide structured logger used by every endpoint below.
logger = structlog.get_logger()

# Router that collects the performance-monitoring endpoints; the tag is the
# label attached to each of its routes.
router = APIRouter(tags=["performance-monitoring"])

# Produces the URL path for each endpoint via build_base_route(...).
route_builder = RouteBuilder("forecasting")
|
||
|
|
|
||
|
|
|
||
|
|
# ================================================================
# Request/Response Schemas
# ================================================================
|
||
|
|
|
||
|
|
class AccuracySummaryRequest(BaseModel):
    """Request model for accuracy summary"""

    # Size of the analysis window in days: defaults to 30 and is validated
    # to lie within 1..365 (inclusive).
    days: int = Field(
        default=30,
        ge=1,
        le=365,
        description="Analysis period in days",
    )
|
||
|
|
|
||
|
|
|
||
|
|
class DegradationAnalysisRequest(BaseModel):
    """Request model for degradation analysis"""

    # Number of days to look back: defaults to 30 and is validated to lie
    # within 7..365 (inclusive).
    lookback_days: int = Field(
        default=30,
        ge=7,
        le=365,
        description="Days to analyze",
    )
|
||
|
|
|
||
|
|
|
||
|
|
class ModelAgeCheckRequest(BaseModel):
    """Request model for model age check"""

    # Age threshold in days: defaults to 30 and is validated to lie within
    # 1..90 (inclusive).
    max_age_days: int = Field(
        default=30,
        ge=1,
        le=90,
        description="Max acceptable model age",
    )
|
||
|
|
|
||
|
|
|
||
|
|
class PerformanceReportRequest(BaseModel):
    """Request model for comprehensive performance report"""

    # Size of the reporting window in days: defaults to 30 and is validated
    # to lie within 1..365 (inclusive).
    days: int = Field(
        default=30,
        ge=1,
        le=365,
        description="Analysis period in days",
    )
|
||
|
|
|
||
|
|
|
||
|
|
# ================================================================
# Endpoints
# ================================================================
|
||
|
|
|
||
|
|
@router.get(
    route_builder.build_base_route("monitoring/accuracy-summary"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def get_accuracy_summary(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    days: int = Query(30, ge=1, le=365, description="Analysis period in days"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Get forecast accuracy summary for recent period

    Returns overall metrics, validation coverage, and health status.

    Args:
        tenant_id: Tenant identifier taken from the URL path.
        days: Analysis window in days (1-365, default 30).
        current_user: Authenticated user mapping; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of ``PerformanceMonitoringService.get_accuracy_summary``,
        passed through unchanged.

    Raises:
        HTTPException: Any HTTPException raised while serving the request is
            propagated as-is; every other failure is reported as a 500.
    """
    try:
        logger.info(
            "Getting accuracy summary",
            tenant_id=tenant_id,
            days=days,
            user_id=current_user.get("user_id")
        )

        service = PerformanceMonitoringService(db)

        summary = await service.get_accuracy_summary(
            tenant_id=tenant_id,
            days=days
        )

        return summary

    except HTTPException:
        # A deliberate HTTP error raised further down must keep its own
        # status code instead of being rewritten as a 500 below.
        raise
    except Exception as e:
        logger.error(
            "Failed to get accuracy summary",
            tenant_id=tenant_id,
            error=str(e)
        )
        # NOTE(review): the detail echoes the raw exception text to the
        # client; consider a generic message if internals must not leak.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get accuracy summary: {str(e)}"
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.get(
    route_builder.build_base_route("monitoring/degradation-analysis"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def analyze_performance_degradation(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    lookback_days: int = Query(30, ge=7, le=365, description="Days to analyze"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Detect if forecast performance is degrading over time

    Compares first half vs second half of period and identifies poor performers.

    Args:
        tenant_id: Tenant identifier taken from the URL path.
        lookback_days: Number of days to analyze (7-365, default 30).
        current_user: Authenticated user mapping; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of
        ``PerformanceMonitoringService.detect_performance_degradation``,
        passed through unchanged.

    Raises:
        HTTPException: Any HTTPException raised while serving the request is
            propagated as-is; every other failure is reported as a 500.
    """
    try:
        logger.info(
            "Analyzing performance degradation",
            tenant_id=tenant_id,
            lookback_days=lookback_days,
            user_id=current_user.get("user_id")
        )

        service = PerformanceMonitoringService(db)

        analysis = await service.detect_performance_degradation(
            tenant_id=tenant_id,
            lookback_days=lookback_days
        )

        return analysis

    except HTTPException:
        # A deliberate HTTP error raised further down must keep its own
        # status code instead of being rewritten as a 500 below.
        raise
    except Exception as e:
        logger.error(
            "Failed to analyze degradation",
            tenant_id=tenant_id,
            error=str(e)
        )
        # NOTE(review): the detail echoes the raw exception text to the
        # client; consider a generic message if internals must not leak.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to analyze degradation: {str(e)}"
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.get(
    route_builder.build_base_route("monitoring/model-age"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def check_model_age(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    max_age_days: int = Query(30, ge=1, le=90, description="Max acceptable model age"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Check if models are outdated and need retraining

    Returns models in use and identifies those needing updates.

    Args:
        tenant_id: Tenant identifier taken from the URL path.
        max_age_days: Maximum acceptable model age in days (1-90, default 30).
        current_user: Authenticated user mapping; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of ``PerformanceMonitoringService.check_model_age``,
        passed through unchanged.

    Raises:
        HTTPException: Any HTTPException raised while serving the request is
            propagated as-is; every other failure is reported as a 500.
    """
    try:
        logger.info(
            "Checking model age",
            tenant_id=tenant_id,
            max_age_days=max_age_days,
            user_id=current_user.get("user_id")
        )

        service = PerformanceMonitoringService(db)

        analysis = await service.check_model_age(
            tenant_id=tenant_id,
            max_age_days=max_age_days
        )

        return analysis

    except HTTPException:
        # A deliberate HTTP error raised further down must keep its own
        # status code instead of being rewritten as a 500 below.
        raise
    except Exception as e:
        logger.error(
            "Failed to check model age",
            tenant_id=tenant_id,
            error=str(e)
        )
        # NOTE(review): the detail echoes the raw exception text to the
        # client; consider a generic message if internals must not leak.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to check model age: {str(e)}"
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.post(
    route_builder.build_base_route("monitoring/performance-report"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def generate_performance_report(
    request: PerformanceReportRequest,
    tenant_id: UUID = Path(..., description="Tenant ID"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Generate comprehensive performance report

    Combines accuracy summary, degradation analysis, and model age check
    with actionable recommendations.

    Args:
        request: Request body; only its ``days`` field is used.
        tenant_id: Tenant identifier taken from the URL path.
        current_user: Authenticated user mapping; only ``user_id`` is read
            here, for logging.
        db: Async database session handed to the monitoring service.

    Returns:
        The result of
        ``PerformanceMonitoringService.generate_performance_report``,
        passed through unchanged.

    Raises:
        HTTPException: Any HTTPException raised while serving the request is
            propagated as-is; every other failure is reported as a 500.
    """
    try:
        logger.info(
            "Generating performance report",
            tenant_id=tenant_id,
            days=request.days,
            user_id=current_user.get("user_id")
        )

        service = PerformanceMonitoringService(db)

        report = await service.generate_performance_report(
            tenant_id=tenant_id,
            days=request.days
        )

        return report

    except HTTPException:
        # A deliberate HTTP error raised further down must keep its own
        # status code instead of being rewritten as a 500 below.
        raise
    except Exception as e:
        logger.error(
            "Failed to generate performance report",
            tenant_id=tenant_id,
            error=str(e)
        )
        # NOTE(review): the detail echoes the raw exception text to the
        # client; consider a generic message if internals must not leak.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate performance report: {str(e)}"
        ) from e
|
||
|
|
|
||
|
|
|
||
|
|
@router.get(
    route_builder.build_base_route("monitoring/health"),
    status_code=status.HTTP_200_OK
)
@require_user_role(['admin', 'owner', 'member'])
async def get_health_status(
    tenant_id: UUID = Path(..., description="Tenant ID"),
    current_user: Dict[str, Any] = Depends(get_current_user_dep),
    db: AsyncSession = Depends(get_db)
):
    """
    Get quick health status for dashboards

    Returns simplified health metrics for UI display.

    Args:
        tenant_id: Tenant identifier taken from the URL path.
        current_user: Authenticated user mapping injected by the auth
            dependency; not read in this handler's body.
        db: Async database session handed to the monitoring service.

    Returns:
        A dict. When the 7-day summary reports ``status == "no_data"`` it
        holds ``status``/``health_status`` set to ``"unknown"`` plus a
        ``message``. Otherwise it holds ``status == "ok"`` together with
        ``health_status``, ``current_mape``, ``accuracy_percentage``,
        ``validation_coverage`` and a ``last_7_days`` sub-dict; any value
        missing from the summary is returned as ``None``.

    Raises:
        HTTPException: Any HTTPException raised while serving the request is
            propagated as-is; every other failure is reported as a 500.
    """
    try:
        service = PerformanceMonitoringService(db)

        # Get 7-day summary for quick health check
        summary = await service.get_accuracy_summary(
            tenant_id=tenant_id,
            days=7
        )

        if summary.get("status") == "no_data":
            return {
                "status": "unknown",
                "message": "No recent validation data available",
                "health_status": "unknown"
            }

        # Read the metrics defensively, like every other field below, so a
        # summary without "average_metrics" yields nulls instead of a 500.
        average_metrics = summary.get("average_metrics") or {}

        return {
            "status": "ok",
            "health_status": summary.get("health_status"),
            "current_mape": average_metrics.get("mape"),
            "accuracy_percentage": average_metrics.get("accuracy_percentage"),
            "validation_coverage": summary.get("coverage_percentage"),
            "last_7_days": {
                "validation_runs": summary.get("validation_runs"),
                "forecasts_evaluated": summary.get("total_forecasts_evaluated")
            }
        }

    except HTTPException:
        # A deliberate HTTP error raised further down must keep its own
        # status code instead of being rewritten as a 500 below.
        raise
    except Exception as e:
        logger.error(
            "Failed to get health status",
            tenant_id=tenant_id,
            error=str(e)
        )
        # NOTE(review): the detail echoes the raw exception text to the
        # client; consider a generic message if internals must not leak.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to get health status: {str(e)}"
        ) from e
|