411 lines
14 KiB
Python
411 lines
14 KiB
Python
|
|
"""
|
||
|
|
Monitoring and Observability Endpoints
|
||
|
|
Real-time service monitoring and diagnostics
|
||
|
|
"""
|
||
|
|
|
||
|
|
from fastapi import APIRouter, Query
|
||
|
|
from typing import Dict, Any, List, Optional
|
||
|
|
from datetime import datetime, timezone, timedelta
|
||
|
|
from sqlalchemy import text, func
|
||
|
|
import logging
|
||
|
|
|
||
|
|
from app.core.database import database_manager
|
||
|
|
from app.utils.circuit_breaker import circuit_breaker_registry
|
||
|
|
from app.models.training import ModelTrainingLog, TrainingJobQueue, TrainedModel
|
||
|
|
|
||
|
|
# Module-level logger, namespaced to this module per logging convention.
logger = logging.getLogger(__name__)

# Router collecting the /monitoring/* endpoints; presumably included by the
# main FastAPI app elsewhere — confirm against the app factory.
router = APIRouter()
||
|
|
@router.get("/monitoring/circuit-breakers")
async def get_circuit_breaker_status() -> Dict[str, Any]:
    """
    Report the current state of every registered circuit breaker.

    Useful for monitoring external service health.
    """
    states = circuit_breaker_registry.get_all_states()

    # Tally breakers per state in a single pass for the summary section.
    tally = {"open": 0, "half_open": 0, "closed": 0}
    for info in states.values():
        if info["state"] in tally:
            tally[info["state"]] += 1

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "circuit_breakers": states,
        "summary": {
            "total": len(states),
            "open": tally["open"],
            "half_open": tally["half_open"],
            "closed": tally["closed"],
        },
    }
|
|
||
|
|
@router.post("/monitoring/circuit-breakers/{name}/reset")
async def reset_circuit_breaker(name: str) -> Dict[str, str]:
    """
    Manually reset a circuit breaker.

    Use with caution - only reset if you know the service has recovered.
    """
    circuit_breaker_registry.reset(name)

    response = {
        "status": "success",
        "message": f"Circuit breaker '{name}' has been reset",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    return response
||
|
|
|
||
|
|
@router.get("/monitoring/training-jobs")
async def get_training_job_stats(
    hours: int = Query(default=24, ge=1, le=168, description="Look back period in hours")
) -> Dict[str, Any]:
    """
    Get training job statistics for the specified period.

    Returns job counts grouped by status, the failure rate, the average
    wall-clock duration of completed jobs, and the ten most recent jobs.
    On any error this returns a 200 payload with an ``error`` key rather
    than raising, so the monitoring endpoint itself never fails hard.
    """
    try:
        since = datetime.now(timezone.utc) - timedelta(hours=hours)

        async with database_manager.get_session() as session:
            # Job counts by status within the look-back window.
            result = await session.execute(
                text("""
                    SELECT status, COUNT(*) as count
                    FROM model_training_logs
                    WHERE created_at >= :since
                    GROUP BY status
                """),
                {"since": since}
            )
            status_counts = dict(result.fetchall())

            # Average training time (seconds) for completed jobs.
            result = await session.execute(
                text("""
                    SELECT AVG(EXTRACT(EPOCH FROM (end_time - start_time))) as avg_duration
                    FROM model_training_logs
                    WHERE status = 'completed'
                    AND created_at >= :since
                    AND end_time IS NOT NULL
                """),
                {"since": since}
            )
            avg_duration = result.scalar()

            # Failure rate as a percentage of all jobs in the window.
            total = sum(status_counts.values())
            failed = status_counts.get('failed', 0)
            failure_rate = (failed / total * 100) if total > 0 else 0

            # Ten most recent jobs for quick inspection.
            result = await session.execute(
                text("""
                    SELECT job_id, tenant_id, status, progress, start_time, end_time
                    FROM model_training_logs
                    WHERE created_at >= :since
                    ORDER BY created_at DESC
                    LIMIT 10
                """),
                {"since": since}
            )
            recent_jobs = [
                {
                    "job_id": row.job_id,
                    "tenant_id": str(row.tenant_id),
                    "status": row.status,
                    "progress": row.progress,
                    "start_time": row.start_time.isoformat() if row.start_time else None,
                    "end_time": row.end_time.isoformat() if row.end_time else None
                }
                for row in result.fetchall()
            ]

            return {
                "period_hours": hours,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "summary": {
                    "total_jobs": total,
                    "by_status": status_counts,
                    "failure_rate_percent": round(failure_rate, 2),
                    # `is not None` (not truthiness) so a legitimate 0.0-second
                    # average is reported as 0.0 rather than collapsed to None.
                    "avg_duration_seconds": round(avg_duration, 2) if avg_duration is not None else None
                },
                "recent_jobs": recent_jobs
            }

    except Exception as e:
        logger.error(f"Failed to get training job stats: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
||
|
|
|
||
|
|
@router.get("/monitoring/models")
async def get_model_stats() -> Dict[str, Any]:
    """
    Get statistics about trained models.

    Returns overall counts (total / active / production / created today),
    counts grouped by model type, and the mean MAPE across active models.
    On any error this returns a 200 payload with an ``error`` key rather
    than raising.
    """
    try:
        async with database_manager.get_session() as session:
            # Total models ever recorded.
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models")
            )
            total_models = result.scalar()

            # Models currently flagged active.
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models WHERE is_active = true")
            )
            active_models = result.scalar()

            # Models promoted to production.
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models WHERE is_production = true")
            )
            production_models = result.scalar()

            # Breakdown by model type.
            result = await session.execute(
                text("""
                    SELECT model_type, COUNT(*) as count
                    FROM trained_models
                    GROUP BY model_type
                """)
            )
            models_by_type = dict(result.fetchall())

            # Mean MAPE across active models that have one recorded.
            result = await session.execute(
                text("""
                    SELECT AVG(mape) as avg_mape
                    FROM trained_models
                    WHERE mape IS NOT NULL
                    AND is_active = true
                """)
            )
            avg_mape = result.scalar()

            # Models created since midnight UTC today.
            today = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
            result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM trained_models
                    WHERE created_at >= :today
                """),
                {"today": today}
            )
            models_today = result.scalar()

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "summary": {
                    "total_models": total_models,
                    "active_models": active_models,
                    "production_models": production_models,
                    "models_created_today": models_today,
                    # `is not None` (not truthiness) so a perfect 0.0 MAPE is
                    # reported as 0.0 rather than collapsed to None.
                    "average_mape_percent": round(avg_mape, 2) if avg_mape is not None else None
                },
                "by_type": models_by_type
            }

    except Exception as e:
        logger.error(f"Failed to get model stats: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
||
|
|
|
||
|
|
@router.get("/monitoring/queue")
async def get_queue_status() -> Dict[str, Any]:
    """
    Get training job queue status.
    """
    try:
        async with database_manager.get_session() as session:
            # Jobs still waiting to be picked up.
            queued_count = (await session.execute(
                text("""
                    SELECT COUNT(*) FROM training_job_queue
                    WHERE status = 'queued'
                """)
            )).scalar()

            # Jobs currently executing.
            running_count = (await session.execute(
                text("""
                    SELECT COUNT(*) FROM training_job_queue
                    WHERE status = 'running'
                """)
            )).scalar()

            # Enqueue time of the longest-waiting queued job, if any.
            oldest = (await session.execute(
                text("""
                    SELECT created_at FROM training_job_queue
                    WHERE status = 'queued'
                    ORDER BY created_at ASC
                    LIMIT 1
                """)
            )).scalar()

            # NOTE(review): assumes created_at comes back timezone-aware;
            # subtracting a naive value from an aware `now` would raise — confirm.
            wait_seconds = (
                (datetime.now(timezone.utc) - oldest).total_seconds() if oldest else 0
            )

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "queue": {
                    "queued": queued_count,
                    "running": running_count,
                    "oldest_wait_time_seconds": round(wait_seconds, 2) if oldest else 0,
                    "oldest_queued_at": oldest.isoformat() if oldest else None
                }
            }

    except Exception as e:
        logger.error(f"Failed to get queue status: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
|
||
|
|
|
||
|
|
@router.get("/monitoring/performance")
async def get_performance_metrics(
    tenant_id: Optional[str] = Query(None, description="Filter by tenant ID")
) -> Dict[str, Any]:
    """
    Get model performance metrics.

    Optionally filtered to a single tenant. Returns aggregate error
    statistics (MAPE / MAE / RMSE) plus a bucketed accuracy distribution.
    On any error this returns a 200 payload with an ``error`` key rather
    than raising.
    """
    def _round2(value):
        # Round to two decimals while preserving a legitimate 0 — only an
        # SQL NULL (None) means "no data". The previous truthiness check
        # collapsed 0 values (notably min_mape) to None.
        return round(value, 2) if value is not None else None

    try:
        async with database_manager.get_session() as session:
            query_params = {}
            # where_clause is only ever one of these two fixed strings; the
            # tenant value itself is always bound as a parameter, so the
            # f-string SQL below is not injectable.
            where_clause = ""
            if tenant_id:
                where_clause = "WHERE tenant_id = :tenant_id"
                query_params["tenant_id"] = tenant_id

            # Aggregate error statistics over the (optionally filtered) metrics.
            result = await session.execute(
                text(f"""
                    SELECT
                        COUNT(*) as total,
                        AVG(mape) as avg_mape,
                        MIN(mape) as min_mape,
                        MAX(mape) as max_mape,
                        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY mape) as median_mape,
                        AVG(mae) as avg_mae,
                        AVG(rmse) as avg_rmse
                    FROM model_performance_metrics
                    {where_clause}
                """),
                query_params
            )
            stats = result.fetchone()

            # Bucket metrics into accuracy categories by MAPE threshold.
            result = await session.execute(
                text(f"""
                    SELECT
                        CASE
                            WHEN mape <= 10 THEN 'excellent'
                            WHEN mape <= 20 THEN 'good'
                            WHEN mape <= 30 THEN 'acceptable'
                            ELSE 'poor'
                        END as accuracy_category,
                        COUNT(*) as count
                    FROM model_performance_metrics
                    {where_clause}
                    GROUP BY accuracy_category
                """),
                query_params
            )
            distribution = dict(result.fetchall())

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "tenant_id": tenant_id,
                "statistics": {
                    "total_metrics": stats.total if stats else 0,
                    "avg_mape_percent": _round2(stats.avg_mape) if stats else None,
                    "min_mape_percent": _round2(stats.min_mape) if stats else None,
                    "max_mape_percent": _round2(stats.max_mape) if stats else None,
                    "median_mape_percent": _round2(stats.median_mape) if stats else None,
                    "avg_mae": _round2(stats.avg_mae) if stats else None,
                    "avg_rmse": _round2(stats.avg_rmse) if stats else None
                },
                "distribution": distribution
            }

    except Exception as e:
        logger.error(f"Failed to get performance metrics: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
||
|
|
|
||
|
|
@router.get("/monitoring/alerts")
async def get_alerts() -> Dict[str, Any]:
    """
    Get active alerts and warnings based on system state.
    """
    alerts: List[Dict[str, Any]] = []
    warnings: List[Dict[str, Any]] = []

    try:
        # Open breakers are alerts; half-open (recovering) ones are warnings.
        for breaker_name, breaker_state in circuit_breaker_registry.get_all_states().items():
            current = breaker_state["state"]
            if current == "open":
                alerts.append({
                    "type": "circuit_breaker_open",
                    "severity": "high",
                    "message": f"Circuit breaker '{breaker_name}' is OPEN - service unavailable",
                    "details": breaker_state
                })
            elif current == "half_open":
                warnings.append({
                    "type": "circuit_breaker_recovering",
                    "severity": "medium",
                    "message": f"Circuit breaker '{breaker_name}' is recovering",
                    "details": breaker_state
                })

        # Warn when the training queue builds up a backlog.
        async with database_manager.get_session() as session:
            result = await session.execute(
                text("SELECT COUNT(*) FROM training_job_queue WHERE status = 'queued'")
            )
            pending = result.scalar()

        if pending > 10:
            warnings.append({
                "type": "queue_backlog",
                "severity": "medium",
                "message": f"Training queue has {pending} pending jobs",
                "count": pending
            })

    except Exception as e:
        logger.error(f"Failed to generate alerts: {e}")
        alerts.append({
            "type": "monitoring_error",
            "severity": "high",
            "message": f"Failed to check system alerts: {str(e)}"
        })

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "summary": {
            "total_alerts": len(alerts),
            "total_warnings": len(warnings)
        },
        "alerts": alerts,
        "warnings": warnings
    }