# bakery-ia/services/training/app/api/monitoring.py
"""
Monitoring and Observability Endpoints
Real-time service monitoring and diagnostics
"""
from fastapi import APIRouter, Query
from typing import Dict, Any, List, Optional
from datetime import datetime, timezone, timedelta
from sqlalchemy import text, func
import logging
from app.core.database import database_manager
from app.utils.circuit_breaker import circuit_breaker_registry
from app.models.training import ModelTrainingLog, TrainingJobQueue, TrainedModel
logger = logging.getLogger(__name__)
router = APIRouter()
@router.get("/monitoring/circuit-breakers")
async def get_circuit_breaker_status() -> Dict[str, Any]:
    """
    Report the state of every registered circuit breaker.

    Returns the raw per-breaker state map plus a summary with counts of
    open / half-open / closed breakers, for external-service health checks.
    """
    states = circuit_breaker_registry.get_all_states()

    # Tally breakers by state in a single pass instead of three scans.
    counts = {"open": 0, "half_open": 0, "closed": 0}
    for info in states.values():
        if info["state"] in counts:
            counts[info["state"]] += 1

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "circuit_breakers": states,
        "summary": {
            "total": len(states),
            "open": counts["open"],
            "half_open": counts["half_open"],
            "closed": counts["closed"],
        },
    }
@router.post("/monitoring/circuit-breakers/{name}/reset")
async def reset_circuit_breaker(name: str) -> Dict[str, str]:
    """
    Manually force a circuit breaker back to its initial state.

    Use with caution - only reset if you know the service has recovered.
    """
    circuit_breaker_registry.reset(name)
    now = datetime.now(timezone.utc).isoformat()
    return {
        "status": "success",
        "message": f"Circuit breaker '{name}' has been reset",
        "timestamp": now,
    }
@router.get("/monitoring/training-jobs")
async def get_training_job_stats(
    hours: int = Query(default=24, ge=1, le=168, description="Look back period in hours")
) -> Dict[str, Any]:
    """
    Get training job statistics for the specified period.

    Returns job counts by status, the average duration (seconds) of
    completed jobs, the failure rate, and the 10 most recent jobs.
    On any error this endpoint returns an {"error": ...} body rather
    than raising, so the monitoring endpoint itself never 500s.
    """
    try:
        since = datetime.now(timezone.utc) - timedelta(hours=hours)
        async with database_manager.get_session() as session:
            # Job counts grouped by status within the look-back window.
            result = await session.execute(
                text("""
                    SELECT status, COUNT(*) as count
                    FROM model_training_logs
                    WHERE created_at >= :since
                    GROUP BY status
                """),
                {"since": since}
            )
            status_counts = dict(result.fetchall())

            # Average wall-clock duration of completed jobs (seconds).
            result = await session.execute(
                text("""
                    SELECT AVG(EXTRACT(EPOCH FROM (end_time - start_time))) as avg_duration
                    FROM model_training_logs
                    WHERE status = 'completed'
                    AND created_at >= :since
                    AND end_time IS NOT NULL
                """),
                {"since": since}
            )
            avg_duration = result.scalar()

            # Failure rate over all jobs in the window.
            total = sum(status_counts.values())
            failed = status_counts.get('failed', 0)
            failure_rate = (failed / total * 100) if total > 0 else 0

            # Ten most recent jobs for quick inspection.
            result = await session.execute(
                text("""
                    SELECT job_id, tenant_id, status, progress, start_time, end_time
                    FROM model_training_logs
                    WHERE created_at >= :since
                    ORDER BY created_at DESC
                    LIMIT 10
                """),
                {"since": since}
            )
            recent_jobs = [
                {
                    "job_id": row.job_id,
                    "tenant_id": str(row.tenant_id),
                    "status": row.status,
                    "progress": row.progress,
                    "start_time": row.start_time.isoformat() if row.start_time else None,
                    "end_time": row.end_time.isoformat() if row.end_time else None
                }
                for row in result.fetchall()
            ]

            return {
                "period_hours": hours,
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "summary": {
                    "total_jobs": total,
                    "by_status": status_counts,
                    "failure_rate_percent": round(failure_rate, 2),
                    # `is not None` so a legitimate 0.0-second average is not
                    # silently reported as null (the old truthiness check hid it).
                    "avg_duration_seconds": round(avg_duration, 2) if avg_duration is not None else None
                },
                "recent_jobs": recent_jobs
            }
    except Exception as e:
        logger.error(f"Failed to get training job stats: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
@router.get("/monitoring/models")
async def get_model_stats() -> Dict[str, Any]:
    """
    Get statistics about trained models.

    Returns total/active/production counts, a per-type breakdown, the
    average MAPE across active models, and the number of models created
    today (UTC). Errors are returned as an {"error": ...} body.
    """
    try:
        async with database_manager.get_session() as session:
            # Total models ever recorded.
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models")
            )
            total_models = result.scalar()

            # Currently active models.
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models WHERE is_active = true")
            )
            active_models = result.scalar()

            # Models promoted to production.
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models WHERE is_production = true")
            )
            production_models = result.scalar()

            # Per-type breakdown.
            result = await session.execute(
                text("""
                    SELECT model_type, COUNT(*) as count
                    FROM trained_models
                    GROUP BY model_type
                """)
            )
            models_by_type = dict(result.fetchall())

            # Average MAPE across active models that have a recorded value.
            result = await session.execute(
                text("""
                    SELECT AVG(mape) as avg_mape
                    FROM trained_models
                    WHERE mape IS NOT NULL
                    AND is_active = true
                """)
            )
            avg_mape = result.scalar()

            # Models created since UTC midnight today.
            today = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
            result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM trained_models
                    WHERE created_at >= :today
                """),
                {"today": today}
            )
            models_today = result.scalar()

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "summary": {
                    "total_models": total_models,
                    "active_models": active_models,
                    "production_models": production_models,
                    "models_created_today": models_today,
                    # `is not None` so a perfect 0.0 MAPE is reported as 0.0,
                    # not hidden as null by the old truthiness check.
                    "average_mape_percent": round(avg_mape, 2) if avg_mape is not None else None
                },
                "by_type": models_by_type
            }
    except Exception as e:
        logger.error(f"Failed to get model stats: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
@router.get("/monitoring/queue")
async def get_queue_status() -> Dict[str, Any]:
    """
    Get training job queue status.

    Reports the number of queued and running jobs plus the age of the
    oldest queued entry. Errors come back as an {"error": ...} body.
    """
    try:
        async with database_manager.get_session() as session:
            # Count of jobs waiting to run.
            queued_result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM training_job_queue
                    WHERE status = 'queued'
                """)
            )
            queued_count = queued_result.scalar()

            # Count of jobs currently executing.
            running_result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM training_job_queue
                    WHERE status = 'running'
                """)
            )
            running_count = running_result.scalar()

            # Enqueue time of the longest-waiting job, if any.
            oldest_result = await session.execute(
                text("""
                    SELECT created_at FROM training_job_queue
                    WHERE status = 'queued'
                    ORDER BY created_at ASC
                    LIMIT 1
                """)
            )
            oldest_queued = oldest_result.scalar()

            # NOTE(review): this subtracts an aware UTC "now" from the DB
            # timestamp — assumes created_at comes back timezone-aware;
            # confirm the column type, else this raises TypeError.
            wait_seconds = (
                (datetime.now(timezone.utc) - oldest_queued).total_seconds()
                if oldest_queued
                else 0
            )

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "queue": {
                    "queued": queued_count,
                    "running": running_count,
                    "oldest_wait_time_seconds": round(wait_seconds, 2) if oldest_queued else 0,
                    "oldest_queued_at": oldest_queued.isoformat() if oldest_queued else None
                }
            }
    except Exception as e:
        logger.error(f"Failed to get queue status: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
@router.get("/monitoring/performance")
async def get_performance_metrics(
    tenant_id: Optional[str] = Query(None, description="Filter by tenant ID")
) -> Dict[str, Any]:
    """
    Get model performance metrics, optionally filtered by tenant.

    Returns aggregate MAPE/MAE/RMSE statistics and a bucketed accuracy
    distribution (excellent/good/acceptable/poor). Errors are returned
    as an {"error": ...} body.
    """
    try:
        async with database_manager.get_session() as session:
            query_params = {}
            where_clause = ""
            if tenant_id:
                # Safe despite the f-string below: the clause is a fixed
                # literal and tenant_id itself is a bound parameter.
                where_clause = "WHERE tenant_id = :tenant_id"
                query_params["tenant_id"] = tenant_id

            # Aggregate statistics over all matching metric rows.
            result = await session.execute(
                text(f"""
                    SELECT
                        COUNT(*) as total,
                        AVG(mape) as avg_mape,
                        MIN(mape) as min_mape,
                        MAX(mape) as max_mape,
                        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY mape) as median_mape,
                        AVG(mae) as avg_mae,
                        AVG(rmse) as avg_rmse
                    FROM model_performance_metrics
                    {where_clause}
                """),
                query_params
            )
            stats = result.fetchone()

            # Bucketed accuracy distribution.
            result = await session.execute(
                text(f"""
                    SELECT
                        CASE
                            WHEN mape <= 10 THEN 'excellent'
                            WHEN mape <= 20 THEN 'good'
                            WHEN mape <= 30 THEN 'acceptable'
                            ELSE 'poor'
                        END as accuracy_category,
                        COUNT(*) as count
                    FROM model_performance_metrics
                    {where_clause}
                    GROUP BY accuracy_category
                """),
                query_params
            )
            distribution = dict(result.fetchall())

            def _rounded(value: Optional[float]) -> Optional[float]:
                # `is not None` so legitimate 0.0 metrics (e.g. a perfect
                # min_mape) are reported as 0.0 instead of null — the old
                # truthiness checks hid them.
                return round(value, 2) if value is not None else None

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "tenant_id": tenant_id,
                "statistics": {
                    "total_metrics": stats.total if stats else 0,
                    "avg_mape_percent": _rounded(stats.avg_mape) if stats else None,
                    "min_mape_percent": _rounded(stats.min_mape) if stats else None,
                    "max_mape_percent": _rounded(stats.max_mape) if stats else None,
                    "median_mape_percent": _rounded(stats.median_mape) if stats else None,
                    "avg_mae": _rounded(stats.avg_mae) if stats else None,
                    "avg_rmse": _rounded(stats.avg_rmse) if stats else None
                },
                "distribution": distribution
            }
    except Exception as e:
        logger.error(f"Failed to get performance metrics: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
@router.get("/monitoring/alerts")
async def get_alerts() -> Dict[str, Any]:
    """
    Get active alerts and warnings based on system state.

    Checks circuit-breaker states and training-queue backlog. Monitoring
    failures are reported as an alert rather than an HTTP error.
    """
    alerts: List[Dict[str, Any]] = []
    warnings: List[Dict[str, Any]] = []
    try:
        # Open breakers are alerts; half-open (recovering) ones are warnings.
        for breaker_name, breaker_state in circuit_breaker_registry.get_all_states().items():
            current = breaker_state["state"]
            if current == "open":
                alerts.append({
                    "type": "circuit_breaker_open",
                    "severity": "high",
                    "message": f"Circuit breaker '{breaker_name}' is OPEN - service unavailable",
                    "details": breaker_state
                })
            elif current == "half_open":
                warnings.append({
                    "type": "circuit_breaker_recovering",
                    "severity": "medium",
                    "message": f"Circuit breaker '{breaker_name}' is recovering",
                    "details": breaker_state
                })

        # Warn when the training queue backlog exceeds 10 pending jobs.
        async with database_manager.get_session() as session:
            result = await session.execute(
                text("SELECT COUNT(*) FROM training_job_queue WHERE status = 'queued'")
            )
            queued = result.scalar()
            if queued > 10:
                warnings.append({
                    "type": "queue_backlog",
                    "severity": "medium",
                    "message": f"Training queue has {queued} pending jobs",
                    "count": queued
                })
    except Exception as e:
        logger.error(f"Failed to generate alerts: {e}")
        alerts.append({
            "type": "monitoring_error",
            "severity": "high",
            "message": f"Failed to check system alerts: {str(e)}"
        })

    return {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "summary": {
            "total_alerts": len(alerts),
            "total_warnings": len(warnings)
        },
        "alerts": alerts,
        "warnings": warnings
    }