""" Monitoring and Observability Endpoints Real-time service monitoring and diagnostics """ from fastapi import APIRouter, Query from typing import Dict, Any, List, Optional from datetime import datetime, timezone, timedelta from sqlalchemy import text, func import logging from app.core.database import database_manager from app.utils.circuit_breaker import circuit_breaker_registry from app.models.training import ModelTrainingLog, TrainingJobQueue, TrainedModel logger = logging.getLogger(__name__) router = APIRouter() @router.get("/monitoring/circuit-breakers") async def get_circuit_breaker_status() -> Dict[str, Any]: """ Get status of all circuit breakers. Useful for monitoring external service health. """ breakers = circuit_breaker_registry.get_all_states() return { "timestamp": datetime.now(timezone.utc).isoformat(), "circuit_breakers": breakers, "summary": { "total": len(breakers), "open": sum(1 for b in breakers.values() if b["state"] == "open"), "half_open": sum(1 for b in breakers.values() if b["state"] == "half_open"), "closed": sum(1 for b in breakers.values() if b["state"] == "closed") } } @router.post("/monitoring/circuit-breakers/{name}/reset") async def reset_circuit_breaker(name: str) -> Dict[str, str]: """ Manually reset a circuit breaker. Use with caution - only reset if you know the service has recovered. """ circuit_breaker_registry.reset(name) return { "status": "success", "message": f"Circuit breaker '{name}' has been reset", "timestamp": datetime.now(timezone.utc).isoformat() } @router.get("/monitoring/training-jobs") async def get_training_job_stats( hours: int = Query(default=24, ge=1, le=168, description="Look back period in hours") ) -> Dict[str, Any]: """ Get training job statistics for the specified period. """ try: since = datetime.now(timezone.utc) - timedelta(hours=hours) async with database_manager.get_session() as session: # Get job counts by status result = await session.execute( text(""" SELECT status, COUNT(*) as count FROM model_training_logs WHERE created_at >= :since GROUP BY status """), {"since": since} ) status_counts = dict(result.fetchall()) # Get average training time for completed jobs result = await session.execute( text(""" SELECT AVG(EXTRACT(EPOCH FROM (end_time - start_time))) as avg_duration FROM model_training_logs WHERE status = 'completed' AND created_at >= :since AND end_time IS NOT NULL """), {"since": since} ) avg_duration = result.scalar() # Get failure rate total = sum(status_counts.values()) failed = status_counts.get('failed', 0) failure_rate = (failed / total * 100) if total > 0 else 0 # Get recent jobs result = await session.execute( text(""" SELECT job_id, tenant_id, status, progress, start_time, end_time FROM model_training_logs WHERE created_at >= :since ORDER BY created_at DESC LIMIT 10 """), {"since": since} ) recent_jobs = [ { "job_id": row.job_id, "tenant_id": str(row.tenant_id), "status": row.status, "progress": row.progress, "start_time": row.start_time.isoformat() if row.start_time else None, "end_time": row.end_time.isoformat() if row.end_time else None } for row in result.fetchall() ] return { "period_hours": hours, "timestamp": datetime.now(timezone.utc).isoformat(), "summary": { "total_jobs": total, "by_status": status_counts, "failure_rate_percent": round(failure_rate, 2), "avg_duration_seconds": round(avg_duration, 2) if avg_duration else None }, "recent_jobs": recent_jobs } except Exception as e: logger.error(f"Failed to get training job stats: {e}") return { "error": str(e), "timestamp": 

@router.get("/monitoring/models")
async def get_model_stats() -> Dict[str, Any]:
    """
    Get statistics about trained models.
    """
    try:
        async with database_manager.get_session() as session:
            # Total models
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models")
            )
            total_models = result.scalar()

            # Active models
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models WHERE is_active = true")
            )
            active_models = result.scalar()

            # Production models
            result = await session.execute(
                text("SELECT COUNT(*) FROM trained_models WHERE is_production = true")
            )
            production_models = result.scalar()

            # Models by type
            result = await session.execute(
                text("""
                    SELECT model_type, COUNT(*) as count
                    FROM trained_models
                    GROUP BY model_type
                """)
            )
            models_by_type = dict(result.fetchall())

            # Average model performance (MAPE) across active models
            result = await session.execute(
                text("""
                    SELECT AVG(mape) as avg_mape
                    FROM trained_models
                    WHERE mape IS NOT NULL AND is_active = true
                """)
            )
            avg_mape = result.scalar()

            # Models created today (UTC day boundary)
            today = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
            result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM trained_models
                    WHERE created_at >= :today
                """),
                {"today": today}
            )
            models_today = result.scalar()

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "summary": {
                    "total_models": total_models,
                    "active_models": active_models,
                    "production_models": production_models,
                    "models_created_today": models_today,
                    "average_mape_percent": round(avg_mape, 2) if avg_mape is not None else None
                },
                "by_type": models_by_type
            }
    except Exception as e:
        logger.error(f"Failed to get model stats: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }


@router.get("/monitoring/queue")
async def get_queue_status() -> Dict[str, Any]:
    """
    Get training job queue status.
    """
    try:
        async with database_manager.get_session() as session:
            # Queued jobs
            result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM training_job_queue
                    WHERE status = 'queued'
                """)
            )
            queued = result.scalar()

            # Running jobs
            result = await session.execute(
                text("""
                    SELECT COUNT(*) FROM training_job_queue
                    WHERE status = 'running'
                """)
            )
            running = result.scalar()

            # Oldest queued job
            result = await session.execute(
                text("""
                    SELECT created_at FROM training_job_queue
                    WHERE status = 'queued'
                    ORDER BY created_at ASC
                    LIMIT 1
                """)
            )
            oldest_queued = result.scalar()

            # Wait time of the oldest queued job. Some drivers return naive
            # timestamps depending on the column type, so normalize to UTC
            # before subtracting (assumes timestamps are stored in UTC).
            if oldest_queued:
                if oldest_queued.tzinfo is None:
                    oldest_queued = oldest_queued.replace(tzinfo=timezone.utc)
                wait_time_seconds = (datetime.now(timezone.utc) - oldest_queued).total_seconds()
            else:
                wait_time_seconds = 0

            return {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "queue": {
                    "queued": queued,
                    "running": running,
                    "oldest_wait_time_seconds": round(wait_time_seconds, 2),
                    "oldest_queued_at": oldest_queued.isoformat() if oldest_queued else None
                }
            }
    except Exception as e:
        logger.error(f"Failed to get queue status: {e}")
        return {
            "error": str(e),
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
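
# Illustrative response for GET /monitoring/queue. The shape mirrors the return
# statement above; the values are made up for the example.
#
#   {
#     "timestamp": "2024-01-01T12:00:00+00:00",
#     "queue": {
#       "queued": 3,
#       "running": 1,
#       "oldest_wait_time_seconds": 84.21,
#       "oldest_queued_at": "2024-01-01T11:58:35+00:00"
#     }
#   }
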
""" try: async with database_manager.get_session() as session: query_params = {} where_clause = "" if tenant_id: where_clause = "WHERE tenant_id = :tenant_id" query_params["tenant_id"] = tenant_id # Get performance distribution result = await session.execute( text(f""" SELECT COUNT(*) as total, AVG(mape) as avg_mape, MIN(mape) as min_mape, MAX(mape) as max_mape, PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY mape) as median_mape, AVG(mae) as avg_mae, AVG(rmse) as avg_rmse FROM model_performance_metrics {where_clause} """), query_params ) stats = result.fetchone() # Get accuracy distribution (buckets) result = await session.execute( text(f""" SELECT CASE WHEN mape <= 10 THEN 'excellent' WHEN mape <= 20 THEN 'good' WHEN mape <= 30 THEN 'acceptable' ELSE 'poor' END as accuracy_category, COUNT(*) as count FROM model_performance_metrics {where_clause} GROUP BY accuracy_category """), query_params ) distribution = dict(result.fetchall()) return { "timestamp": datetime.now(timezone.utc).isoformat(), "tenant_id": tenant_id, "statistics": { "total_metrics": stats.total if stats else 0, "avg_mape_percent": round(stats.avg_mape, 2) if stats and stats.avg_mape else None, "min_mape_percent": round(stats.min_mape, 2) if stats and stats.min_mape else None, "max_mape_percent": round(stats.max_mape, 2) if stats and stats.max_mape else None, "median_mape_percent": round(stats.median_mape, 2) if stats and stats.median_mape else None, "avg_mae": round(stats.avg_mae, 2) if stats and stats.avg_mae else None, "avg_rmse": round(stats.avg_rmse, 2) if stats and stats.avg_rmse else None }, "distribution": distribution } except Exception as e: logger.error(f"Failed to get performance metrics: {e}") return { "error": str(e), "timestamp": datetime.now(timezone.utc).isoformat() } @router.get("/monitoring/alerts") async def get_alerts() -> Dict[str, Any]: """ Get active alerts and warnings based on system state. """ alerts = [] warnings = [] try: # Check circuit breakers breakers = circuit_breaker_registry.get_all_states() for name, state in breakers.items(): if state["state"] == "open": alerts.append({ "type": "circuit_breaker_open", "severity": "high", "message": f"Circuit breaker '{name}' is OPEN - service unavailable", "details": state }) elif state["state"] == "half_open": warnings.append({ "type": "circuit_breaker_recovering", "severity": "medium", "message": f"Circuit breaker '{name}' is recovering", "details": state }) # Check queue backlog async with database_manager.get_session() as session: result = await session.execute( text("SELECT COUNT(*) FROM training_job_queue WHERE status = 'queued'") ) queued = result.scalar() if queued > 10: warnings.append({ "type": "queue_backlog", "severity": "medium", "message": f"Training queue has {queued} pending jobs", "count": queued }) except Exception as e: logger.error(f"Failed to generate alerts: {e}") alerts.append({ "type": "monitoring_error", "severity": "high", "message": f"Failed to check system alerts: {str(e)}" }) return { "timestamp": datetime.now(timezone.utc).isoformat(), "summary": { "total_alerts": len(alerts), "total_warnings": len(warnings) }, "alerts": alerts, "warnings": warnings }