Improve metrics

This commit is contained in:
Urtzi Alfaro
2026-01-08 20:48:24 +01:00
parent 29d19087f1
commit e8fda39e50
21 changed files with 615 additions and 3019 deletions

View File

@@ -15,6 +15,7 @@ from app.api import training_jobs, training_operations, models, health, monitori
from app.services.training_events import setup_messaging, cleanup_messaging
from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers
from shared.service_base import StandardFastAPIService
from shared.monitoring.system_metrics import SystemMetricsCollector
class TrainingService(StandardFastAPIService):
@@ -77,6 +78,11 @@ class TrainingService(StandardFastAPIService):
async def on_startup(self, app: FastAPI):
"""Custom startup logic including migration verification"""
await self.verify_migrations()
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("training")
self.logger.info("System metrics collection started")
self.logger.info("Training service startup completed")
async def on_shutdown(self, app: FastAPI):
@@ -132,12 +138,14 @@ class TrainingService(StandardFastAPIService):
def setup_custom_endpoints(self):
"""Setup custom endpoints for training service"""
@self.app.get("/metrics")
async def get_metrics():
"""Prometheus metrics endpoint"""
if self.metrics_collector:
return self.metrics_collector.get_metrics()
return {"status": "metrics not available"}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
# @self.app.get("/metrics")
# async def get_metrics():
# """Prometheus metrics endpoint"""
# if self.metrics_collector:
# return self.metrics_collector.get_metrics()
# return {"status": "metrics not available"}
@self.app.get("/")
async def root():