Improve metrics

This commit is contained in:
Urtzi Alfaro
2026-01-08 20:48:24 +01:00
parent 29d19087f1
commit e8fda39e50
21 changed files with 615 additions and 3019 deletions

View File

@@ -15,6 +15,7 @@ from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports
from opentelemetry import trace
@@ -69,9 +70,12 @@ async def lifespan(app: FastAPI):
max_connections=50
)
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("demo-session")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
logger.info("Demo Session Service started successfully")
@@ -164,13 +168,8 @@ async def health():
}
@app.get("/metrics")
async def metrics():
    """Serve the collector's current metrics in the Prometheus text format."""
    # The payload is whatever the shared collector renders; the media type
    # below is passed through unchanged as the response content type.
    payload = metrics_collector.get_metrics()
    return Response(
        content=payload,
        media_type="text/plain; version=0.0.4; charset=utf-8",
    )
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":

View File

@@ -1,85 +0,0 @@
"""
Prometheus metrics for demo session service
"""
from prometheus_client import Counter, Histogram, Gauge
# Counters

# Demo sessions created, labelled by tier and status.
demo_sessions_created_total = Counter(
    'demo_sessions_created_total',
    'Total number of demo sessions created',
    labelnames=['tier', 'status'],
)

# Demo sessions deleted, labelled by tier and status.
demo_sessions_deleted_total = Counter(
    'demo_sessions_deleted_total',
    'Total number of demo sessions deleted',
    labelnames=['tier', 'status'],
)

# Cloning errors, labelled by tier, service and error type.
demo_cloning_errors_total = Counter(
    'demo_cloning_errors_total',
    'Total number of cloning errors',
    labelnames=['tier', 'service', 'error_type'],
)
# Histograms (for latency percentiles)

# Session creation duration per tier; explicit bucket bounds from 1 to 60.
demo_session_creation_duration_seconds = Histogram(
    'demo_session_creation_duration_seconds',
    'Duration of demo session creation',
    labelnames=['tier'],
    buckets=[1, 2, 5, 7, 10, 12, 15, 18, 20, 25, 30, 40, 50, 60],
)

# Per-service clone duration, labelled by tier and service; bounds 0.5 to 50.
demo_service_clone_duration_seconds = Histogram(
    'demo_service_clone_duration_seconds',
    'Duration of individual service cloning',
    labelnames=['tier', 'service'],
    buckets=[0.5, 1, 2, 3, 5, 10, 15, 20, 30, 40, 50],
)

# Session cleanup duration per tier; bounds 0.5 to 30.
demo_session_cleanup_duration_seconds = Histogram(
    'demo_session_cleanup_duration_seconds',
    'Duration of demo session cleanup',
    labelnames=['tier'],
    buckets=[0.5, 1, 2, 5, 10, 15, 20, 30],
)
# Gauges

# Currently active demo sessions, labelled by tier.
demo_sessions_active = Gauge(
    'demo_sessions_active',
    'Number of currently active demo sessions',
    labelnames=['tier'],
)

# Demo sessions pending cleanup; declared without labels.
demo_sessions_pending_cleanup = Gauge(
    'demo_sessions_pending_cleanup',
    'Number of demo sessions pending cleanup',
)
# Alert generation metrics

# Alerts generated post-clone, labelled by tier and alert type.
demo_alerts_generated_total = Counter(
    'demo_alerts_generated_total',
    'Total number of alerts generated post-clone',
    labelnames=['tier', 'alert_type'],
)

# AI insights generated post-clone, labelled by tier and insight type.
demo_ai_insights_generated_total = Counter(
    'demo_ai_insights_generated_total',
    'Total number of AI insights generated post-clone',
    labelnames=['tier', 'insight_type'],
)
# Cross-service metrics

# Cross-service API calls during cloning, labelled by source service,
# target service and status.
demo_cross_service_calls_total = Counter(
    'demo_cross_service_calls_total',
    'Total number of cross-service API calls during cloning',
    labelnames=['source_service', 'target_service', 'status'],
)

# Duration of those calls, labelled by source and target service;
# explicit bucket bounds from 0.1 to 30.
demo_cross_service_call_duration_seconds = Histogram(
    'demo_cross_service_call_duration_seconds',
    'Duration of cross-service API calls during cloning',
    labelnames=['source_service', 'target_service'],
    buckets=[0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30],
)

View File

@@ -14,11 +14,6 @@ import os
from app.models import DemoSession, DemoSessionStatus
from datetime import datetime, timezone, timedelta
from app.core.redis_wrapper import DemoRedisWrapper
from app.monitoring.metrics import (
demo_sessions_deleted_total,
demo_session_cleanup_duration_seconds,
demo_sessions_active
)
logger = structlog.get_logger()

View File

@@ -15,17 +15,6 @@ from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.config.base import BaseServiceSettings
from app.monitoring.metrics import (
demo_sessions_created_total,
demo_session_creation_duration_seconds,
demo_service_clone_duration_seconds,
demo_cloning_errors_total,
demo_sessions_active,
demo_alerts_generated_total,
demo_ai_insights_generated_total,
demo_cross_service_calls_total,
demo_cross_service_call_duration_seconds
)
logger = structlog.get_logger()