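Improve metrics: replace the Prometheus metrics server and /metrics endpoint with OpenTelemetry OTLP export and system metrics collection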
This commit is contained in:
@@ -15,6 +15,7 @@ from app.api import demo_sessions, demo_accounts, demo_operations, internal
|
||||
from shared.redis_utils import initialize_redis, close_redis
|
||||
from shared.monitoring.logging import setup_logging
|
||||
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
|
||||
# OpenTelemetry imports
|
||||
from opentelemetry import trace
|
||||
@@ -69,9 +70,12 @@ async def lifespan(app: FastAPI):
|
||||
max_connections=50
|
||||
)
|
||||
|
||||
# Start metrics server
|
||||
metrics_collector.start_metrics_server(8080)
|
||||
logger.info("Metrics server started on port 8080")
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("demo-session")
|
||||
logger.info("System metrics collection started")
|
||||
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
|
||||
logger.info("Metrics export configured via OpenTelemetry OTLP")
|
||||
|
||||
logger.info("Demo Session Service started successfully")
|
||||
|
||||
@@ -164,13 +168,8 @@ async def health():
|
||||
}
|
||||
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
"""
|
||||
Prometheus metrics for demo session service
|
||||
"""
|
||||
|
||||
from prometheus_client import Counter, Histogram, Gauge
|
||||
|
||||
# Counters
|
||||
demo_sessions_created_total = Counter(
|
||||
'demo_sessions_created_total',
|
||||
'Total number of demo sessions created',
|
||||
['tier', 'status']
|
||||
)
|
||||
|
||||
demo_sessions_deleted_total = Counter(
|
||||
'demo_sessions_deleted_total',
|
||||
'Total number of demo sessions deleted',
|
||||
['tier', 'status']
|
||||
)
|
||||
|
||||
demo_cloning_errors_total = Counter(
|
||||
'demo_cloning_errors_total',
|
||||
'Total number of cloning errors',
|
||||
['tier', 'service', 'error_type']
|
||||
)
|
||||
|
||||
# Histograms (for latency percentiles)
|
||||
demo_session_creation_duration_seconds = Histogram(
|
||||
'demo_session_creation_duration_seconds',
|
||||
'Duration of demo session creation',
|
||||
['tier'],
|
||||
buckets=[1, 2, 5, 7, 10, 12, 15, 18, 20, 25, 30, 40, 50, 60]
|
||||
)
|
||||
|
||||
demo_service_clone_duration_seconds = Histogram(
|
||||
'demo_service_clone_duration_seconds',
|
||||
'Duration of individual service cloning',
|
||||
['tier', 'service'],
|
||||
buckets=[0.5, 1, 2, 3, 5, 10, 15, 20, 30, 40, 50]
|
||||
)
|
||||
|
||||
demo_session_cleanup_duration_seconds = Histogram(
|
||||
'demo_session_cleanup_duration_seconds',
|
||||
'Duration of demo session cleanup',
|
||||
['tier'],
|
||||
buckets=[0.5, 1, 2, 5, 10, 15, 20, 30]
|
||||
)
|
||||
|
||||
# Gauges
|
||||
demo_sessions_active = Gauge(
|
||||
'demo_sessions_active',
|
||||
'Number of currently active demo sessions',
|
||||
['tier']
|
||||
)
|
||||
|
||||
demo_sessions_pending_cleanup = Gauge(
|
||||
'demo_sessions_pending_cleanup',
|
||||
'Number of demo sessions pending cleanup'
|
||||
)
|
||||
|
||||
# Alert generation metrics
|
||||
demo_alerts_generated_total = Counter(
|
||||
'demo_alerts_generated_total',
|
||||
'Total number of alerts generated post-clone',
|
||||
['tier', 'alert_type']
|
||||
)
|
||||
|
||||
demo_ai_insights_generated_total = Counter(
|
||||
'demo_ai_insights_generated_total',
|
||||
'Total number of AI insights generated post-clone',
|
||||
['tier', 'insight_type']
|
||||
)
|
||||
|
||||
# Cross-service metrics
|
||||
demo_cross_service_calls_total = Counter(
|
||||
'demo_cross_service_calls_total',
|
||||
'Total number of cross-service API calls during cloning',
|
||||
['source_service', 'target_service', 'status']
|
||||
)
|
||||
|
||||
demo_cross_service_call_duration_seconds = Histogram(
|
||||
'demo_cross_service_call_duration_seconds',
|
||||
'Duration of cross-service API calls during cloning',
|
||||
['source_service', 'target_service'],
|
||||
buckets=[0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30]
|
||||
)
|
||||
@@ -14,11 +14,6 @@ import os
|
||||
from app.models import DemoSession, DemoSessionStatus
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from app.core.redis_wrapper import DemoRedisWrapper
|
||||
from app.monitoring.metrics import (
|
||||
demo_sessions_deleted_total,
|
||||
demo_session_cleanup_duration_seconds,
|
||||
demo_sessions_active
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@@ -15,17 +15,6 @@ from shared.clients.inventory_client import InventoryServiceClient
|
||||
from shared.clients.production_client import ProductionServiceClient
|
||||
from shared.clients.procurement_client import ProcurementServiceClient
|
||||
from shared.config.base import BaseServiceSettings
|
||||
from app.monitoring.metrics import (
|
||||
demo_sessions_created_total,
|
||||
demo_session_creation_duration_seconds,
|
||||
demo_service_clone_duration_seconds,
|
||||
demo_cloning_errors_total,
|
||||
demo_sessions_active,
|
||||
demo_alerts_generated_total,
|
||||
demo_ai_insights_generated_total,
|
||||
demo_cross_service_calls_total,
|
||||
demo_cross_service_call_duration_seconds
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user