Improve metrics

This commit is contained in:
Urtzi Alfaro
2026-01-08 20:48:24 +01:00
parent 29d19087f1
commit e8fda39e50
21 changed files with 615 additions and 3019 deletions

View File

@@ -11,6 +11,7 @@ from app.core.database import init_db, close_db
from app.api import insights
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports
from opentelemetry import trace
@@ -56,9 +57,12 @@ async def lifespan(app: FastAPI):
await init_db()
logger.info("Database initialized")
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("ai-insights")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
yield
@@ -131,13 +135,8 @@ async def health_check():
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":

View File

@@ -16,6 +16,7 @@ from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports
from opentelemetry import trace
@@ -82,9 +83,12 @@ async def lifespan(app: FastAPI):
await consumer.start()
logger.info("alert_processor_started")
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("alert-processor")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
except Exception as e:
logger.error("alert_processor_startup_failed", error=str(e))
raise
@@ -175,13 +179,8 @@ async def root():
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":

View File

@@ -15,6 +15,7 @@ from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports
from opentelemetry import trace
@@ -69,9 +70,12 @@ async def lifespan(app: FastAPI):
max_connections=50
)
# Start metrics server
metrics_collector.start_metrics_server(8080)
logger.info("Metrics server started on port 8080")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("demo-session")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
logger.info("Demo Session Service started successfully")
@@ -164,13 +168,8 @@ async def health():
}
@app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
return Response(
content=metrics_collector.get_metrics(),
media_type="text/plain; version=0.0.4; charset=utf-8"
)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":

View File

@@ -1,85 +0,0 @@
"""
Prometheus metrics for demo session service
"""
from prometheus_client import Counter, Histogram, Gauge
# Counters
demo_sessions_created_total = Counter(
'demo_sessions_created_total',
'Total number of demo sessions created',
['tier', 'status']
)
demo_sessions_deleted_total = Counter(
'demo_sessions_deleted_total',
'Total number of demo sessions deleted',
['tier', 'status']
)
demo_cloning_errors_total = Counter(
'demo_cloning_errors_total',
'Total number of cloning errors',
['tier', 'service', 'error_type']
)
# Histograms (for latency percentiles)
demo_session_creation_duration_seconds = Histogram(
'demo_session_creation_duration_seconds',
'Duration of demo session creation',
['tier'],
buckets=[1, 2, 5, 7, 10, 12, 15, 18, 20, 25, 30, 40, 50, 60]
)
demo_service_clone_duration_seconds = Histogram(
'demo_service_clone_duration_seconds',
'Duration of individual service cloning',
['tier', 'service'],
buckets=[0.5, 1, 2, 3, 5, 10, 15, 20, 30, 40, 50]
)
demo_session_cleanup_duration_seconds = Histogram(
'demo_session_cleanup_duration_seconds',
'Duration of demo session cleanup',
['tier'],
buckets=[0.5, 1, 2, 5, 10, 15, 20, 30]
)
# Gauges
demo_sessions_active = Gauge(
'demo_sessions_active',
'Number of currently active demo sessions',
['tier']
)
demo_sessions_pending_cleanup = Gauge(
'demo_sessions_pending_cleanup',
'Number of demo sessions pending cleanup'
)
# Alert generation metrics
demo_alerts_generated_total = Counter(
'demo_alerts_generated_total',
'Total number of alerts generated post-clone',
['tier', 'alert_type']
)
demo_ai_insights_generated_total = Counter(
'demo_ai_insights_generated_total',
'Total number of AI insights generated post-clone',
['tier', 'insight_type']
)
# Cross-service metrics
demo_cross_service_calls_total = Counter(
'demo_cross_service_calls_total',
'Total number of cross-service API calls during cloning',
['source_service', 'target_service', 'status']
)
demo_cross_service_call_duration_seconds = Histogram(
'demo_cross_service_call_duration_seconds',
'Duration of cross-service API calls during cloning',
['source_service', 'target_service'],
buckets=[0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30]
)

View File

@@ -14,11 +14,6 @@ import os
from app.models import DemoSession, DemoSessionStatus
from datetime import datetime, timezone, timedelta
from app.core.redis_wrapper import DemoRedisWrapper
from app.monitoring.metrics import (
demo_sessions_deleted_total,
demo_session_cleanup_duration_seconds,
demo_sessions_active
)
logger = structlog.get_logger()

View File

@@ -15,17 +15,6 @@ from shared.clients.inventory_client import InventoryServiceClient
from shared.clients.production_client import ProductionServiceClient
from shared.clients.procurement_client import ProcurementServiceClient
from shared.config.base import BaseServiceSettings
from app.monitoring.metrics import (
demo_sessions_created_total,
demo_session_creation_duration_seconds,
demo_service_clone_duration_seconds,
demo_cloning_errors_total,
demo_sessions_active,
demo_alerts_generated_total,
demo_ai_insights_generated_total,
demo_cross_service_calls_total,
demo_cross_service_call_duration_seconds
)
logger = structlog.get_logger()

View File

@@ -22,6 +22,7 @@ from app.services.whatsapp_service import WhatsAppService
from app.consumers.po_event_consumer import POEventConsumer
from shared.service_base import StandardFastAPIService
from shared.clients.tenant_client import TenantServiceClient
from shared.monitoring.system_metrics import SystemMetricsCollector
import asyncio
@@ -184,6 +185,10 @@ class NotificationService(StandardFastAPIService):
self.email_service = EmailService()
self.whatsapp_service = WhatsAppService(tenant_client=self.tenant_client)
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("notification")
self.logger.info("System metrics collection started")
# Initialize SSE service
self.sse_service = SSEService()
await self.sse_service.initialize(settings.REDIS_URL)
@@ -271,12 +276,14 @@ class NotificationService(StandardFastAPIService):
return {"error": "SSE service not available"}
# Metrics endpoint
@self.app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
if self.metrics_collector:
return self.metrics_collector.get_metrics()
return {"metrics": "not_available"}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
# @self.app.get("/metrics")
# async def metrics():
# """Prometheus metrics endpoint"""
# if self.metrics_collector:
# return self.metrics_collector.get_metrics()
# return {"metrics": "not_available"}
# Create service instance

View File

@@ -9,6 +9,7 @@ from app.core.config import settings
from app.core.database import database_manager
from app.api import tenants, tenant_members, tenant_operations, webhooks, plans, subscription, tenant_settings, whatsapp_admin, usage_forecast, enterprise_upgrade, tenant_locations, tenant_hierarchy, internal_demo, network_alerts, onboarding
from shared.service_base import StandardFastAPIService
from shared.monitoring.system_metrics import SystemMetricsCollector
class TenantService(StandardFastAPIService):
@@ -77,6 +78,10 @@ class TenantService(StandardFastAPIService):
redis_client = await get_redis_client()
self.logger.info("Redis initialized successfully")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("tenant")
self.logger.info("System metrics collection started")
# Start usage tracking scheduler
from app.jobs.usage_tracking_scheduler import start_scheduler
await start_scheduler(self.database_manager, redis_client, settings)
@@ -108,12 +113,14 @@ class TenantService(StandardFastAPIService):
def setup_custom_endpoints(self):
"""Setup custom endpoints for tenant service"""
@self.app.get("/metrics")
async def metrics():
"""Prometheus metrics endpoint"""
if self.metrics_collector:
return self.metrics_collector.get_metrics()
return {"metrics": "not_available"}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
# @self.app.get("/metrics")
# async def metrics():
# """Prometheus metrics endpoint"""
# if self.metrics_collector:
# return self.metrics_collector.get_metrics()
# return {"metrics": "not_available"}
# Create service instance

View File

@@ -15,6 +15,7 @@ from app.api import training_jobs, training_operations, models, health, monitori
from app.services.training_events import setup_messaging, cleanup_messaging
from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers
from shared.service_base import StandardFastAPIService
from shared.monitoring.system_metrics import SystemMetricsCollector
class TrainingService(StandardFastAPIService):
@@ -77,6 +78,11 @@ class TrainingService(StandardFastAPIService):
async def on_startup(self, app: FastAPI):
"""Custom startup logic including migration verification"""
await self.verify_migrations()
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("training")
self.logger.info("System metrics collection started")
self.logger.info("Training service startup completed")
async def on_shutdown(self, app: FastAPI):
@@ -132,12 +138,14 @@ class TrainingService(StandardFastAPIService):
def setup_custom_endpoints(self):
"""Setup custom endpoints for training service"""
@self.app.get("/metrics")
async def get_metrics():
"""Prometheus metrics endpoint"""
if self.metrics_collector:
return self.metrics_collector.get_metrics()
return {"status": "metrics not available"}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
# @self.app.get("/metrics")
# async def get_metrics():
# """Prometheus metrics endpoint"""
# if self.metrics_collector:
# return self.metrics_collector.get_metrics()
# return {"status": "metrics not available"}
@self.app.get("/")
async def root():