Improve metrics
This commit is contained in:
@@ -11,6 +11,7 @@ from app.core.database import init_db, close_db
|
||||
from app.api import insights
|
||||
from shared.monitoring.logging import setup_logging
|
||||
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
|
||||
# OpenTelemetry imports
|
||||
from opentelemetry import trace
|
||||
@@ -56,9 +57,12 @@ async def lifespan(app: FastAPI):
|
||||
await init_db()
|
||||
logger.info("Database initialized")
|
||||
|
||||
# Start metrics server
|
||||
metrics_collector.start_metrics_server(8080)
|
||||
logger.info("Metrics server started on port 8080")
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("ai-insights")
|
||||
logger.info("System metrics collection started")
|
||||
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
|
||||
logger.info("Metrics export configured via OpenTelemetry OTLP")
|
||||
|
||||
yield
|
||||
|
||||
@@ -131,13 +135,8 @@ async def health_check():
|
||||
}
|
||||
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -16,6 +16,7 @@ from app.api import alerts, sse
|
||||
from shared.redis_utils import initialize_redis, close_redis
|
||||
from shared.monitoring.logging import setup_logging
|
||||
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
|
||||
# OpenTelemetry imports
|
||||
from opentelemetry import trace
|
||||
@@ -82,9 +83,12 @@ async def lifespan(app: FastAPI):
|
||||
await consumer.start()
|
||||
logger.info("alert_processor_started")
|
||||
|
||||
# Start metrics server
|
||||
metrics_collector.start_metrics_server(8080)
|
||||
logger.info("Metrics server started on port 8080")
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("alert-processor")
|
||||
logger.info("System metrics collection started")
|
||||
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
|
||||
logger.info("Metrics export configured via OpenTelemetry OTLP")
|
||||
except Exception as e:
|
||||
logger.error("alert_processor_startup_failed", error=str(e))
|
||||
raise
|
||||
@@ -175,13 +179,8 @@ async def root():
|
||||
}
|
||||
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -15,6 +15,7 @@ from app.api import demo_sessions, demo_accounts, demo_operations, internal
|
||||
from shared.redis_utils import initialize_redis, close_redis
|
||||
from shared.monitoring.logging import setup_logging
|
||||
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
|
||||
# OpenTelemetry imports
|
||||
from opentelemetry import trace
|
||||
@@ -69,9 +70,12 @@ async def lifespan(app: FastAPI):
|
||||
max_connections=50
|
||||
)
|
||||
|
||||
# Start metrics server
|
||||
metrics_collector.start_metrics_server(8080)
|
||||
logger.info("Metrics server started on port 8080")
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("demo-session")
|
||||
logger.info("System metrics collection started")
|
||||
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
|
||||
logger.info("Metrics export configured via OpenTelemetry OTLP")
|
||||
|
||||
logger.info("Demo Session Service started successfully")
|
||||
|
||||
@@ -164,13 +168,8 @@ async def health():
|
||||
}
|
||||
|
||||
|
||||
@app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
return Response(
|
||||
content=metrics_collector.get_metrics(),
|
||||
media_type="text/plain; version=0.0.4; charset=utf-8"
|
||||
)
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
"""
|
||||
Prometheus metrics for demo session service
|
||||
"""
|
||||
|
||||
from prometheus_client import Counter, Histogram, Gauge
|
||||
|
||||
# Counters
|
||||
demo_sessions_created_total = Counter(
|
||||
'demo_sessions_created_total',
|
||||
'Total number of demo sessions created',
|
||||
['tier', 'status']
|
||||
)
|
||||
|
||||
demo_sessions_deleted_total = Counter(
|
||||
'demo_sessions_deleted_total',
|
||||
'Total number of demo sessions deleted',
|
||||
['tier', 'status']
|
||||
)
|
||||
|
||||
demo_cloning_errors_total = Counter(
|
||||
'demo_cloning_errors_total',
|
||||
'Total number of cloning errors',
|
||||
['tier', 'service', 'error_type']
|
||||
)
|
||||
|
||||
# Histograms (for latency percentiles)
|
||||
demo_session_creation_duration_seconds = Histogram(
|
||||
'demo_session_creation_duration_seconds',
|
||||
'Duration of demo session creation',
|
||||
['tier'],
|
||||
buckets=[1, 2, 5, 7, 10, 12, 15, 18, 20, 25, 30, 40, 50, 60]
|
||||
)
|
||||
|
||||
demo_service_clone_duration_seconds = Histogram(
|
||||
'demo_service_clone_duration_seconds',
|
||||
'Duration of individual service cloning',
|
||||
['tier', 'service'],
|
||||
buckets=[0.5, 1, 2, 3, 5, 10, 15, 20, 30, 40, 50]
|
||||
)
|
||||
|
||||
demo_session_cleanup_duration_seconds = Histogram(
|
||||
'demo_session_cleanup_duration_seconds',
|
||||
'Duration of demo session cleanup',
|
||||
['tier'],
|
||||
buckets=[0.5, 1, 2, 5, 10, 15, 20, 30]
|
||||
)
|
||||
|
||||
# Gauges
|
||||
demo_sessions_active = Gauge(
|
||||
'demo_sessions_active',
|
||||
'Number of currently active demo sessions',
|
||||
['tier']
|
||||
)
|
||||
|
||||
demo_sessions_pending_cleanup = Gauge(
|
||||
'demo_sessions_pending_cleanup',
|
||||
'Number of demo sessions pending cleanup'
|
||||
)
|
||||
|
||||
# Alert generation metrics
|
||||
demo_alerts_generated_total = Counter(
|
||||
'demo_alerts_generated_total',
|
||||
'Total number of alerts generated post-clone',
|
||||
['tier', 'alert_type']
|
||||
)
|
||||
|
||||
demo_ai_insights_generated_total = Counter(
|
||||
'demo_ai_insights_generated_total',
|
||||
'Total number of AI insights generated post-clone',
|
||||
['tier', 'insight_type']
|
||||
)
|
||||
|
||||
# Cross-service metrics
|
||||
demo_cross_service_calls_total = Counter(
|
||||
'demo_cross_service_calls_total',
|
||||
'Total number of cross-service API calls during cloning',
|
||||
['source_service', 'target_service', 'status']
|
||||
)
|
||||
|
||||
demo_cross_service_call_duration_seconds = Histogram(
|
||||
'demo_cross_service_call_duration_seconds',
|
||||
'Duration of cross-service API calls during cloning',
|
||||
['source_service', 'target_service'],
|
||||
buckets=[0.1, 0.2, 0.5, 1, 2, 5, 10, 15, 20, 30]
|
||||
)
|
||||
@@ -14,11 +14,6 @@ import os
|
||||
from app.models import DemoSession, DemoSessionStatus
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from app.core.redis_wrapper import DemoRedisWrapper
|
||||
from app.monitoring.metrics import (
|
||||
demo_sessions_deleted_total,
|
||||
demo_session_cleanup_duration_seconds,
|
||||
demo_sessions_active
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@@ -15,17 +15,6 @@ from shared.clients.inventory_client import InventoryServiceClient
|
||||
from shared.clients.production_client import ProductionServiceClient
|
||||
from shared.clients.procurement_client import ProcurementServiceClient
|
||||
from shared.config.base import BaseServiceSettings
|
||||
from app.monitoring.metrics import (
|
||||
demo_sessions_created_total,
|
||||
demo_session_creation_duration_seconds,
|
||||
demo_service_clone_duration_seconds,
|
||||
demo_cloning_errors_total,
|
||||
demo_sessions_active,
|
||||
demo_alerts_generated_total,
|
||||
demo_ai_insights_generated_total,
|
||||
demo_cross_service_calls_total,
|
||||
demo_cross_service_call_duration_seconds
|
||||
)
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ from app.services.whatsapp_service import WhatsAppService
|
||||
from app.consumers.po_event_consumer import POEventConsumer
|
||||
from shared.service_base import StandardFastAPIService
|
||||
from shared.clients.tenant_client import TenantServiceClient
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
import asyncio
|
||||
|
||||
|
||||
@@ -184,6 +185,10 @@ class NotificationService(StandardFastAPIService):
|
||||
self.email_service = EmailService()
|
||||
self.whatsapp_service = WhatsAppService(tenant_client=self.tenant_client)
|
||||
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("notification")
|
||||
self.logger.info("System metrics collection started")
|
||||
|
||||
# Initialize SSE service
|
||||
self.sse_service = SSEService()
|
||||
await self.sse_service.initialize(settings.REDIS_URL)
|
||||
@@ -271,12 +276,14 @@ class NotificationService(StandardFastAPIService):
|
||||
return {"error": "SSE service not available"}
|
||||
|
||||
# Metrics endpoint
|
||||
@self.app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
if self.metrics_collector:
|
||||
return self.metrics_collector.get_metrics()
|
||||
return {"metrics": "not_available"}
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
# @self.app.get("/metrics")
|
||||
# async def metrics():
|
||||
# """Prometheus metrics endpoint"""
|
||||
# if self.metrics_collector:
|
||||
# return self.metrics_collector.get_metrics()
|
||||
# return {"metrics": "not_available"}
|
||||
|
||||
|
||||
# Create service instance
|
||||
|
||||
@@ -9,6 +9,7 @@ from app.core.config import settings
|
||||
from app.core.database import database_manager
|
||||
from app.api import tenants, tenant_members, tenant_operations, webhooks, plans, subscription, tenant_settings, whatsapp_admin, usage_forecast, enterprise_upgrade, tenant_locations, tenant_hierarchy, internal_demo, network_alerts, onboarding
|
||||
from shared.service_base import StandardFastAPIService
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
|
||||
|
||||
class TenantService(StandardFastAPIService):
|
||||
@@ -77,6 +78,10 @@ class TenantService(StandardFastAPIService):
|
||||
redis_client = await get_redis_client()
|
||||
self.logger.info("Redis initialized successfully")
|
||||
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("tenant")
|
||||
self.logger.info("System metrics collection started")
|
||||
|
||||
# Start usage tracking scheduler
|
||||
from app.jobs.usage_tracking_scheduler import start_scheduler
|
||||
await start_scheduler(self.database_manager, redis_client, settings)
|
||||
@@ -108,12 +113,14 @@ class TenantService(StandardFastAPIService):
|
||||
|
||||
def setup_custom_endpoints(self):
|
||||
"""Setup custom endpoints for tenant service"""
|
||||
@self.app.get("/metrics")
|
||||
async def metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
if self.metrics_collector:
|
||||
return self.metrics_collector.get_metrics()
|
||||
return {"metrics": "not_available"}
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
# @self.app.get("/metrics")
|
||||
# async def metrics():
|
||||
# """Prometheus metrics endpoint"""
|
||||
# if self.metrics_collector:
|
||||
# return self.metrics_collector.get_metrics()
|
||||
# return {"metrics": "not_available"}
|
||||
|
||||
|
||||
# Create service instance
|
||||
|
||||
@@ -15,6 +15,7 @@ from app.api import training_jobs, training_operations, models, health, monitori
|
||||
from app.services.training_events import setup_messaging, cleanup_messaging
|
||||
from app.websocket.events import setup_websocket_event_consumer, cleanup_websocket_consumers
|
||||
from shared.service_base import StandardFastAPIService
|
||||
from shared.monitoring.system_metrics import SystemMetricsCollector
|
||||
|
||||
|
||||
class TrainingService(StandardFastAPIService):
|
||||
@@ -77,6 +78,11 @@ class TrainingService(StandardFastAPIService):
|
||||
async def on_startup(self, app: FastAPI):
|
||||
"""Custom startup logic including migration verification"""
|
||||
await self.verify_migrations()
|
||||
|
||||
# Initialize system metrics collection
|
||||
system_metrics = SystemMetricsCollector("training")
|
||||
self.logger.info("System metrics collection started")
|
||||
|
||||
self.logger.info("Training service startup completed")
|
||||
|
||||
async def on_shutdown(self, app: FastAPI):
|
||||
@@ -132,12 +138,14 @@ class TrainingService(StandardFastAPIService):
|
||||
|
||||
def setup_custom_endpoints(self):
|
||||
"""Setup custom endpoints for training service"""
|
||||
@self.app.get("/metrics")
|
||||
async def get_metrics():
|
||||
"""Prometheus metrics endpoint"""
|
||||
if self.metrics_collector:
|
||||
return self.metrics_collector.get_metrics()
|
||||
return {"status": "metrics not available"}
|
||||
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
|
||||
# The /metrics endpoint is not needed as metrics are pushed automatically
|
||||
# @self.app.get("/metrics")
|
||||
# async def get_metrics():
|
||||
# """Prometheus metrics endpoint"""
|
||||
# if self.metrics_collector:
|
||||
# return self.metrics_collector.get_metrics()
|
||||
# return {"status": "metrics not available"}
|
||||
|
||||
@self.app.get("/")
|
||||
async def root():
|
||||
|
||||
Reference in New Issue
Block a user