Improve monitoring 5
@@ -8,13 +8,12 @@ import json
 import structlog
 import resource
 import os
-from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse, Response
-import httpx
 import time
+from fastapi import Request, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.responses import StreamingResponse
+import httpx
 from shared.redis_utils import initialize_redis, close_redis, get_redis_client
-from typing import Dict, Any
+from shared.service_base import StandardFastAPIService
 
 from app.core.config import settings
 from app.middleware.request_id import RequestIDMiddleware
@@ -26,128 +25,84 @@ from app.middleware.subscription import SubscriptionMiddleware
 from app.middleware.demo_middleware import DemoMiddleware
 from app.middleware.read_only_mode import ReadOnlyModeMiddleware
 from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
 
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "gateway"):
-    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
-    # Create resource with service name
-    resource = Resource.create({"service.name": service_name})
-
-    # Configure OTLP exporter (sends to OpenTelemetry Collector)
-    otlp_exporter = OTLPSpanExporter(
-        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
-        insecure=True  # Use insecure connection for internal cluster communication
-    )
-
-    # Configure tracer provider
-    provider = TracerProvider(resource=resource)
-    processor = BatchSpanProcessor(otlp_exporter)
-    provider.add_span_processor(processor)
-
-    # Set global tracer provider
-    trace.set_tracer_provider(provider)
-
-    return provider
-
-# Initialize tracing
-tracer_provider = setup_tracing("gateway")
-
-# Setup logging
-setup_logging("gateway", settings.LOG_LEVEL)
+# Initialize logger
 logger = structlog.get_logger()
 
-# Check file descriptor limits and warn if too low
+# Check file descriptor limits
 try:
     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
     if soft_limit < 1024:
-        logger.warning(f"Low file descriptor limit detected: {soft_limit}. Gateway may experience 'too many open files' errors.")
-        logger.warning(f"Recommended: Increase limit with 'ulimit -n 4096' or higher for production.")
-        if soft_limit < 256:
-            logger.error(f"Critical: File descriptor limit ({soft_limit}) is too low for gateway operation!")
+        logger.warning(f"Low file descriptor limit detected: {soft_limit}")
     else:
         logger.info(f"File descriptor limit: {soft_limit} (sufficient)")
 except Exception as e:
     logger.debug(f"Could not check file descriptor limits: {e}")
 
-# Check and log current working directory and permissions
-try:
-    cwd = os.getcwd()
-    logger.info(f"Current working directory: {cwd}")
-
-    # Check if we can write to common log locations
-    test_locations = ["/var/log", "./logs", "."]
-    for location in test_locations:
-        try:
-            test_file = os.path.join(location, ".gateway_permission_test")
-            with open(test_file, 'w') as f:
-                f.write("test")
-            os.remove(test_file)
-            logger.info(f"Write permission confirmed for: {location}")
-        except Exception as e:
-            logger.warning(f"Cannot write to {location}: {e}")
-except Exception as e:
-    logger.debug(f"Could not check directory permissions: {e}")
-
-# Create FastAPI app
-app = FastAPI(
-    title="Bakery Forecasting API Gateway",
-    description="Central API Gateway for bakery forecasting microservices",
-    version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc",
-    redirect_slashes=False  # Disable automatic trailing slash redirects
-)
-
-# Instrument FastAPI with OpenTelemetry
-FastAPIInstrumentor.instrument_app(app)
-
-# Instrument httpx for outgoing requests
-HTTPXClientInstrumentor().instrument()
-
-# Instrument Redis (will be active once redis client is initialized)
-RedisInstrumentor().instrument()
-
-# Initialize metrics collector
-metrics_collector = MetricsCollector("gateway")
-
-# Add metrics middleware to track HTTP requests
-add_metrics_middleware(app, metrics_collector)
-
 # Redis client for SSE streaming
 redis_client = None
 
-# CORS middleware - Add first
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=settings.CORS_ORIGINS_LIST,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+class GatewayService(StandardFastAPIService):
+    """Gateway Service with standardized monitoring setup"""
+
+    async def on_startup(self, app):
+        """Custom startup logic for Gateway"""
+        global redis_client
+
+        # Initialize Redis
+        try:
+            await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
+            redis_client = await get_redis_client()
+            logger.info("Connected to Redis for SSE streaming")
+
+            # Add API rate limiting middleware with Redis client
+            app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
+            logger.info("API rate limiting middleware enabled")
+        except Exception as e:
+            logger.error(f"Failed to connect to Redis: {e}")
+
+        # Register custom metrics for gateway-specific operations
+        if self.telemetry_providers and self.telemetry_providers.app_metrics:
+            logger.info("Gateway-specific metrics tracking enabled")
+
+        await super().on_startup(app)
+
+    async def on_shutdown(self, app):
+        """Custom shutdown logic for Gateway"""
+        await super().on_shutdown(app)
+
+        # Close Redis
+        await close_redis()
+        logger.info("Redis connection closed")
+
+
+# Create service instance
+service = GatewayService(
+    service_name="gateway",
+    app_name="Bakery Forecasting API Gateway",
+    description="Central API Gateway for bakery forecasting microservices",
+    version="1.0.0",
+    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
+    cors_origins=settings.CORS_ORIGINS_LIST,
+    enable_metrics=True,
+    enable_health_checks=True,
+    enable_tracing=True,
+    enable_cors=True
+)
 
-# Custom middleware - Add in REVERSE order (last added = first executed)
+# Create FastAPI app
+app = service.create_app()
+
+# Add gateway-specific middleware (in REVERSE order of execution)
 # Execution order: RequestIDMiddleware -> DemoMiddleware -> AuthMiddleware -> ReadOnlyModeMiddleware -> SubscriptionMiddleware -> APIRateLimitMiddleware -> RateLimitMiddleware -> LoggingMiddleware
-app.add_middleware(LoggingMiddleware)  # Executes 8th (outermost)
-app.add_middleware(RateLimitMiddleware, calls_per_minute=300)  # Executes 7th - Simple rate limit
-# Note: APIRateLimitMiddleware will be added on startup with Redis client
-app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)  # Executes 5th
-app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)  # Executes 4th - Enforce read-only mode
-app.add_middleware(AuthMiddleware)  # Executes 3rd - Checks for demo context
-app.add_middleware(DemoMiddleware)  # Executes 2nd - Sets demo user context
-app.add_middleware(RequestIDMiddleware)  # Executes 1st (innermost) - Generates request ID for tracing
+app.add_middleware(LoggingMiddleware)
+app.add_middleware(RateLimitMiddleware, calls_per_minute=300)
+app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
+app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
+app.add_middleware(AuthMiddleware)
+app.add_middleware(DemoMiddleware)
+app.add_middleware(RequestIDMiddleware)
 
 # Include routers
 app.include_router(auth.router, prefix="/api/v1/auth", tags=["authentication"])
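The hunk above moves tracing, metrics, logging, CORS, and lifecycle wiring out of `main.py` and behind the shared `StandardFastAPIService` base. That class is not part of this diff; the following is a minimal sketch of the lifecycle pattern it appears to implement, with all names inferred from the call sites above (the real `shared/service_base.py` may differ):

```python
# Hypothetical sketch of the StandardFastAPIService pattern; inferred from
# usage in the diff, not taken from shared/service_base.py itself.
from contextlib import asynccontextmanager

from fastapi import FastAPI


class StandardFastAPIService:
    def __init__(self, service_name: str, app_name: str, version: str = "1.0.0",
                 enable_metrics: bool = True, enable_tracing: bool = True, **extra):
        self.service_name = service_name
        self.app_name = app_name
        self.version = version
        self.enable_metrics = enable_metrics
        self.enable_tracing = enable_tracing
        self.telemetry_providers = None  # assumed to be populated by the base on startup

    async def on_startup(self, app: FastAPI):
        """Base startup hook: the shared class would configure OTLP tracing/metrics here."""

    async def on_shutdown(self, app: FastAPI):
        """Base shutdown hook: the shared class would flush and close providers here."""

    def create_app(self) -> FastAPI:
        @asynccontextmanager
        async def lifespan(app: FastAPI):
            await self.on_startup(app)
            yield
            await self.on_shutdown(app)

        # Subclass hooks run inside the lifespan, replacing @app.on_event handlers.
        return FastAPI(title=self.app_name, version=self.version, lifespan=lifespan)
```

One consequence visible in the diff: startup and shutdown logic now lives in `on_startup`/`on_shutdown` overrides instead of the deprecated `@app.on_event` handlers removed below.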
@@ -156,114 +111,18 @@ app.include_router(subscription.router, prefix="/api/v1", tags=["subscriptions"])
 app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"])
 app.include_router(nominatim.router, prefix="/api/v1/nominatim", tags=["location"])
 app.include_router(geocoding.router, prefix="/api/v1/geocoding", tags=["geocoding"])
-# app.include_router(poi_context.router, prefix="/api/v1/poi-context", tags=["poi-context"])  # Removed to implement tenant-based architecture
 app.include_router(pos.router, prefix="/api/v1/pos", tags=["pos"])
 app.include_router(demo.router, prefix="/api/v1", tags=["demo"])
 
-
-@app.on_event("startup")
-async def startup_event():
-    """Application startup"""
-    global redis_client
-
-    logger.info("Starting API Gateway")
-
-    # Initialize shared Redis connection
-    try:
-        await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
-        redis_client = await get_redis_client()
-        logger.info("Connected to Redis for SSE streaming")
-
-        # Add API rate limiting middleware with Redis client
-        app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
-        logger.info("API rate limiting middleware enabled with subscription-based quotas")
-    except Exception as e:
-        logger.error(f"Failed to connect to Redis: {e}")
-        logger.warning("API rate limiting middleware will fail open (allow all requests)")
-
-    metrics_collector.register_counter(
-        "gateway_auth_requests_total",
-        "Total authentication requests"
-    )
-    metrics_collector.register_counter(
-        "gateway_auth_responses_total",
-        "Total authentication responses"
-    )
-    metrics_collector.register_counter(
-        "gateway_auth_errors_total",
-        "Total authentication errors"
-    )
-
-    metrics_collector.register_histogram(
-        "gateway_request_duration_seconds",
-        "Request duration in seconds"
-    )
-
-    logger.info("Metrics registered successfully")
-
-    # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
-    # Initialize system metrics collection
-    system_metrics = SystemMetricsCollector("gateway")
-    logger.info("System metrics collection started")
-
-    logger.info("Metrics export configured via OpenTelemetry OTLP")
-
-    logger.info("API Gateway started successfully")
-
-@app.on_event("shutdown")
-async def shutdown_event():
-    """Application shutdown"""
-    logger.info("Shutting down API Gateway")
-
-    # Close shared Redis connection
-    await close_redis()
-
-    # Clean up service discovery
-    # await service_discovery.cleanup()
-
-    logger.info("API Gateway shutdown complete")
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "service": "api-gateway",
-        "version": "1.0.0",
-        "timestamp": time.time()
-    }
-
-# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
-# The /metrics endpoint is not needed as metrics are pushed automatically
-
 # ================================================================
 # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS
 # ================================================================
 
 def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
-    """
-    Determine which Redis channels to subscribe to based on filters.
-
-    Args:
-        tenant_id: Tenant identifier
-        channel_filters: List of channel patterns (e.g., ["inventory.alerts", "*.notifications"])
-
-    Returns:
-        List of full channel names to subscribe to
-
-    Examples:
-        >>> _get_subscription_channels("abc", ["inventory.alerts"])
-        ["tenant:abc:inventory.alerts"]
-
-        >>> _get_subscription_channels("abc", ["*.alerts"])
-        ["tenant:abc:inventory.alerts", "tenant:abc:production.alerts", ...]
-
-        >>> _get_subscription_channels("abc", [])
-        ["tenant:abc:inventory.alerts", "tenant:abc:inventory.notifications", ...]
-    """
+    """Determine which Redis channels to subscribe to based on filters"""
     all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
     all_classes = ["alerts", "notifications"]
 
     channels = []
 
     if not channel_filters:
@@ -271,70 +130,49 @@ def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
         for domain in all_domains:
             for event_class in all_classes:
                 channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
-        # Also subscribe to recommendations (tenant-wide)
         channels.append(f"tenant:{tenant_id}:recommendations")
-        # Also subscribe to legacy channel for backward compatibility
-        channels.append(f"alerts:{tenant_id}")
+        channels.append(f"alerts:{tenant_id}")  # Legacy
         return channels
 
     # Parse filters and expand wildcards
     for filter_pattern in channel_filters:
         if filter_pattern == "*.*":
-            # All channels
             for domain in all_domains:
                 for event_class in all_classes:
                     channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
             channels.append(f"tenant:{tenant_id}:recommendations")
 
         elif filter_pattern.endswith(".*"):
-            # Domain wildcard (e.g., "inventory.*")
             domain = filter_pattern.split(".")[0]
             for event_class in all_classes:
                 channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
 
         elif filter_pattern.startswith("*."):
-            # Class wildcard (e.g., "*.alerts")
             event_class = filter_pattern.split(".")[1]
             if event_class == "recommendations":
                 channels.append(f"tenant:{tenant_id}:recommendations")
             else:
                 for domain in all_domains:
                     channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
 
         elif filter_pattern == "recommendations":
-            # Recommendations channel
             channels.append(f"tenant:{tenant_id}:recommendations")
 
         else:
-            # Specific channel (e.g., "inventory.alerts")
             channels.append(f"tenant:{tenant_id}:{filter_pattern}")
 
-    return list(set(channels))  # Remove duplicates
+    return list(set(channels))
 
 
 async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
-    """
-    Load initial state from Redis cache based on channel filters.
-
-    Args:
-        redis_client: Redis client
-        tenant_id: Tenant identifier
-        channel_filters: List of channel patterns
-
-    Returns:
-        List of initial events
-    """
+    """Load initial state from Redis cache based on channel filters"""
     initial_events = []
 
     try:
         if not channel_filters:
-            # Load from legacy cache if no filters (backward compat)
+            # Legacy cache
            legacy_cache_key = f"active_alerts:{tenant_id}"
            cached_data = await redis_client.get(legacy_cache_key)
            if cached_data:
                return json.loads(cached_data)
 
-            # Also try loading from new domain-specific caches
+            # New domain-specific caches
             all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
             all_classes = ["alerts", "notifications"]
 
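The commit collapses the docstring of `_get_subscription_channels` to one line; the expansion rules it implements are still easiest to read as examples. These are taken from the removed docstring ("abc" is an illustrative tenant id):

```python
# Expansion behaviour of _get_subscription_channels
# (examples from the original docstring; "abc" is illustrative).
_get_subscription_channels("abc", ["inventory.alerts"])
# -> ["tenant:abc:inventory.alerts"]

_get_subscription_channels("abc", ["*.alerts"])
# -> ["tenant:abc:inventory.alerts", "tenant:abc:production.alerts", ...]

_get_subscription_channels("abc", [])
# -> every "tenant:abc:{domain}.{class}" combination, plus
#    "tenant:abc:recommendations" and the legacy "alerts:abc" channel
```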
@@ -343,10 +181,9 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
                     cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
                     cached_data = await redis_client.get(cache_key)
                     if cached_data:
-                        events = json.loads(cached_data)
-                        initial_events.extend(events)
+                        initial_events.extend(json.loads(cached_data))
 
-            # Load recommendations
+            # Recommendations
             recommendations_cache_key = f"active_events:{tenant_id}:recommendations"
             cached_data = await redis_client.get(recommendations_cache_key)
             if cached_data:
@@ -356,36 +193,29 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
 
         # Load based on specific filters
         for filter_pattern in channel_filters:
-            # Extract domain and class from filter
             if "." in filter_pattern:
                 parts = filter_pattern.split(".")
                 domain = parts[0] if parts[0] != "*" else None
                 event_class = parts[1] if len(parts) > 1 and parts[1] != "*" else None
 
                 if domain and event_class:
-                    # Specific cache (e.g., "inventory.alerts")
                     cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
                     cached_data = await redis_client.get(cache_key)
                     if cached_data:
                         initial_events.extend(json.loads(cached_data))
 
                 elif domain and not event_class:
-                    # Domain wildcard (e.g., "inventory.*")
                     for ec in ["alerts", "notifications"]:
                         cache_key = f"active_events:{tenant_id}:{domain}.{ec}"
                         cached_data = await redis_client.get(cache_key)
                         if cached_data:
                             initial_events.extend(json.loads(cached_data))
 
                 elif not domain and event_class:
-                    # Class wildcard (e.g., "*.alerts")
                     all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
                     for d in all_domains:
                         cache_key = f"active_events:{tenant_id}:{d}.{event_class}s"
                         cached_data = await redis_client.get(cache_key)
                         if cached_data:
                             initial_events.extend(json.loads(cached_data))
 
             elif filter_pattern == "recommendations":
                 cache_key = f"active_events:{tenant_id}:recommendations"
                 cached_data = await redis_client.get(cache_key)
@@ -400,27 +230,14 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
 
 
 def _determine_event_type(event_data: dict) -> str:
-    """
-    Determine SSE event type from event data.
-
-    Args:
-        event_data: Event data dictionary
-
-    Returns:
-        SSE event type: 'alert', 'notification', or 'recommendation'
-    """
-    # New event architecture uses 'event_class'
+    """Determine SSE event type from event data"""
     if 'event_class' in event_data:
-        return event_data['event_class']  # 'alert', 'notification', or 'recommendation'
+        return event_data['event_class']
 
-    # Legacy format uses 'item_type'
     if 'item_type' in event_data:
         if event_data['item_type'] == 'recommendation':
             return 'recommendation'
         else:
             return 'alert'
 
-    # Default to 'alert' for backward compatibility
     return 'alert'
 
 
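`_determine_event_type` keeps supporting both event schemas; its mapping, spelled out directly from the code above (the sample payloads are illustrative):

```python
# Mapping implemented by _determine_event_type:
_determine_event_type({"event_class": "notification"})  # -> 'notification' (new schema)
_determine_event_type({"item_type": "recommendation"})  # -> 'recommendation' (legacy)
_determine_event_type({"item_type": "low_stock"})       # -> 'alert' (legacy, non-recommendation)
_determine_event_type({})                               # -> 'alert' (default)
```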
@@ -432,42 +249,25 @@ def _determine_event_type(event_data: dict) -> str:
 async def events_stream(
     request: Request,
     tenant_id: str,
-    channels: str = None  # Comma-separated channel filters (e.g., "inventory.alerts,production.notifications")
+    channels: str = None
 ):
     """
     Server-Sent Events stream for real-time notifications with multi-channel support.
 
-    Authentication is handled by auth middleware via query param token.
-    User context is available in request.state.user (injected by middleware).
-
     Query Parameters:
         tenant_id: Tenant identifier (required)
         channels: Comma-separated channel filters (optional)
-            Examples:
-            - "inventory.alerts,production.notifications" - Specific channels
-            - "*.alerts" - All alert channels
-            - "inventory.*" - All inventory events
-            - None - All channels (default, backward compatible)
-
-    New channel pattern: tenant:{tenant_id}:{domain}.{class}
-    Examples:
-    - tenant:abc:inventory.alerts
-    - tenant:abc:production.notifications
-    - tenant:abc:recommendations
-
-    Legacy channel (backward compat): alerts:{tenant_id}
     """
     global redis_client
 
     if not redis_client:
         raise HTTPException(status_code=503, detail="SSE service unavailable")
 
-    # Extract user context from request state (set by auth middleware)
+    # Extract user context from request state
     user_context = request.state.user
     user_id = user_context.get('user_id')
     email = user_context.get('email')
 
-    # Validate tenant_id parameter
     if not tenant_id:
         raise HTTPException(status_code=400, detail="tenant_id query parameter is required")
 
@@ -479,79 +279,53 @@ async def events_stream(
     logger.info(f"SSE connection request for user {email}, tenant {tenant_id}, channels: {channel_filters or 'all'}")
 
     async def event_generator():
-        """Generate server-sent events from Redis pub/sub with multi-channel support"""
+        """Generate server-sent events from Redis pub/sub"""
         pubsub = None
         try:
-            # Create pubsub connection with resource monitoring
             pubsub = redis_client.pubsub()
             logger.debug(f"Created Redis pubsub connection for tenant: {tenant_id}")
 
-            # Monitor connection count
-            try:
-                connection_info = await redis_client.info('clients')
-                connected_clients = connection_info.get('connected_clients', 'unknown')
-                logger.debug(f"Redis connected clients: {connected_clients}")
-            except Exception:
-                # Don't fail if we can't get connection info
-                pass
-
-            # Determine which channels to subscribe to
+            # Determine channels
             subscription_channels = _get_subscription_channels(tenant_id, channel_filters)
 
-            # Subscribe to all determined channels
+            # Subscribe
             if subscription_channels:
                 await pubsub.subscribe(*subscription_channels)
                 logger.info(f"Subscribed to {len(subscription_channels)} channels for tenant {tenant_id}")
             else:
-                # Fallback to legacy channel if no channels specified
                 legacy_channel = f"alerts:{tenant_id}"
                 await pubsub.subscribe(legacy_channel)
-                logger.info(f"Subscribed to legacy channel: {legacy_channel}")
 
-            # Send initial connection event
+            # Connection event
             yield f"event: connection\n"
             yield f"data: {json.dumps({'type': 'connected', 'message': 'SSE connection established', 'channels': subscription_channels or ['all'], 'timestamp': time.time()})}\n\n"
 
-            # Fetch and send initial state from cache (domain-specific or legacy)
+            # Initial state
             initial_events = await _load_initial_state(redis_client, tenant_id, channel_filters)
             if initial_events:
                 logger.info(f"Sending {len(initial_events)} initial events to tenant {tenant_id}")
                 yield f"event: initial_state\n"
                 yield f"data: {json.dumps(initial_events)}\n\n"
-            else:
-                # Send empty initial state for compatibility
-                yield f"event: initial_state\n"
-                yield f"data: {json.dumps([])}\n\n"
 
             heartbeat_counter = 0
 
             while True:
-                # Check if client has disconnected
                 if await request.is_disconnected():
                     logger.info(f"SSE client disconnected for tenant: {tenant_id}")
                     break
 
                 try:
-                    # Get message from Redis with timeout
                     message = await asyncio.wait_for(pubsub.get_message(ignore_subscribe_messages=True), timeout=10.0)
 
                     if message and message['type'] == 'message':
-                        # Forward the event from Redis
                         event_data = json.loads(message['data'])
 
-                        # Determine event type for SSE
                         event_type = _determine_event_type(event_data)
 
-                        # Add channel metadata for frontend routing
                         event_data['_channel'] = message['channel'].decode('utf-8') if isinstance(message['channel'], bytes) else message['channel']
 
                         yield f"event: {event_type}\n"
                         yield f"data: {json.dumps(event_data)}\n\n"
 
-                        logger.debug(f"SSE event sent to tenant {tenant_id}: {event_type} - {event_data.get('title')}")
-
                 except asyncio.TimeoutError:
-                    # Send heartbeat every 10 timeouts (100 seconds)
                     heartbeat_counter += 1
                     if heartbeat_counter >= 10:
                         yield f"event: heartbeat\n"
@@ -563,24 +337,13 @@ async def events_stream(
         except Exception as e:
             logger.error(f"SSE error for tenant {tenant_id}: {e}", exc_info=True)
         finally:
-            try:
-                if pubsub:
-                    try:
-                        # Unsubscribe from all channels
-                        await pubsub.unsubscribe()
-                        logger.debug(f"Unsubscribed from Redis channels for tenant: {tenant_id}")
-                    except Exception as unsubscribe_error:
-                        logger.error(f"Failed to unsubscribe Redis pubsub for tenant {tenant_id}: {unsubscribe_error}")
-
-                    try:
-                        # Close pubsub connection
-                        await pubsub.close()
-                        logger.debug(f"Closed Redis pubsub connection for tenant: {tenant_id}")
-                    except Exception as close_error:
-                        logger.error(f"Failed to close Redis pubsub for tenant {tenant_id}: {close_error}")
-
-                logger.info(f"SSE connection closed for tenant: {tenant_id}")
-            except Exception as finally_error:
-                logger.error(f"Error in SSE cleanup for tenant {tenant_id}: {finally_error}")
+            if pubsub:
+                try:
+                    await pubsub.unsubscribe()
+                    await pubsub.close()
+                except Exception as e:
+                    logger.error(f"Error closing pubsub: {e}")
+            logger.info(f"SSE connection closed for tenant: {tenant_id}")
 
     return StreamingResponse(
         event_generator(),
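For context, the stream emits `connection`, `initial_state`, typed event frames, and periodic `heartbeat` frames. A minimal client sketch using httpx (already a gateway dependency); the endpoint path and the token query parameter are assumptions, since the route decorator sits outside this diff:

```python
# Minimal SSE consumer sketch; the endpoint path and auth query param are assumed.
import asyncio

import httpx


async def consume_events(base_url: str, tenant_id: str, token: str) -> None:
    params = {"tenant_id": tenant_id, "channels": "*.alerts", "token": token}
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("GET", f"{base_url}/api/v1/events/stream",
                                 params=params) as response:
            event_type = None
            async for line in response.aiter_lines():
                if line.startswith("event:"):
                    event_type = line.split(":", 1)[1].strip()
                elif line.startswith("data:"):
                    print(event_type, line.split(":", 1)[1].strip())

# asyncio.run(consume_events("http://localhost:8000", "abc", "<jwt>"))
```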
@@ -593,55 +356,35 @@ async def events_stream(
         }
     )
 
 
 # ================================================================
 # WEBSOCKET ROUTING FOR TRAINING SERVICE
 # ================================================================
 
 @app.websocket("/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live")
 async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
-    """
-    Simple WebSocket proxy with token verification only.
-    Validates the token and forwards the connection to the training service.
-    """
-    # Get token from query params
+    """WebSocket proxy with token verification for training service"""
     token = websocket.query_params.get("token")
     if not token:
-        logger.warning("WebSocket proxy rejected - missing token",
-                       job_id=job_id,
-                       tenant_id=tenant_id)
         await websocket.accept()
         await websocket.close(code=1008, reason="Authentication token required")
         return
 
     # Verify token
     from shared.auth.jwt_handler import JWTHandler
 
     jwt_handler = JWTHandler(settings.JWT_SECRET_KEY, settings.JWT_ALGORITHM)
 
     try:
         payload = jwt_handler.verify_token(token)
         if not payload or not payload.get('user_id'):
-            logger.warning("WebSocket proxy rejected - invalid token",
-                           job_id=job_id,
-                           tenant_id=tenant_id)
             await websocket.accept()
             await websocket.close(code=1008, reason="Invalid token")
             return
 
-        logger.info("WebSocket proxy - token verified",
-                    user_id=payload.get('user_id'),
-                    tenant_id=tenant_id,
-                    job_id=job_id)
-
     except Exception as e:
-        logger.warning("WebSocket proxy - token verification failed",
-                       job_id=job_id,
-                       error=str(e))
         await websocket.accept()
         await websocket.close(code=1008, reason="Token verification failed")
         return
 
-    # Accept the connection
     await websocket.accept()
 
     # Build WebSocket URL to training service
@@ -649,33 +392,24 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
     training_ws_url = training_service_base.replace('http://', 'ws://').replace('https://', 'wss://')
     training_ws_url = f"{training_ws_url}/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live?token={token}"
 
-    logger.info("Gateway proxying WebSocket to training service",
-                job_id=job_id,
-                training_ws_url=training_ws_url.replace(token, '***'))
-
     training_ws = None
 
     try:
-        # Connect to training service WebSocket
         import websockets
         from websockets.protocol import State
 
         training_ws = await websockets.connect(
             training_ws_url,
-            ping_interval=120,  # Send ping every 2 minutes (tolerates long training operations)
-            ping_timeout=60,  # Wait up to 1 minute for pong (graceful timeout)
-            close_timeout=60,  # Increase close timeout for graceful shutdown
+            ping_interval=120,
+            ping_timeout=60,
+            close_timeout=60,
             open_timeout=30
         )
 
-        logger.info("Gateway connected to training service WebSocket", job_id=job_id)
-
         async def forward_frontend_to_training():
-            """Forward messages from frontend to training service"""
             try:
                 while training_ws and training_ws.state == State.OPEN:
                     data = await websocket.receive()
 
                     if data.get("type") == "websocket.receive":
                         if "text" in data:
                             await training_ws.send(data["text"])
@@ -683,30 +417,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
                             await training_ws.send(data["bytes"])
                     elif data.get("type") == "websocket.disconnect":
                         break
-            except Exception as e:
-                logger.debug("Frontend to training forward ended", error=str(e))
+            except Exception:
+                pass
 
         async def forward_training_to_frontend():
-            """Forward messages from training service to frontend"""
-            message_count = 0
             try:
                 while training_ws and training_ws.state == State.OPEN:
                     message = await training_ws.recv()
                     await websocket.send_text(message)
-                    message_count += 1
-
-                    # Log every 10th message to track connectivity
-                    if message_count % 10 == 0:
-                        logger.debug("WebSocket proxy active",
-                                     job_id=job_id,
-                                     messages_forwarded=message_count)
-            except Exception as e:
-                logger.info("Training to frontend forward ended",
-                            job_id=job_id,
-                            messages_forwarded=message_count,
-                            error=str(e))
+            except Exception:
+                pass
 
-        # Run both forwarding tasks concurrently
         await asyncio.gather(
             forward_frontend_to_training(),
             forward_training_to_frontend(),
@@ -716,20 +437,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
     except Exception as e:
         logger.error("WebSocket proxy error", job_id=job_id, error=str(e))
     finally:
-        # Cleanup
         if training_ws and training_ws.state == State.OPEN:
             try:
                 await training_ws.close()
             except:
                 pass
 
         try:
             if not websocket.client_state.name == 'DISCONNECTED':
                 await websocket.close(code=1000, reason="Proxy closed")
         except:
             pass
 
-        logger.info("WebSocket proxy connection closed", job_id=job_id)
-
 if __name__ == "__main__":
     import uvicorn
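The proxy keeps the same route path on both sides of the gateway, so a client can exercise it directly (host, tenant, job id, and token below are placeholders):

```python
# Hypothetical client for the WebSocket proxy route defined above.
import asyncio

import websockets


async def watch_training(gateway_host: str, tenant_id: str, job_id: str, token: str):
    url = (f"ws://{gateway_host}/api/v1/tenants/{tenant_id}"
           f"/training/jobs/{job_id}/live?token={token}")
    async with websockets.connect(url) as ws:
        async for message in ws:  # frames are forwarded 1:1 from the training service
            print(message)

# asyncio.run(watch_training("localhost:8000", "abc", "job-123", "<jwt>"))
```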
@@ -48,9 +48,9 @@ signoz:
   signoz_traces_ttl_duration_hrs: "168"
   signoz_metrics_ttl_duration_hrs: "168"
   signoz_logs_ttl_duration_hrs: "168"
-  # OpAMP Server Configuration
-  signoz_opamp_server_enabled: "true"
-  signoz_opamp_server_endpoint: "0.0.0.0:4320"
+  # OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
+  signoz_opamp_server_enabled: "false"
+  # signoz_opamp_server_endpoint: "0.0.0.0:4320"
 
 persistence:
   enabled: true
@@ -149,9 +149,10 @@ otelCollector:
     repository: signoz/signoz-otel-collector
     tag: v0.129.12  # Latest recommended version
 
-  # OpAMP Configuration - Enabled for dynamic configuration management
-  # Note: OpAMP allows remote configuration management via SigNoz backend
-  # This replaces the manual kubectl patch approach
+  # OpAMP Configuration - DISABLED for development
+  # OpAMP is designed for production with remote config management
+  # In dev, it causes gRPC instability and collector reloads
+  # We use static configuration instead
 
   # Init containers for the Otel Collector pod
   initContainers:
@@ -231,6 +232,9 @@ otelCollector:
         secretName: postgres-tls
     - name: postgres-tls-fixed
       emptyDir: {}
+    - name: varlogpods
+      hostPath:
+        path: /var/log/pods
 
   extraVolumeMounts:
     - name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
     - name: postgres-tls-fixed
       mountPath: /etc/postgres-tls
       readOnly: false
+    - name: varlogpods
+      mountPath: /var/log/pods
+      readOnly: true
 
-  # Enable OpAMP for dynamic configuration management
+  # Disable OpAMP - use static configuration only
+  # Use 'args' instead of 'extraArgs' to completely override the command
   command:
     name: /signoz-otel-collector
-  extraArgs:
+  args:
     - --config=/conf/otel-collector-config.yaml
-    - --manager-config=/conf/otel-collector-opamp-config.yaml
     - --feature-gates=-pkg.translator.prometheus.NormalizeName
 
 # OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
           allowed_origins:
             - "*"
 
+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
+            timestamp:
+              parse_from: attributes.time
+              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+          # Fix timestamp parsing - extract from the parsed time field
+          - type: move
+            from: attributes.time
+            to: attributes.timestamp
+          # Extract Kubernetes metadata from file path
+          - type: regex_parser
+            id: extract_metadata_from_filepath
+            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
+            parse_from: attributes["log.file.path"]
+          # Move metadata to resource attributes
+          - type: move
+            from: attributes.namespace
+            to: resource["k8s.namespace.name"]
+          - type: move
+            from: attributes.pod_name
+            to: resource["k8s.pod.name"]
+          - type: move
+            from: attributes.container_name
+            to: resource["k8s.container.name"]
+          - type: move
+            from: attributes.log
+            to: body
+
+      # Kubernetes Cluster Receiver - Collects cluster-level metrics
+      # Provides information about nodes, namespaces, pods, and other cluster resources
+      k8s_cluster:
+        collection_interval: 30s
+        node_conditions_to_report:
+          - Ready
+          - MemoryPressure
+          - DiskPressure
+          - PIDPressure
+          - NetworkUnavailable
+        allocatable_types_to_report:
+          - cpu
+          - memory
+          - pods
+
       # PostgreSQL receivers for database metrics
       # ENABLED: Monitor users configured and credentials stored in secrets
       # Collects metrics directly from PostgreSQL databases with proper TLS
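The first `regex_parser` operator in the filelog receiver splits the containerd/CRI-O line format into time, stream, tag, and payload. It can be sanity-checked in isolation; the regex below is copied from the config, the sample log line is made up:

```python
# Sanity check for the CRI log regex used by the filelog receiver.
import re

CRI_LINE = re.compile(
    r'^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
)

sample = '2024-05-01T12:00:00.123456789Z stdout F {"level":"info","msg":"ready"}'
match = CRI_LINE.match(sample)
assert match is not None
print(match.group("time"))    # 2024-05-01T12:00:00.123456789Z
print(match.group("stream"))  # stdout
print(match.group("log"))     # {"level":"info","msg":"ready"}
```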
@@ -538,6 +602,43 @@ otelCollector:
             password: ${env:RABBITMQ_PASSWORD}
             collection_interval: 30s
 
+      # Prometheus Receiver - Scrapes metrics from Kubernetes API
+      # Simplified configuration using only Kubernetes API metrics
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: 'kubernetes-nodes-cadvisor'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - action: labelmap
+                  regex: __meta_kubernetes_node_label_(.+)
+                - target_label: __address__
+                  replacement: kubernetes.default.svc:443
+                - source_labels: [__meta_kubernetes_node_name]
+                  regex: (.+)
+                  target_label: __metrics_path__
+                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            - job_name: 'kubernetes-apiserver'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: endpoints
+              relabel_configs:
+                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+                  action: keep
+                  regex: default;kubernetes;https
+
     processors:
       # Batch processor for better performance (optimized for high throughput)
       batch:
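The cadvisor job's relabel rules rewrite every node target to go through the API-server proxy: `__address__` becomes `kubernetes.default.svc:443` and the metrics path becomes `/api/v1/nodes/<node>/proxy/metrics/cadvisor`. One scrape is roughly equivalent to this request (node name hypothetical; the token path is the in-cluster service-account default):

```python
# Rough equivalent of one cadvisor scrape, as the relabel rules configure it.
import httpx

TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"


def fetch_cadvisor_metrics(node_name: str) -> str:
    token = open(TOKEN_PATH).read().strip()
    url = f"https://kubernetes.default.svc:443/api/v1/nodes/{node_name}/proxy/metrics/cadvisor"
    # insecure_skip_verify: true in the scrape config ~ verify=False here
    response = httpx.get(url, headers={"Authorization": f"Bearer {token}"}, verify=False)
    response.raise_for_status()
    return response.text
```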
@@ -562,6 +663,25 @@ otelCollector:
         detectors: [env, system, docker]
         timeout: 5s
 
+      # Kubernetes attributes processor - CRITICAL for logs
+      # Extracts pod, namespace, container metadata from log attributes
+      k8sattributes:
+        auth_type: "serviceAccount"
+        passthrough: false
+        extract:
+          metadata:
+            - k8s.pod.name
+            - k8s.pod.uid
+            - k8s.deployment.name
+            - k8s.namespace.name
+            - k8s.node.name
+            - k8s.container.name
+          labels:
+            - tag_name: "app"
+            - tag_name: "pod-template-hash"
+          annotations:
+            - tag_name: "description"
+
       # SigNoz span metrics processor with delta aggregation (recommended)
       # Generates RED metrics (Rate, Error, Duration) from trace spans
       signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
         postgresql/orchestrator, postgresql/pos, postgresql/procurement,
         postgresql/production, postgresql/recipes, postgresql/sales,
         postgresql/suppliers, postgresql/tenant, postgresql/training,
-        redis, rabbitmq]
+        redis, rabbitmq, k8s_cluster, prometheus]
       processors: [memory_limiter, batch, resourcedetection]
       exporters: [signozclickhousemetrics]
 
@@ -653,17 +773,38 @@ otelCollector:
       processors: [batch/meter]
       exporters: [signozclickhousemeter]
 
-    # Logs pipeline
+    # Logs pipeline - includes both OTLP and Kubernetes pod logs
     logs:
-      receivers: [otlp]
-      processors: [memory_limiter, batch, resourcedetection]
+      receivers: [otlp, filelog]
+      processors: [memory_limiter, batch, resourcedetection, k8sattributes]
       exporters: [clickhouselogsexporter]
 
 # Additional Configuration
 serviceAccount:
   create: true
   annotations: {}
-  name: ""
+  name: "signoz-otel-collector"
+
+# RBAC Configuration for Kubernetes monitoring
+# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
+rbac:
+  create: true
+  rules:
+    - apiGroups: [""]
+      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["apps"]
+      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["batch"]
+      resources: ["jobs", "cronjobs"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["extensions"]
+      resources: ["deployments", "daemonsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["metrics.k8s.io"]
+      resources: ["nodes", "pods"]
+      verbs: ["get", "list", "watch"]
 
 # Security Context
 securityContext:
@@ -66,6 +66,11 @@ signoz:
   signoz_traces_ttl_duration_hrs: "720"
   signoz_metrics_ttl_duration_hrs: "720"
   signoz_logs_ttl_duration_hrs: "720"
+  # OpAMP Server Configuration
+  # WARNING: OpAMP can cause gRPC instability and collector reloads
+  # Only enable if you have a stable OpAMP backend server
+  signoz_opamp_server_enabled: "false"
+  # signoz_opamp_server_endpoint: "0.0.0.0:4320"
   # SMTP configuration for email alerts
   signoz_smtp_enabled: "true"
   signoz_smtp_host: "smtp.gmail.com"
@@ -247,17 +252,52 @@ otelCollector:
     tag: v0.129.12 # Updated to latest recommended version
     pullPolicy: IfNotPresent

+  # Init containers for the Otel Collector pod
+  initContainers:
+    fix-postgres-tls:
+      enabled: true
+      image:
+        registry: docker.io
+        repository: busybox
+        tag: 1.35
+        pullPolicy: IfNotPresent
+      command:
+        - sh
+        - -c
+        - |
+          echo "Fixing PostgreSQL TLS file permissions..."
+          cp /etc/postgres-tls-source/* /etc/postgres-tls/
+          chmod 600 /etc/postgres-tls/server-key.pem
+          chmod 644 /etc/postgres-tls/server-cert.pem
+          chmod 644 /etc/postgres-tls/ca-cert.pem
+          echo "PostgreSQL TLS permissions fixed"
+      volumeMounts:
+        - name: postgres-tls-source
+          mountPath: /etc/postgres-tls-source
+          readOnly: true
+        - name: postgres-tls-fixed
+          mountPath: /etc/postgres-tls
+          readOnly: false
+
   service:
     type: ClusterIP
     ports:
       - name: otlp-grpc
        port: 4317
+        targetPort: 4317
+        protocol: TCP
       - name: otlp-http
         port: 4318
+        targetPort: 4318
+        protocol: TCP
+      - name: prometheus
+        port: 8889
+        targetPort: 8889
+        protocol: TCP
       - name: metrics
         port: 8888
-      - name: healthcheck
-        port: 13133
+        targetPort: 8888
+        protocol: TCP

   resources:
     requests:
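With the healthcheck port dropped from the Service, the collector's self-telemetry on port 8888 is the simplest reachability signal left on this Service. A minimal in-cluster probe sketch (the DNS name is assumed from this chart's namespace and the serviceAccount name set above):

    # Sketch: fetch the collector's Prometheus-format self-metrics on port 8888.
    import urllib.request

    url = "http://signoz-otel-collector.bakery-ia.svc.cluster.local:8888/metrics"
    with urllib.request.urlopen(url, timeout=5) as resp:
        print(resp.status, resp.read().decode().splitlines()[0])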
@@ -267,6 +307,50 @@ otelCollector:
       cpu: 2000m
       memory: 2Gi

+  # Additional environment variables for receivers
+  additionalEnvs:
+    POSTGRES_MONITOR_USER: "monitoring"
+    POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
+    REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
+    RABBITMQ_USER: "bakery"
+    RABBITMQ_PASSWORD: "forecast123"
+
+  # Mount TLS certificates for secure connections
+  extraVolumes:
+    - name: redis-tls
+      secret:
+        secretName: redis-tls-secret
+    - name: postgres-tls
+      secret:
+        secretName: postgres-tls
+    - name: postgres-tls-fixed
+      emptyDir: {}
+    - name: varlogpods
+      hostPath:
+        path: /var/log/pods
+
+  extraVolumeMounts:
+    - name: redis-tls
+      mountPath: /etc/redis-tls
+      readOnly: true
+    - name: postgres-tls
+      mountPath: /etc/postgres-tls-source
+      readOnly: true
+    - name: postgres-tls-fixed
+      mountPath: /etc/postgres-tls
+      readOnly: false
+    - name: varlogpods
+      mountPath: /var/log/pods
+      readOnly: true
+
+  # Enable OpAMP for dynamic configuration management
+  command:
+    name: /signoz-otel-collector
+  extraArgs:
+    - --config=/conf/otel-collector-config.yaml
+    - --manager-config=/conf/otel-collector-opamp-config.yaml
+    - --feature-gates=-pkg.translator.prometheus.NormalizeName
+
   # Full OTEL Collector Configuration
   config:
     # Connectors - bridge between pipelines
@@ -297,14 +381,358 @@ otelCollector:
           - "https://monitoring.bakewise.ai"
           - "https://*.bakewise.ai"

+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
+            timestamp:
+              parse_from: attributes.time
+              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+          # Fix timestamp parsing - extract from the parsed time field
+          - type: move
+            from: attributes.time
+            to: attributes.timestamp
+          # Extract Kubernetes metadata from file path
+          - type: regex_parser
+            id: extract_metadata_from_filepath
+            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
+            parse_from: attributes["log.file.path"]
+          # Move metadata to resource attributes
+          - type: move
+            from: attributes.namespace
+            to: resource["k8s.namespace.name"]
+          - type: move
+            from: attributes.pod_name
+            to: resource["k8s.pod.name"]
+          - type: move
+            from: attributes.container_name
+            to: resource["k8s.container.name"]
+          - type: move
+            from: attributes.log
+            to: body
+
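The two regex_parser operators above carry the whole parsing burden, so it is worth convincing yourself they match real containerd output. A minimal standalone sketch (the sample log line and pod-log path are made up for illustration):

    import re

    # Same patterns as the filelog operators above.
    CRI_RE = re.compile(r'^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$')
    PATH_RE = re.compile(r'^.*/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^/]+)/'
                         r'(?P<container_name>[^._]+)/(?P<restart_count>\d+)\.log$')

    line = '2024-05-01T12:00:00.123456789Z stdout F {"level":"info","msg":"ready"}'
    path = '/var/log/pods/bakery-ia_gateway-7f9c6b5d8-abcde_0a1b2c3d/gateway/3.log'

    print(CRI_RE.match(line).groupdict())   # time / stream / logtag / log
    print(PATH_RE.match(path).groupdict())  # namespace / pod_name / uid / container_name / restart_count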
+      # Kubernetes Cluster Receiver - Collects cluster-level metrics
+      # Provides information about nodes, namespaces, pods, and other cluster resources
+      k8s_cluster:
+        collection_interval: 30s
+        node_conditions_to_report:
+          - Ready
+          - MemoryPressure
+          - DiskPressure
+          - PIDPressure
+          - NetworkUnavailable
+        allocatable_types_to_report:
+          - cpu
+          - memory
+          - pods
+
       # Prometheus receiver for scraping metrics
       prometheus:
         config:
           scrape_configs:
-            - job_name: 'otel-collector'
+            - job_name: 'kubernetes-nodes-cadvisor'
               scrape_interval: 30s
-              static_configs:
-                - targets: ['localhost:8888']
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - action: labelmap
+                  regex: __meta_kubernetes_node_label_(.+)
+                - target_label: __address__
+                  replacement: kubernetes.default.svc:443
+                - source_labels: [__meta_kubernetes_node_name]
+                  regex: (.+)
+                  target_label: __metrics_path__
+                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            - job_name: 'kubernetes-apiserver'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: endpoints
+              relabel_configs:
+                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+                  action: keep
+                  regex: default;kubernetes;https
+
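The cadvisor relabeling above proxies node scrapes through the API server rather than hitting each kubelet directly, which is why the RBAC rules need nodes/proxy. For one node the resulting target is easy to compute by hand; a small sketch (the node name is hypothetical):

    # Sketch of the scrape URL produced by the relabel rules above.
    node = "worker-1"  # hypothetical node name discovered by role: node
    address = "kubernetes.default.svc:443"  # forced by the __address__ relabel
    metrics_path = f"/api/v1/nodes/{node}/proxy/metrics/cadvisor"  # ${1} substitution
    print(f"https://{address}{metrics_path}")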
+      # Redis receiver for cache metrics
+      # ENABLED: Using existing credentials from redis-secrets with TLS
+      redis:
+        endpoint: redis-service.bakery-ia:6379
+        password: ${env:REDIS_PASSWORD}
+        collection_interval: 60s
+        transport: tcp
+        tls:
+          insecure_skip_verify: false
+          cert_file: /etc/redis-tls/redis-cert.pem
+          key_file: /etc/redis-tls/redis-key.pem
+          ca_file: /etc/redis-tls/ca-cert.pem
+        metrics:
+          redis.maxmemory:
+            enabled: true
+          redis.cmd.latency:
+            enabled: true
+
+      # RabbitMQ receiver via management API
+      # ENABLED: Using existing credentials from rabbitmq-secrets
+      rabbitmq:
+        endpoint: http://rabbitmq-service.bakery-ia:15672
+        username: ${env:RABBITMQ_USER}
+        password: ${env:RABBITMQ_PASSWORD}
+        collection_interval: 30s
+
+      # PostgreSQL receivers for database metrics
+      # Monitor all databases with proper TLS configuration
+      postgresql/auth:
+        endpoint: auth-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - auth_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/inventory:
+        endpoint: inventory-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - inventory_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/orders:
+        endpoint: orders-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orders_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/ai-insights:
+        endpoint: ai-insights-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - ai_insights_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/alert-processor:
+        endpoint: alert-processor-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - alert_processor_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/distribution:
+        endpoint: distribution-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - distribution_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/external:
+        endpoint: external-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - external_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/forecasting:
+        endpoint: forecasting-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - forecasting_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/notification:
+        endpoint: notification-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - notification_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/orchestrator:
+        endpoint: orchestrator-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orchestrator_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/pos:
+        endpoint: pos-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - pos_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/procurement:
+        endpoint: procurement-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - procurement_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/production:
+        endpoint: production-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - production_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/recipes:
+        endpoint: recipes-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - recipes_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/sales:
+        endpoint: sales-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - sales_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/suppliers:
+        endpoint: suppliers-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - suppliers_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/tenant:
+        endpoint: tenant-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - tenant_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
+      postgresql/training:
+        endpoint: training-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - training_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+
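The eighteen postgresql/* stanzas above differ only in the service segment of the endpoint and the database name, which makes them a natural candidate for templating in whatever renders these values. A minimal, hypothetical Python sketch that would emit equivalent stanzas (PyYAML assumed; this generator is not part of the chart):

    import yaml

    SERVICES = ["auth", "inventory", "orders", "ai-insights", "alert-processor",
                "distribution", "external", "forecasting", "notification",
                "orchestrator", "pos", "procurement", "production", "recipes",
                "sales", "suppliers", "tenant", "training"]

    def receiver(name: str) -> dict:
        # e.g. "ai-insights" -> endpoint ai-insights-db-service, database ai_insights_db
        return {
            "endpoint": f"{name}-db-service.bakery-ia:5432",
            "username": "${env:POSTGRES_MONITOR_USER}",
            "password": "${env:POSTGRES_MONITOR_PASSWORD}",
            "databases": [name.replace("-", "_") + "_db"],
            "collection_interval": "60s",
            "tls": {
                "insecure": False,
                "cert_file": "/etc/postgres-tls/server-cert.pem",
                "key_file": "/etc/postgres-tls/server-key.pem",
                "ca_file": "/etc/postgres-tls/ca-cert.pem",
            },
        }

    print(yaml.safe_dump({f"postgresql/{s}": receiver(s) for s in SERVICES}))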
     processors:
       # High-performance batch processing (official recommendation)
@@ -326,7 +754,7 @@ otelCollector:

       # Resource detection for K8s
       resourcedetection:
-        detectors: [env, system, docker, kubernetes]
+        detectors: [env, system, docker]
         timeout: 5s

       # Add resource attributes
@@ -339,6 +767,26 @@ otelCollector:
           value: bakery-ia-prod
           action: upsert

+      # Kubernetes attributes processor - CRITICAL for logs
+      # Extracts pod, namespace, container metadata from log attributes
+      k8sattributes:
+        auth_type: "serviceAccount"
+        passthrough: false
+        extract:
+          metadata:
+            - k8s.pod.name
+            - k8s.pod.uid
+            - k8s.deployment.name
+            - k8s.namespace.name
+            - k8s.node.name
+            - k8s.container.name
+          labels:
+            - tag_name: "app"
+            - tag_name: "pod-template-hash"
+            - tag_name: "version"
+          annotations:
+            - tag_name: "description"
+
       # SigNoz span metrics processor with delta aggregation (recommended)
       # Generates RED metrics (Rate, Error, Duration) from trace spans
       signozspanmetrics/delta:
@@ -354,9 +802,9 @@ otelCollector:
         - name: signoz.collector.id

     exporters:
-      # Export to SigNoz ClickHouse
+      # ClickHouse exporter for traces
       clickhousetraces:
-        datasource: tcp://clickhouse:9000/?database=signoz_traces
+        datasource: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_traces
         timeout: 10s
         retry_on_failure:
           enabled: true
@@ -364,8 +812,9 @@ otelCollector:
         max_interval: 30s
         max_elapsed_time: 300s

+      # ClickHouse exporter for metrics
       signozclickhousemetrics:
-        endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
+        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
         timeout: 10s
         retry_on_failure:
           enabled: true
@@ -375,32 +824,32 @@ otelCollector:

       # ClickHouse exporter for meter data (usage metrics)
       signozclickhousemeter:
-        dsn: "tcp://clickhouse:9000/?database=signoz_meter"
+        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_meter"
         timeout: 45s
         sending_queue:
           enabled: false

+      # ClickHouse exporter for logs
       clickhouselogsexporter:
-        dsn: tcp://clickhouse:9000/?database=signoz_logs
+        dsn: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_logs
         timeout: 10s
         retry_on_failure:
           enabled: true
           initial_interval: 5s
           max_interval: 30s
-          max_elapsed_time: 300s

       # Metadata exporter for service metadata
       metadataexporter:
-        dsn: "tcp://clickhouse:9000/?database=signoz_metadata"
+        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metadata"
         timeout: 10s
         cache:
           provider: in_memory

-      # Debug exporter for debugging (replaces deprecated logging exporter)
+      # Debug exporter for debugging (optional)
       debug:
         verbosity: detailed
-        sampling_initial: 2
-        sampling_thereafter: 500
+        sampling_initial: 5
+        sampling_thereafter: 200

     service:
       extensions: [health_check, zpages]
@@ -411,9 +860,16 @@ otelCollector:
         processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection, resource]
         exporters: [clickhousetraces, metadataexporter, signozmeter]

-      # Metrics pipeline
+      # Metrics pipeline - includes all infrastructure receivers
       metrics:
-        receivers: [otlp, prometheus]
+        receivers: [otlp,
+                    postgresql/auth, postgresql/inventory, postgresql/orders,
+                    postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
+                    postgresql/external, postgresql/forecasting, postgresql/notification,
+                    postgresql/orchestrator, postgresql/pos, postgresql/procurement,
+                    postgresql/production, postgresql/recipes, postgresql/sales,
+                    postgresql/suppliers, postgresql/tenant, postgresql/training,
+                    redis, rabbitmq, k8s_cluster, prometheus]
         processors: [memory_limiter, batch, resourcedetection, resource]
         exporters: [signozclickhousemetrics]

@@ -423,10 +879,10 @@ otelCollector:
         processors: [batch/meter]
         exporters: [signozclickhousemeter]

-      # Logs pipeline
+      # Logs pipeline - includes both OTLP and Kubernetes pod logs
       logs:
-        receivers: [otlp]
-        processors: [memory_limiter, batch, resourcedetection, resource]
+        receivers: [otlp, filelog]
+        processors: [memory_limiter, batch, resourcedetection, resource, k8sattributes]
         exporters: [clickhouselogsexporter]

   # HPA for OTEL Collector
@@ -455,6 +911,27 @@ serviceAccount:
   annotations: {}
   name: "signoz"

+# RBAC Configuration for Kubernetes monitoring
+# Required for k8s_cluster receiver to access Kubernetes API
+rbac:
+  create: true
+  rules:
+    - apiGroups: [""]
+      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["apps"]
+      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["batch"]
+      resources: ["jobs", "cronjobs"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["extensions"]
+      resources: ["deployments", "daemonsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["metrics.k8s.io"]
+      resources: ["nodes", "pods"]
+      verbs: ["get", "list", "watch"]
+
 # Security Context
 securityContext:
   runAsNonRoot: true
@@ -15,9 +15,13 @@ data:
   LOG_LEVEL: "INFO"

   # Observability Settings - SigNoz enabled
+  # Note: Detailed OTEL configuration is in the OBSERVABILITY section below
   ENABLE_TRACING: "true"
   ENABLE_METRICS: "true"
   ENABLE_LOGS: "true"
+  ENABLE_OTEL_METRICS: "true"
+  ENABLE_SYSTEM_METRICS: "true"
+  OTEL_LOGS_EXPORTER: "otlp"

   # Database initialization settings
   # IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -384,15 +388,44 @@ data:
   # ================================================================
   # OBSERVABILITY - SigNoz (Unified Monitoring)
   # ================================================================
-  # OpenTelemetry Configuration - Direct to SigNoz
-  # IMPORTANT: gRPC endpoints should NOT include http:// prefix
+  # OpenTelemetry Configuration - Direct to SigNoz OTel Collector
+  #
+  # ENDPOINT CONFIGURATION:
+  # - OTEL_EXPORTER_OTLP_ENDPOINT: Base gRPC endpoint (host:port format, NO http:// prefix)
+  #   Used by traces and metrics (gRPC) by default
+  #   Format: "host:4317" (gRPC port)
+  #
+  # PROTOCOL USAGE:
+  # - Traces: gRPC (port 4317) - High performance, low latency
+  # - Metrics: gRPC (port 4317) - Efficient batch export
+  # - Logs: HTTP (port 4318) - Required for OTLP log protocol
+  #
+  # The monitoring library automatically handles:
+  # - Converting gRPC endpoint (4317) to HTTP endpoint (4318) for logs
+  # - Adding proper paths (/v1/traces, /v1/metrics, /v1/logs)
+  # - Protocol prefixes (http:// for HTTP, none for gRPC)
+  #
+  # Base OTLP endpoint (gRPC format - used by traces and metrics)
   OTEL_EXPORTER_OTLP_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"

+  # Protocol configuration (gRPC is recommended for better performance)
   OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"

+  # Optional: Signal-specific endpoint overrides (if different from base)
+  # OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
+  # OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
+  # OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
+
+  # Optional: Protocol overrides per signal
+  # OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: "grpc"
+  # OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: "grpc"
+  # Note: Logs always use HTTP protocol regardless of this setting
+
+  # Resource attributes (added to all telemetry signals)
   OTEL_SERVICE_NAME: "bakery-ia"
   OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
-  OTEL_LOGS_EXPORTER: "otlp"

-  # SigNoz Endpoints (v0.106.0+ unified service)
+  # SigNoz service endpoints (for UI and API access)
   SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
   SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
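The comment block above describes behavior that lives in the shared monitoring library; the conversion it promises is mechanical. A minimal sketch of the idea (a hypothetical helper, not the library's actual function):

    def logs_http_endpoint(grpc_endpoint: str) -> str:
        """Derive the HTTP logs endpoint (port 4318, /v1/logs) from the gRPC base (4317)."""
        host, _, port = grpc_endpoint.rpartition(":")
        if port != "4317":
            raise ValueError("expected the OTLP gRPC port 4317")
        return f"http://{host}:4318/v1/logs"

    print(logs_http_endpoint("signoz-otel-collector.bakery-ia.svc.cluster.local:4317"))
    # http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/logs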
@@ -1,104 +1,170 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - Alert Management",
-    "description": "Alert monitoring and management dashboard",
-    "tags": ["alerts", "monitoring", "management"],
-    "panels": [
-      {
-        "title": "Active Alerts",
-        "type": "stat",
-        "query": {
-          "metric": "alerts_active",
-          "aggregate": "sum",
-          "filters": [
-            {
-              "key": "severity",
-              "operator": "=",
-              "value": "${severity}"
-            },
-            {
-              "key": "status",
-              "operator": "=",
-              "value": "firing"
-            }
-          ]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "Alert Rate",
-        "type": "timeseries",
-        "query": {
-          "metric": "alerts_total",
-          "aggregate": "rate",
-          "filters": [
-            {
-              "key": "severity",
-              "operator": "=",
-              "value": "${severity}"
-            }
-          ]
-        },
-        "unit": "alerts/s"
-      },
-      {
-        "title": "Alerts by Severity",
-        "type": "pie",
-        "query": {
-          "metric": "alerts_total",
-          "aggregate": "sum",
-          "groupBy": ["severity"],
-          "filters": [
-            {
-              "key": "severity",
-              "operator": "=",
-              "value": "${severity}"
-            }
-          ]
-        }
-      },
-      {
-        "title": "Alerts by Status",
-        "type": "pie",
-        "query": {
-          "metric": "alerts_total",
-          "aggregate": "sum",
-          "groupBy": ["status"],
-          "filters": [
-            {
-              "key": "status",
-              "operator": "=",
-              "value": "${status}"
-            }
-          ]
-        }
-      }
-    ],
-    "variables": [
-      {
-        "name": "severity",
-        "label": "Severity",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "critical", "high", "medium", "low"]
-      },
-      {
-        "name": "status",
-        "label": "Status",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "firing", "resolved", "acknowledged"]
-      }
-    ],
-    "layout": {
-      "type": "grid",
-      "columns": 12,
-      "gap": [16, 16]
-    },
-    "refresh": "15s",
-    "time": {
-      "from": "now-1h",
-      "to": "now"
-    }
-  }
-}
+{
+  "description": "Alert monitoring and management dashboard",
+  "tags": ["alerts", "monitoring", "management"],
+  "name": "bakery-ia-alert-management",
+  "title": "Bakery IA - Alert Management",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-alerts-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {
+      "x": 0,
+      "y": 0,
+      "w": 6,
+      "h": 3,
+      "i": "active-alerts",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 6,
+      "y": 0,
+      "w": 6,
+      "h": 3,
+      "i": "alert-rate",
+      "moved": false,
+      "static": false
+    }
+  ],
+  "variables": {
+    "service": {
+      "id": "service-var",
+      "name": "service",
+      "description": "Filter by service name",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'alerts_active' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": null
+    }
+  },
+  "widgets": [
+    {
+      "id": "active-alerts",
+      "title": "Active Alerts",
+      "description": "Number of currently active alerts",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "value",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "sum",
+              "aggregateAttribute": {
+                "key": "alerts_active",
+                "dataType": "int64",
+                "type": "Gauge",
+                "isColumn": false
+              },
+              "timeAggregation": "latest",
+              "spaceAggregation": "sum",
+              "functions": [],
+              "filters": {
+                "items": [
+                  {
+                    "key": {
+                      "key": "serviceName",
+                      "dataType": "string",
+                      "type": "tag",
+                      "isColumn": true
+                    },
+                    "op": "=",
+                    "value": "{{.service}}"
+                  }
+                ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [],
+              "legend": "Active Alerts",
+              "reduceTo": "sum"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "none"
+    },
+    {
+      "id": "alert-rate",
+      "title": "Alert Rate",
+      "description": "Rate of alerts over time",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "sum",
+              "aggregateAttribute": {
+                "key": "alerts_total",
+                "dataType": "int64",
+                "type": "Counter",
+                "isColumn": false
+              },
+              "timeAggregation": "rate",
+              "spaceAggregation": "sum",
+              "functions": [],
+              "filters": {
+                "items": [
+                  {
+                    "key": {
+                      "key": "serviceName",
+                      "dataType": "string",
+                      "type": "tag",
+                      "isColumn": true
+                    },
+                    "op": "=",
+                    "value": "{{.service}}"
+                  }
+                ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [
+                {
+                  "key": "serviceName",
+                  "dataType": "string",
+                  "type": "tag",
+                  "isColumn": true
+                }
+              ],
+              "legend": "{{serviceName}}",
+              "reduceTo": "sum"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "alerts/s"
+    }
+  ]
+}
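The dashboard variables above resolve through raw ClickHouse queries against SigNoz's metrics tables. If a dropdown comes up empty, running the same SQL directly is the fastest check; a minimal sketch with the clickhouse-driver package (host and credentials are assumptions taken from the exporter DSNs in this commit):

    from clickhouse_driver import Client

    client = Client(host="signoz-clickhouse", port=9000,
                    user="admin", password="27ff0399-0d3a-4bd8-919d-17c2181e6fb9")
    rows = client.execute(
        "SELECT DISTINCT(resource_attrs['service.name']) AS value "
        "FROM signoz_metrics.distributed_time_series_v4_1day "
        "WHERE metric_name = 'alerts_active' AND value != '' ORDER BY value")
    print([r[0] for r in rows])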
@@ -1,102 +1,351 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - API Performance",
-    "description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
-    "tags": ["api", "performance", "rest", "graphql"],
-    "panels": [
-      {
-        "title": "Request Volume",
-        "type": "timeseries",
-        "query": {
-          "metric": "http_server_requests_seconds_count",
-          "aggregate": "sum",
-          "groupBy": ["api"],
-          "filters": [
-            {
-              "key": "api",
-              "operator": "=",
-              "value": "${api}"
-            }
-          ]
-        },
-        "unit": "req/s"
-      },
-      {
-        "title": "Error Rate",
-        "type": "timeseries",
-        "query": {
-          "metric": "http_server_requests_seconds_count",
-          "aggregate": "sum",
-          "groupBy": ["api", "status"],
-          "filters": [
-            {
-              "key": "api",
-              "operator": "=",
-              "value": "${api}"
-            },
-            {
-              "key": "status",
-              "operator": "=~",
-              "value": "5.."
-            }
-          ]
-        },
-        "unit": "req/s"
-      },
-      {
-        "title": "Average Response Time",
-        "type": "timeseries",
-        "query": {
-          "metric": "http_server_requests_seconds_sum",
-          "aggregate": "avg",
-          "groupBy": ["api", "endpoint"],
-          "filters": [
-            {
-              "key": "api",
-              "operator": "=",
-              "value": "${api}"
-            }
-          ]
-        },
-        "unit": "seconds"
-      },
-      {
-        "title": "P95 Latency",
-        "type": "timeseries",
-        "query": {
-          "metric": "http_server_requests_seconds_bucket",
-          "aggregate": "histogram_quantile",
-          "quantile": 0.95,
-          "groupBy": ["api", "endpoint"],
-          "filters": [
-            {
-              "key": "api",
-              "operator": "=",
-              "value": "${api}"
-            }
-          ]
-        },
-        "unit": "seconds"
-      }
-    ],
-    "variables": [
-      {
-        "name": "api",
-        "label": "API Service",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "gateway-api", "auth-api", "inventory-api", "production-api", "forecasting-api", "procurement-api"]
-      }
-    ],
-    "layout": {
-      "type": "grid",
-      "columns": 12,
-      "gap": [16, 16]
-    },
-    "refresh": "15s",
-    "time": {
-      "from": "now-1h",
-      "to": "now"
-    }
-  }
-}
+{
+  "description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
+  "tags": ["api", "performance", "rest", "graphql"],
+  "name": "bakery-ia-api-performance",
+  "title": "Bakery IA - API Performance",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-api-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {
+      "x": 0,
+      "y": 0,
+      "w": 6,
+      "h": 3,
+      "i": "request-volume",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 6,
+      "y": 0,
+      "w": 6,
+      "h": 3,
+      "i": "error-rate",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 0,
+      "y": 3,
+      "w": 6,
+      "h": 3,
+      "i": "avg-response-time",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 6,
+      "y": 3,
+      "w": 6,
+      "h": 3,
+      "i": "p95-latency",
+      "moved": false,
+      "static": false
+    }
+  ],
+  "variables": {
+    "service": {
+      "id": "service-var",
+      "name": "service",
+      "description": "Filter by API service",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'http_server_requests_seconds_count' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": null
+    }
+  },
+  "widgets": [
+    {
+      "id": "request-volume",
+      "title": "Request Volume",
+      "description": "API request volume by service",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "sum",
+              "aggregateAttribute": {
+                "key": "http_server_requests_seconds_count",
+                "dataType": "int64",
+                "type": "Counter",
+                "isColumn": false
+              },
+              "timeAggregation": "rate",
+              "spaceAggregation": "sum",
+              "functions": [],
+              "filters": {
+                "items": [
+                  {
+                    "key": {
+                      "key": "service.name",
+                      "dataType": "string",
+                      "type": "resource",
+                      "isColumn": false
+                    },
+                    "op": "=",
+                    "value": "{{.service}}"
+                  }
+                ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [
+                {
+                  "key": "api.name",
+                  "dataType": "string",
+                  "type": "resource",
+                  "isColumn": false
+                }
+              ],
+              "legend": "{{api.name}}",
+              "reduceTo": "sum"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "req/s"
+    },
+    {
+      "id": "error-rate",
+      "title": "Error Rate",
+      "description": "API error rate by service",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "sum",
+              "aggregateAttribute": {
+                "key": "http_server_requests_seconds_count",
+                "dataType": "int64",
+                "type": "Counter",
+                "isColumn": false
+              },
+              "timeAggregation": "rate",
+              "spaceAggregation": "sum",
+              "functions": [],
+              "filters": {
+                "items": [
+                  {
+                    "key": {
+                      "key": "api.name",
+                      "dataType": "string",
+                      "type": "resource",
+                      "isColumn": false
+                    },
+                    "op": "=",
+                    "value": "{{.api}}"
+                  },
+                  {
+                    "key": {
+                      "key": "status_code",
+                      "dataType": "string",
+                      "type": "tag",
+                      "isColumn": false
+                    },
+                    "op": "=~",
+                    "value": "5.."
+                  }
+                ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [
+                {
+                  "key": "api.name",
+                  "dataType": "string",
+                  "type": "resource",
+                  "isColumn": false
+                },
+                {
+                  "key": "status_code",
+                  "dataType": "string",
+                  "type": "tag",
+                  "isColumn": false
+                }
+              ],
+              "legend": "{{api.name}} - {{status_code}}",
+              "reduceTo": "sum"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "req/s"
+    },
+    {
+      "id": "avg-response-time",
+      "title": "Average Response Time",
+      "description": "Average API response time by endpoint",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "avg",
+              "aggregateAttribute": {
+                "key": "http_server_requests_seconds_sum",
+                "dataType": "float64",
+                "type": "Counter",
+                "isColumn": false
+              },
+              "timeAggregation": "avg",
+              "spaceAggregation": "avg",
+              "functions": [],
+              "filters": {
+                "items": [
+                  {
+                    "key": {
+                      "key": "api.name",
+                      "dataType": "string",
+                      "type": "resource",
+                      "isColumn": false
+                    },
+                    "op": "=",
+                    "value": "{{.api}}"
+                  }
+                ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [
+                {
+                  "key": "api.name",
+                  "dataType": "string",
+                  "type": "resource",
+                  "isColumn": false
+                },
+                {
+                  "key": "endpoint",
+                  "dataType": "string",
+                  "type": "tag",
+                  "isColumn": false
+                }
+              ],
+              "legend": "{{api.name}} - {{endpoint}}",
+              "reduceTo": "avg"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "seconds"
+    },
+    {
+      "id": "p95-latency",
+      "title": "P95 Latency",
+      "description": "95th percentile latency by endpoint",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "histogram_quantile",
+              "aggregateAttribute": {
+                "key": "http_server_requests_seconds_bucket",
+                "dataType": "float64",
+                "type": "Histogram",
+                "isColumn": false
+              },
+              "timeAggregation": "avg",
+              "spaceAggregation": "avg",
+              "functions": [],
+              "filters": {
+                "items": [
+                  {
+                    "key": {
+                      "key": "api.name",
+                      "dataType": "string",
+                      "type": "resource",
+                      "isColumn": false
+                    },
+                    "op": "=",
+                    "value": "{{.api}}"
+                  }
+                ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [
+                {
+                  "key": "api.name",
+                  "dataType": "string",
+                  "type": "resource",
+                  "isColumn": false
+                },
+                {
+                  "key": "endpoint",
+                  "dataType": "string",
+                  "type": "tag",
+                  "isColumn": false
+                }
+              ],
+              "legend": "{{api.name}} - {{endpoint}}",
+              "reduceTo": "avg"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "seconds"
+    }
+  ]
+}
@@ -1,101 +1,333 @@
|
|||||||
{
|
{
|
||||||
"dashboard": {
|
"description": "Application performance monitoring dashboard using distributed traces and metrics",
|
||||||
"title": "Bakery IA - Application Performance",
|
"tags": ["application", "performance", "traces", "apm"],
|
||||||
"description": "Application performance monitoring dashboard for Bakery IA microservices",
|
"name": "bakery-ia-application-performance",
|
||||||
"tags": ["application", "performance", "apm"],
|
"title": "Bakery IA - Application Performance (APM)",
|
||||||
"panels": [
|
"uploadedGrafana": false,
|
||||||
|
"uuid": "bakery-ia-apm-01",
|
||||||
|
"version": "v4",
|
||||||
|
"collapsableRowsMigrated": true,
|
||||||
|
"layout": [
|
||||||
{
|
{
|
||||||
|
"x": 0,
|
||||||
|
"y": 0,
|
||||||
|
"w": 6,
|
||||||
|
"h": 3,
|
||||||
|
"i": "latency-p99",
|
||||||
|
"moved": false,
|
||||||
|
"static": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"x": 6,
|
||||||
|
"y": 0,
|
||||||
|
"w": 6,
|
||||||
|
"h": 3,
|
||||||
|
"i": "request-rate",
|
||||||
|
"moved": false,
|
||||||
|
"static": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"x": 0,
|
||||||
|
"y": 3,
|
||||||
|
"w": 6,
|
||||||
|
"h": 3,
|
||||||
|
"i": "error-rate",
|
||||||
|
"moved": false,
|
||||||
|
"static": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"x": 6,
|
||||||
|
"y": 3,
|
||||||
|
"w": 6,
|
||||||
|
"h": 3,
|
||||||
|
"i": "avg-duration",
|
||||||
|
"moved": false,
|
||||||
|
"static": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"variables": {
|
||||||
|
"service_name": {
|
||||||
|
"id": "service-var",
|
||||||
|
"name": "service_name",
|
||||||
|
"description": "Filter by service name",
|
||||||
|
"type": "QUERY",
|
||||||
|
"queryValue": "SELECT DISTINCT(serviceName) FROM signoz_traces.distributed_signoz_index_v2 ORDER BY serviceName",
|
||||||
|
"customValue": "",
|
||||||
|
"textboxValue": "",
|
||||||
|
"showALLOption": true,
|
||||||
|
"multiSelect": false,
|
||||||
|
"order": 1,
|
||||||
|
"modificationUUID": "",
|
||||||
|
"sort": "ASC",
|
||||||
|
"selectedValue": null
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"widgets": [
|
||||||
|
{
|
||||||
|
"id": "latency-p99",
|
||||||
|
"title": "P99 Latency",
|
||||||
|
"description": "99th percentile latency for selected service",
|
||||||
|
"isStacked": false,
|
||||||
|
"nullZeroValues": "zero",
|
||||||
|
"opacity": "1",
|
||||||
|
"panelTypes": "graph",
|
||||||
|
"query": {
|
||||||
|
"builder": {
|
||||||
|
"queryData": [
|
||||||
|
{
|
||||||
|
"dataSource": "traces",
|
||||||
|
"queryName": "A",
|
||||||
|
"aggregateOperator": "p99",
|
||||||
|
"aggregateAttribute": {
|
||||||
|
"key": "duration_ns",
|
||||||
|
"dataType": "float64",
|
||||||
|
"type": "",
|
||||||
|
"isColumn": true
|
||||||
|
},
|
||||||
|
"timeAggregation": "avg",
|
||||||
|
"spaceAggregation": "p99",
|
||||||
|
"functions": [],
|
||||||
|
"filters": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
|
},
|
||||||
|
"op": "=",
|
||||||
|
"value": "{{.service_name}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"op": "AND"
|
||||||
|
},
|
||||||
|
"expression": "A",
|
||||||
|
"disabled": false,
|
||||||
|
"having": [],
|
||||||
|
"stepInterval": 60,
|
||||||
|
"limit": null,
|
||||||
|
"orderBy": [],
|
||||||
|
"groupBy": [
|
||||||
|
{
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"legend": "{{serviceName}}",
|
||||||
|
"reduceTo": "avg"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"queryFormulas": []
|
||||||
|
},
|
||||||
|
"queryType": "builder"
|
||||||
|
},
|
||||||
|
"fillSpans": false,
|
||||||
|
"yAxisUnit": "ms"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "request-rate",
|
||||||
"title": "Request Rate",
|
"title": "Request Rate",
|
||||||
"type": "timeseries",
|
"description": "Requests per second for the service",
|
||||||
|
"isStacked": false,
|
||||||
|
"nullZeroValues": "zero",
|
||||||
|
"opacity": "1",
|
||||||
|
"panelTypes": "graph",
|
||||||
"query": {
|
"query": {
|
||||||
"metric": "http_server_requests_seconds_count",
|
"builder": {
|
||||||
"aggregate": "sum",
|
"queryData": [
|
||||||
"groupBy": ["service"],
|
|
||||||
"filters": [
|
|
||||||
{
|
{
|
||||||
"key": "service",
|
"dataSource": "traces",
|
||||||
"operator": "=",
|
"queryName": "A",
|
||||||
"value": "${service}"
|
"aggregateOperator": "count",
|
||||||
|
"aggregateAttribute": {
|
||||||
|
"key": "",
|
||||||
|
"dataType": "",
|
||||||
|
"type": "",
|
||||||
|
"isColumn": false
|
||||||
|
},
|
||||||
|
"timeAggregation": "rate",
|
||||||
|
"spaceAggregation": "sum",
|
||||||
|
"functions": [],
|
||||||
|
"filters": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
|
},
|
||||||
|
"op": "=",
|
||||||
|
"value": "{{.service_name}}"
|
||||||
}
|
}
|
||||||
]
|
],
|
||||||
|
"op": "AND"
|
||||||
},
|
},
|
||||||
"unit": "req/s"
|
"expression": "A",
|
||||||
|
"disabled": false,
|
||||||
|
"having": [],
|
||||||
|
"stepInterval": 60,
|
||||||
|
"limit": null,
|
||||||
|
"orderBy": [],
|
||||||
|
"groupBy": [
|
||||||
|
{
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"legend": "{{serviceName}}",
|
||||||
|
"reduceTo": "sum"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"queryFormulas": []
|
||||||
|
},
|
||||||
|
"queryType": "builder"
|
||||||
|
},
|
||||||
|
"fillSpans": false,
|
||||||
|
"yAxisUnit": "reqps"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"id": "error-rate",
|
||||||
"title": "Error Rate",
|
"title": "Error Rate",
|
||||||
"type": "timeseries",
|
"description": "Error rate percentage for the service",
|
||||||
|
"isStacked": false,
|
||||||
|
"nullZeroValues": "zero",
|
||||||
|
"opacity": "1",
|
||||||
|
"panelTypes": "graph",
|
||||||
"query": {
|
"query": {
|
||||||
"metric": "http_server_requests_seconds_count",
|
"builder": {
|
||||||
"aggregate": "sum",
|
"queryData": [
|
||||||
"groupBy": ["service", "status"],
|
|
||||||
"filters": [
|
|
||||||
{
|
{
|
||||||
"key": "service",
|
"dataSource": "traces",
|
||||||
"operator": "=",
|
"queryName": "A",
|
||||||
"value": "${service}"
|
"aggregateOperator": "count",
|
||||||
|
"aggregateAttribute": {
|
||||||
|
"key": "",
|
||||||
|
"dataType": "",
|
||||||
|
"type": "",
|
||||||
|
"isColumn": false
|
||||||
|
},
|
||||||
|
"timeAggregation": "rate",
|
||||||
|
"spaceAggregation": "sum",
|
||||||
|
"functions": [],
|
||||||
|
"filters": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
|
},
|
||||||
|
"op": "=",
|
||||||
|
"value": "{{.service_name}}"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"key": "status",
|
"key": {
|
||||||
"operator": "=~",
|
"key": "status_code",
|
||||||
"value": "5.."
|
"dataType": "string",
|
||||||
}
|
"type": "tag",
|
||||||
]
|
"isColumn": true
|
||||||
},
|
},
|
||||||
"unit": "req/s"
|
"op": "=",
|
||||||
},
|
"value": "STATUS_CODE_ERROR"
|
||||||
{
|
|
||||||
"title": "Average Response Time",
|
|
||||||
"type": "timeseries",
|
|
||||||
"query": {
|
|
||||||
"metric": "http_server_requests_seconds_sum",
|
|
||||||
"aggregate": "avg",
|
|
||||||
"groupBy": ["service"],
|
|
||||||
"filters": [
|
|
||||||
{
|
|
||||||
"key": "service",
|
|
||||||
"operator": "=",
|
|
||||||
"value": "${service}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "seconds"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"title": "Throughput",
|
|
||||||
"type": "timeseries",
|
|
||||||
"query": {
|
|
||||||
"metric": "http_server_requests_seconds_count",
|
|
||||||
"aggregate": "rate",
|
|
||||||
"groupBy": ["service"],
|
|
||||||
"filters": [
|
|
||||||
{
|
|
||||||
"key": "service",
|
|
||||||
"operator": "=",
|
|
||||||
"value": "${service}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "req/s"
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"variables": [
|
"op": "AND"
|
||||||
|
},
|
||||||
|
"expression": "A",
|
||||||
|
"disabled": false,
|
||||||
|
"having": [],
|
||||||
|
"stepInterval": 60,
|
||||||
|
"limit": null,
|
||||||
|
"orderBy": [],
|
||||||
|
"groupBy": [
|
||||||
{
|
{
|
||||||
"name": "service",
|
"key": "serviceName",
|
||||||
"label": "Service",
|
"dataType": "string",
|
||||||
"type": "dropdown",
|
"type": "tag",
|
||||||
"default": "*",
|
"isColumn": true
|
||||||
"values": ["*", "auth-service", "gateway-service", "forecasting-service", "inventory-service", "production-service", "procurement-service"]
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"layout": {
|
"legend": "{{serviceName}}",
|
||||||
"type": "grid",
|
"reduceTo": "sum"
|
||||||
"columns": 12,
|
}
|
||||||
"gap": [16, 16]
|
],
|
||||||
|
"queryFormulas": []
|
||||||
},
|
},
|
||||||
"refresh": "15s",
|
"queryType": "builder"
|
||||||
"time": {
|
},
|
||||||
"from": "now-30m",
|
"fillSpans": false,
|
||||||
"to": "now"
|
"yAxisUnit": "reqps"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "avg-duration",
|
||||||
|
"title": "Average Duration",
|
||||||
|
"description": "Average request duration",
|
||||||
|
"isStacked": false,
|
||||||
|
"nullZeroValues": "zero",
|
||||||
|
"opacity": "1",
|
||||||
|
"panelTypes": "graph",
|
||||||
|
"query": {
|
||||||
|
"builder": {
|
||||||
|
"queryData": [
|
||||||
|
{
|
||||||
|
"dataSource": "traces",
|
||||||
|
"queryName": "A",
|
||||||
|
"aggregateOperator": "avg",
|
||||||
|
"aggregateAttribute": {
|
||||||
|
"key": "duration_ns",
|
||||||
|
"dataType": "float64",
|
||||||
|
"type": "",
|
||||||
|
"isColumn": true
|
||||||
|
},
|
||||||
|
"timeAggregation": "avg",
|
||||||
|
"spaceAggregation": "avg",
|
||||||
|
"functions": [],
|
||||||
|
"filters": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
|
},
|
||||||
|
"op": "=",
|
||||||
|
"value": "{{.service_name}}"
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"op": "AND"
|
||||||
|
},
|
||||||
|
"expression": "A",
|
||||||
|
"disabled": false,
|
||||||
|
"having": [],
|
||||||
|
"stepInterval": 60,
|
||||||
|
"limit": null,
|
||||||
|
"orderBy": [],
|
||||||
|
"groupBy": [
|
||||||
|
{
|
||||||
|
"key": "serviceName",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "tag",
|
||||||
|
"isColumn": true
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"legend": "{{serviceName}}",
|
||||||
|
"reduceTo": "avg"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"queryFormulas": []
|
||||||
|
},
|
||||||
|
"queryType": "builder"
|
||||||
|
},
|
||||||
|
"fillSpans": false,
|
||||||
|
"yAxisUnit": "ms"
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
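The services dashboard above (and the four below) is plain SigNoz dashboard JSON, so it can be provisioned programmatically instead of imported by hand. A minimal sketch in Python, reusing the httpx client the gateway already depends on; the /api/v1/dashboards path and the SIGNOZ-API-KEY header are assumptions about the SigNoz deployment, not something this commit defines:

import json
import httpx

def upload_dashboard(path: str, base_url: str, api_key: str) -> dict:
    # json.load fails fast on malformed JSON, catching hand-editing
    # mistakes before anything reaches SigNoz.
    with open(path) as f:
        dashboard = json.load(f)
    resp = httpx.post(
        f"{base_url}/api/v1/dashboards",      # assumed endpoint
        json=dashboard,
        headers={"SIGNOZ-API-KEY": api_key},  # assumed auth header
        timeout=10.0,
    )
    resp.raise_for_status()
    return resp.json()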
@@ -1,101 +1,425 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - Database Performance",
-    "description": "Comprehensive database performance monitoring for PostgreSQL and Redis",
-    "tags": ["database", "postgresql", "redis", "performance"],
-    "panels": [
-      {
-        "title": "Database Connections",
-        "type": "timeseries",
-        "query": {
-          "metric": "pg_stat_activity_count",
-          "aggregate": "sum",
-          "groupBy": ["datname"],
-          "filters": [{"key": "datname", "operator": "=", "value": "${database}"}]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "Active Queries",
-        "type": "timeseries",
-        "query": {
-          "metric": "pg_stat_activity_count",
-          "aggregate": "sum",
-          "groupBy": ["datname"],
-          "filters": [
-            {"key": "datname", "operator": "=", "value": "${database}"},
-            {"key": "state", "operator": "=", "value": "active"}
-          ]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "Database Size",
-        "type": "timeseries",
-        "query": {
-          "metric": "pg_database_size_bytes",
-          "aggregate": "sum",
-          "groupBy": ["datname"],
-          "filters": [{"key": "datname", "operator": "=", "value": "${database}"}]
-        },
-        "unit": "bytes"
-      },
-      {
-        "title": "Query Execution Time",
-        "type": "timeseries",
-        "query": {
-          "metric": "pg_stat_statements_total_time",
-          "aggregate": "avg",
-          "groupBy": ["datname"],
-          "filters": [{"key": "datname", "operator": "=", "value": "${database}"}]
-        },
-        "unit": "seconds"
-      }
-    ],
-    "variables": [
-      {
-        "name": "database",
-        "label": "Database",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "postgresql", "redis"]
-      }
-    ],
-    "layout": {"type": "grid", "columns": 12, "gap": [16, 16]},
-    "refresh": "30s",
-    "time": {"from": "now-1h", "to": "now"}
-  }
-}
+{
+  "description": "Comprehensive database performance monitoring for PostgreSQL, Redis, and RabbitMQ",
+  "tags": ["database", "postgresql", "redis", "rabbitmq", "performance"],
+  "name": "bakery-ia-database-performance",
+  "title": "Bakery IA - Database Performance",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-db-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "pg-connections", "moved": false, "static": false},
+    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "pg-db-size", "moved": false, "static": false},
+    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "redis-connected-clients", "moved": false, "static": false},
+    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "redis-memory", "moved": false, "static": false},
+    {"x": 0, "y": 6, "w": 6, "h": 3, "i": "rabbitmq-messages", "moved": false, "static": false},
+    {"x": 6, "y": 6, "w": 6, "h": 3, "i": "rabbitmq-consumers", "moved": false, "static": false}
+  ],
+  "variables": {
+    "database": {
+      "id": "database-var",
+      "name": "database",
+      "description": "Filter by PostgreSQL database name",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['postgresql.database.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": null
+    }
+  },
+  "widgets": [
+    {
+      "id": "pg-connections",
+      "title": "PostgreSQL - Active Connections",
+      "description": "Number of active PostgreSQL connections",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "postgresql.backends", "dataType": "float64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.database}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{postgresql.database.name}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "pg-db-size",
+      "title": "PostgreSQL - Database Size",
+      "description": "Size of PostgreSQL databases in bytes",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "postgresql.db_size", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.database}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{postgresql.database.name}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "bytes"
+    },
+    {
+      "id": "redis-connected-clients",
+      "title": "Redis - Connected Clients",
+      "description": "Number of clients connected to Redis",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "redis.clients.connected", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "host.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{host.name}}", "reduceTo": "avg"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "redis-memory",
+      "title": "Redis - Memory Usage",
+      "description": "Redis memory usage in bytes",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "redis.memory.used", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "host.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{host.name}}", "reduceTo": "avg"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "bytes"
+    },
+    {
+      "id": "rabbitmq-messages",
+      "title": "RabbitMQ - Current Messages",
+      "description": "Number of messages currently in RabbitMQ queues",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "rabbitmq.message.current", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "queue", "dataType": "string", "type": "tag", "isColumn": false}],
+        "legend": "Queue: {{queue}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "rabbitmq-consumers",
+      "title": "RabbitMQ - Consumer Count",
+      "description": "Number of consumers per queue",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "rabbitmq.consumer.count", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "queue", "dataType": "string", "type": "tag", "isColumn": false}],
+        "legend": "Queue: {{queue}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    }
+  ]
+}
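The widget filters and variable queries above use Go-template style placeholders such as {{.database}}; SigNoz resolves them against the dashboard variables at query time. The sketch below only illustrates that substitution convention, for local testing of a dashboard file; it is not SigNoz's implementation:

import re

def render_placeholders(node, variables):
    # Recursively substitute {{.name}} placeholders in a dashboard fragment.
    if isinstance(node, dict):
        return {k: render_placeholders(v, variables) for k, v in node.items()}
    if isinstance(node, list):
        return [render_placeholders(v, variables) for v in node]
    if isinstance(node, str):
        return re.sub(
            r"\{\{\.(\w+)\}\}",
            lambda m: str(variables.get(m.group(1), m.group(0))),
            node,
        )
    return node

# render_placeholders({"value": "{{.database}}"}, {"database": "bakery_ia"})
# returns {"value": "bakery_ia"}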
@@ -1,105 +1,348 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - Error Tracking",
-    "description": "Comprehensive error tracking and analysis dashboard",
-    "tags": ["errors", "exceptions", "tracking"],
-    "panels": [
-      {
-        "title": "Total Errors",
-        "type": "stat",
-        "query": {
-          "metric": "error_total",
-          "aggregate": "sum",
-          "filters": [{"key": "service", "operator": "=", "value": "${service}"}]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "Error Rate",
-        "type": "timeseries",
-        "query": {
-          "metric": "error_total",
-          "aggregate": "rate",
-          "groupBy": ["service"],
-          "filters": [{"key": "service", "operator": "=", "value": "${service}"}]
-        },
-        "unit": "errors/s"
-      },
-      {
-        "title": "HTTP 5xx Errors",
-        "type": "timeseries",
-        "query": {
-          "metric": "http_server_requests_seconds_count",
-          "aggregate": "sum",
-          "groupBy": ["service", "status"],
-          "filters": [
-            {"key": "service", "operator": "=", "value": "${service}"},
-            {"key": "status", "operator": "=~", "value": "5.."}
-          ]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "HTTP 4xx Errors",
-        "type": "timeseries",
-        "query": {
-          "metric": "http_server_requests_seconds_count",
-          "aggregate": "sum",
-          "groupBy": ["service", "status"],
-          "filters": [
-            {"key": "service", "operator": "=", "value": "${service}"},
-            {"key": "status", "operator": "=~", "value": "4.."}
-          ]
-        },
-        "unit": "number"
-      }
-    ],
-    "variables": [
-      {
-        "name": "service",
-        "label": "Service",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
-      }
-    ],
-    "layout": {"type": "grid", "columns": 12, "gap": [16, 16]},
-    "refresh": "15s",
-    "time": {"from": "now-1h", "to": "now"}
-  }
-}
+{
+  "description": "Comprehensive error tracking and analysis dashboard",
+  "tags": ["errors", "exceptions", "tracking"],
+  "name": "bakery-ia-error-tracking",
+  "title": "Bakery IA - Error Tracking",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-errors-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "total-errors", "moved": false, "static": false},
+    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
+    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "http-5xx", "moved": false, "static": false},
+    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "http-4xx", "moved": false, "static": false}
+  ],
+  "variables": {
+    "service": {
+      "id": "service-var",
+      "name": "service",
+      "description": "Filter by service name",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'error_total' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": null
+    }
+  },
+  "widgets": [
+    {
+      "id": "total-errors",
+      "title": "Total Errors",
+      "description": "Total number of errors across all services",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "value",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "error_total", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "sum", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [],
+        "legend": "Total Errors", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "error-rate",
+      "title": "Error Rate",
+      "description": "Error rate over time",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "error_total", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "rate", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}],
+        "legend": "{{serviceName}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "errors/s"
+    },
+    {
+      "id": "http-5xx",
+      "title": "HTTP 5xx Errors",
+      "description": "Server errors (5xx status codes)",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "sum", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}, {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "5.."}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}],
+        "legend": "{{serviceName}} - {{status_code}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "number"
+    },
+    {
+      "id": "http-4xx",
+      "title": "HTTP 4xx Errors",
+      "description": "Client errors (4xx status codes)",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "sum", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}, {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "4.."}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}],
+        "legend": "{{serviceName}} - {{status_code}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "number"
+    }
+  ]
+}
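The error-tracking widgets above query an error_total counter filtered on the service.name resource attribute. A minimal sketch of emitting that counter with the OpenTelemetry metrics API the services already use; the MeterProvider/exporter wiring is omitted, and emitting service.name as a plain attribute (rather than a resource attribute) is a simplification:

from opentelemetry import metrics

meter = metrics.get_meter("gateway")
error_counter = meter.create_counter(
    "error_total",
    description="Total errors across all services",
)

def record_error(service: str, status_code: int) -> None:
    # Attribute keys mirror the dashboard's filter and groupBy keys.
    error_counter.add(1, {"service.name": service, "status_code": str(status_code)})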
@@ -1,105 +1,423 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - Infrastructure Monitoring",
-    "description": "Comprehensive infrastructure monitoring dashboard for Bakery IA system",
-    "tags": ["infrastructure", "system", "kubernetes"],
-    "panels": [
-      {
-        "title": "CPU Usage",
-        "type": "timeseries",
-        "query": {
-          "metric": "container_cpu_usage_seconds_total",
-          "aggregate": "sum",
-          "groupBy": ["namespace"],
-          "filters": [{"key": "namespace", "operator": "=", "value": "bakery-ia"}]
-        },
-        "unit": "percent",
-        "yAxis": {"min": 0, "max": 100}
-      },
-      {
-        "title": "Memory Usage",
-        "type": "timeseries",
-        "query": {
-          "metric": "container_memory_working_set_bytes",
-          "aggregate": "sum",
-          "groupBy": ["namespace"],
-          "filters": [{"key": "namespace", "operator": "=", "value": "bakery-ia"}]
-        },
-        "unit": "bytes"
-      },
-      {
-        "title": "Network Traffic",
-        "type": "timeseries",
-        "query": {
-          "metric": "container_network_receive_bytes_total",
-          "aggregate": "sum",
-          "groupBy": ["namespace"],
-          "filters": [{"key": "namespace", "operator": "=", "value": "bakery-ia"}]
-        },
-        "unit": "bytes"
-      },
-      {
-        "title": "Pod Status",
-        "type": "stat",
-        "query": {
-          "metric": "kube_pod_status_phase",
-          "aggregate": "count",
-          "groupBy": ["phase"],
-          "filters": [
-            {"key": "namespace", "operator": "=", "value": "bakery-ia"},
-            {"key": "phase", "operator": "=", "value": "Running"}
-          ]
-        },
-        "unit": "number"
-      }
-    ],
-    "variables": [
-      {
-        "name": "namespace",
-        "label": "Namespace",
-        "type": "dropdown",
-        "default": "bakery-ia",
-        "values": ["bakery-ia", "default", "kube-system"]
-      }
-    ],
-    "layout": {"type": "grid", "columns": 12, "gap": [16, 16]},
-    "refresh": "30s",
-    "time": {"from": "now-1h", "to": "now"}
-  }
-}
+{
+  "description": "Comprehensive infrastructure monitoring dashboard for Bakery IA Kubernetes cluster",
+  "tags": ["infrastructure", "kubernetes", "k8s", "system"],
+  "name": "bakery-ia-infrastructure-monitoring",
+  "title": "Bakery IA - Infrastructure Monitoring",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-infra-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "pod-count", "moved": false, "static": false},
+    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "pod-phase", "moved": false, "static": false},
+    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "container-restarts", "moved": false, "static": false},
+    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "node-condition", "moved": false, "static": false},
+    {"x": 0, "y": 6, "w": 12, "h": 3, "i": "deployment-status", "moved": false, "static": false}
+  ],
+  "variables": {
+    "namespace": {
+      "id": "namespace-var",
+      "name": "namespace",
+      "description": "Filter by Kubernetes namespace",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'k8s.pod.phase' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": "bakery-ia"
+    }
+  },
+  "widgets": [
+    {
+      "id": "pod-count",
+      "title": "Total Pods",
+      "description": "Total number of pods in the namespace",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "value",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "count",
+        "aggregateAttribute": {"key": "k8s.pod.phase", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [],
+        "legend": "Total Pods", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "pod-phase",
+      "title": "Pod Phase Distribution",
+      "description": "Pods by phase (Running, Pending, Failed, etc.)",
+      "isStacked": true, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "k8s.pod.phase", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "phase", "dataType": "string", "type": "tag", "isColumn": false}],
+        "legend": "{{phase}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "container-restarts",
+      "title": "Container Restarts",
+      "description": "Container restart count over time",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "k8s.container.restarts", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "increase", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "k8s.pod.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{k8s.pod.name}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "node-condition",
+      "title": "Node Conditions",
+      "description": "Node condition status (Ready, MemoryPressure, DiskPressure, etc.)",
+      "isStacked": true, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "k8s.node.condition_ready", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "k8s.node.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{k8s.node.name}} Ready", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "deployment-status",
+      "title": "Deployment Status (Desired vs Available)",
+      "description": "Deployment replicas: desired vs available",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "k8s.deployment.desired", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "k8s.deployment.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{k8s.deployment.name}} (desired)", "reduceTo": "avg"
+      }, {
+        "dataSource": "metrics", "queryName": "B", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "k8s.deployment.available", "dataType": "int64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "B", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "k8s.deployment.name", "dataType": "string", "type": "resource", "isColumn": false}],
+        "legend": "{{k8s.deployment.name}} (available)", "reduceTo": "avg"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    }
+  ]
+}
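The pod widgets above read k8s.pod.phase, k8s.container.restarts, and the k8s.deployment.* gauges, which typically come from the OpenTelemetry k8s_cluster receiver. For a quick manual cross-check of the phase distribution the dashboard should show, the official kubernetes Python client can compute the same breakdown; a local sketch, not part of this commit:

from collections import Counter
from kubernetes import client, config

config.load_kube_config()  # use config.load_incluster_config() inside a pod
pods = client.CoreV1Api().list_namespaced_pod("bakery-ia")
print(Counter(pod.status.phase for pod in pods.items))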
@@ -1,99 +1,333 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - Log Analysis",
-    "description": "Comprehensive log analysis and search dashboard",
-    "tags": ["logs", "analysis", "search"],
-    "panels": [
-      {
-        "title": "Log Volume",
-        "type": "timeseries",
-        "query": {
-          "metric": "log_lines_total",
-          "aggregate": "sum",
-          "groupBy": ["service"],
-          "filters": [{"key": "service", "operator": "=", "value": "${service}"}]
-        },
-        "unit": "logs/s"
-      },
-      {
-        "title": "Error Logs",
-        "type": "timeseries",
-        "query": {
-          "metric": "log_lines_total",
-          "aggregate": "sum",
-          "groupBy": ["service"],
-          "filters": [
-            {"key": "service", "operator": "=", "value": "${service}"},
-            {"key": "level", "operator": "=", "value": "error"}
-          ]
-        },
-        "unit": "logs/s"
-      },
-      {
-        "title": "Logs by Level",
-        "type": "pie",
-        "query": {
-          "metric": "log_lines_total",
-          "aggregate": "sum",
-          "groupBy": ["level"],
-          "filters": [{"key": "service", "operator": "=", "value": "${service}"}]
-        }
-      },
-      {
-        "title": "Logs by Service",
-        "type": "pie",
-        "query": {
-          "metric": "log_lines_total",
-          "aggregate": "sum",
-          "groupBy": ["service"],
-          "filters": [{"key": "service", "operator": "=", "value": "${service}"}]
-        }
-      }
-    ],
-    "variables": [
-      {
-        "name": "service",
-        "label": "Service",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
-      }
-    ],
-    "layout": {"type": "grid", "columns": 12, "gap": [16, 16]},
-    "refresh": "30s",
-    "time": {"from": "now-1h", "to": "now"}
-  }
-}
+{
+  "description": "Comprehensive log analysis and search dashboard",
+  "tags": ["logs", "analysis", "search"],
+  "name": "bakery-ia-log-analysis",
+  "title": "Bakery IA - Log Analysis",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-logs-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "log-volume", "moved": false, "static": false},
+    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "error-logs", "moved": false, "static": false},
+    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "logs-by-level", "moved": false, "static": false},
+    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "logs-by-service", "moved": false, "static": false}
+  ],
+  "variables": {
+    "service": {
+      "id": "service-var",
+      "name": "service",
+      "description": "Filter by service name",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'log_lines_total' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": null
+    }
+  },
+  "widgets": [
+    {
+      "id": "log-volume",
+      "title": "Log Volume",
+      "description": "Total log volume by service",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "log_lines_total", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "rate", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}],
+        "legend": "{{serviceName}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "logs/s"
+    },
+    {
+      "id": "error-logs",
+      "title": "Error Logs",
+      "description": "Error log volume by service",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "log_lines_total", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "rate", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}, {"key": {"key": "level", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=", "value": "error"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}],
+        "legend": "{{serviceName}} (errors)", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "logs/s"
+    },
+    {
+      "id": "logs-by-level",
+      "title": "Logs by Level",
+      "description": "Distribution of logs by severity level",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "pie",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "log_lines_total", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "sum", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "level", "dataType": "string", "type": "tag", "isColumn": false}],
+        "legend": "{{level}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "logs-by-service",
+      "title": "Logs by Service",
+      "description": "Distribution of logs by service",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "pie",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "sum",
+        "aggregateAttribute": {"key": "log_lines_total", "dataType": "int64", "type": "Counter", "isColumn": false},
+        "timeAggregation": "sum", "spaceAggregation": "sum", "functions": [],
+        "filters": {"items": [{"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [{"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}],
+        "legend": "{{serviceName}}", "reduceTo": "sum"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    }
+  ]
+}
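The log panels above assume a log_lines_total counter labeled with serviceName and level; this commit does not show where that metric is emitted. One plausible source, sketched under that assumption, is a structlog processor that counts every log call:

from opentelemetry import metrics

log_lines_counter = metrics.get_meter("gateway").create_counter(
    "log_lines_total",
    description="Log lines emitted, by service and level",
)

def count_log_lines(logger, method_name, event_dict):
    # structlog processor signature; method_name is the level ("info", "error", ...).
    log_lines_counter.add(1, {"serviceName": "gateway", "level": method_name})
    return event_dict

# Registered alongside the other processors, e.g.:
# structlog.configure(processors=[count_log_lines, structlog.processors.JSONRenderer()])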
@@ -1,92 +1,295 @@
-{
-  "dashboard": {
-    "title": "Bakery IA - System Health",
-    "description": "Comprehensive system health monitoring dashboard",
-    "tags": ["system", "health", "monitoring"],
-    "panels": [
-      {
-        "title": "System Availability",
-        "type": "stat",
-        "query": {
-          "metric": "system_availability",
-          "aggregate": "avg",
-          "filters": [{"key": "namespace", "operator": "=", "value": "${namespace}"}]
-        },
-        "unit": "percent"
-      },
-      {
-        "title": "Service Health Score",
-        "type": "stat",
-        "query": {
-          "metric": "service_health_score",
-          "aggregate": "avg",
-          "filters": [{"key": "namespace", "operator": "=", "value": "${namespace}"}]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "CPU Usage",
-        "type": "timeseries",
-        "query": {
-          "metric": "system_cpu_usage",
-          "aggregate": "avg",
-          "filters": [{"key": "namespace", "operator": "=", "value": "${namespace}"}]
-        },
-        "unit": "percent"
-      },
-      {
-        "title": "Memory Usage",
-        "type": "timeseries",
-        "query": {
-          "metric": "system_memory_usage",
-          "aggregate": "avg",
-          "filters": [{"key": "namespace", "operator": "=", "value": "${namespace}"}]
-        },
-        "unit": "percent"
-      }
-    ],
-    "variables": [
-      {
-        "name": "namespace",
-        "label": "Namespace",
-        "type": "dropdown",
-        "default": "bakery-ia",
-        "values": ["bakery-ia", "default"]
-      }
-    ],
-    "layout": {"type": "grid", "columns": 12, "gap": [16, 16]},
-    "refresh": "30s",
-    "time": {"from": "now-1h", "to": "now"}
-  }
-}
+{
+  "description": "Comprehensive system health monitoring dashboard",
+  "tags": ["system", "health", "monitoring"],
+  "name": "bakery-ia-system-health",
+  "title": "Bakery IA - System Health",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-health-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "system-availability", "moved": false, "static": false},
+    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "health-score", "moved": false, "static": false},
+    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "cpu-usage", "moved": false, "static": false},
+    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "memory-usage", "moved": false, "static": false}
+  ],
+  "variables": {
+    "namespace": {
+      "id": "namespace-var",
+      "name": "namespace",
+      "description": "Filter by Kubernetes namespace",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'system_availability' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": "bakery-ia"
+    }
+  },
+  "widgets": [
+    {
+      "id": "system-availability",
+      "title": "System Availability",
+      "description": "Overall system availability percentage",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "value",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "system_availability", "dataType": "float64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [],
+        "legend": "System Availability", "reduceTo": "avg"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "percent"
+    },
+    {
+      "id": "health-score",
+      "title": "Service Health Score",
+      "description": "Overall service health score",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "value",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "service_health_score", "dataType": "float64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "latest", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [],
+        "legend": "Health Score", "reduceTo": "avg"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "none"
+    },
+    {
+      "id": "cpu-usage",
+      "title": "CPU Usage",
+      "description": "System CPU usage over time",
+      "isStacked": false, "nullZeroValues": "zero", "opacity": "1", "panelTypes": "graph",
+      "query": {"builder": {"queryData": [{
+        "dataSource": "metrics", "queryName": "A", "aggregateOperator": "avg",
+        "aggregateAttribute": {"key": "system_cpu_usage", "dataType": "float64", "type": "Gauge", "isColumn": false},
+        "timeAggregation": "avg", "spaceAggregation": "avg", "functions": [],
+        "filters": {"items": [{"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}], "op": "AND"},
+        "expression": "A", "disabled": false, "having": [], "stepInterval": 60, "limit": null, "orderBy": [],
+        "groupBy": [],
+        "legend": "CPU Usage", "reduceTo": "avg"
+      }], "queryFormulas": []}, "queryType": "builder"},
+      "fillSpans": false, "yAxisUnit": "percent"
+    },
+    {
+      "id": "memory-usage",
+      "title": "Memory Usage",
+      "description": "System memory usage over time",
||||||
|
"isStacked": false,
|
||||||
|
"nullZeroValues": "zero",
|
||||||
|
"opacity": "1",
|
||||||
|
"panelTypes": "graph",
|
||||||
|
"query": {
|
||||||
|
"builder": {
|
||||||
|
"queryData": [
|
||||||
|
{
|
||||||
|
"dataSource": "metrics",
|
||||||
|
"queryName": "A",
|
||||||
|
"aggregateOperator": "avg",
|
||||||
|
"aggregateAttribute": {
|
||||||
|
"key": "system_memory_usage",
|
||||||
|
"dataType": "float64",
|
||||||
|
"type": "Gauge",
|
||||||
|
"isColumn": false
|
||||||
|
},
|
||||||
|
"timeAggregation": "avg",
|
||||||
|
"spaceAggregation": "avg",
|
||||||
|
"functions": [],
|
||||||
|
"filters": {
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"key": {
|
||||||
|
"key": "k8s.namespace.name",
|
||||||
|
"dataType": "string",
|
||||||
|
"type": "resource",
|
||||||
|
"isColumn": false
|
||||||
|
},
|
||||||
|
"op": "=",
|
||||||
|
"value": "{{.namespace}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"op": "AND"
|
||||||
|
},
|
||||||
|
"expression": "A",
|
||||||
|
"disabled": false,
|
||||||
|
"having": [],
|
||||||
|
"stepInterval": 60,
|
||||||
|
"limit": null,
|
||||||
|
"orderBy": [],
|
||||||
|
"groupBy": [],
|
||||||
|
"legend": "Memory Usage",
|
||||||
|
"reduceTo": "avg"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"queryFormulas": []
|
||||||
|
},
|
||||||
|
"queryType": "builder"
|
||||||
|
},
|
||||||
|
"fillSpans": false,
|
||||||
|
"yAxisUnit": "percent"
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
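Note on the migrated dashboard format: widgets reference dashboard variables through Go-template style placeholders ("{{.namespace}}") inside their filter items, while the variable itself is populated by the ClickHouse query in "queryValue". A minimal sketch (not part of the commit; the file name is hypothetical) for checking that every placeholder a widget uses is actually declared under "variables":

    import json
    import re

    # Hypothetical path; point this at the exported dashboard JSON.
    with open("system-health-dashboard.json") as f:
        dashboard = json.load(f)

    declared = set(dashboard.get("variables", {}))
    placeholder = re.compile(r"\{\{\.(\w+)\}\}")

    for widget in dashboard.get("widgets", []):
        # Scan the serialized widget for {{.name}} references.
        used = set(placeholder.findall(json.dumps(widget)))
        missing = used - declared
        if missing:
            print(f"widget {widget['id']}: undefined variables {sorted(missing)}")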
@@ -1,96 +1,323 @@
 {
-  "dashboard": {
-    "title": "Bakery IA - User Activity",
   "description": "User activity and behavior monitoring dashboard",
   "tags": ["user", "activity", "behavior"],
-    "panels": [
-      {
-        "title": "Active Users",
-        "type": "timeseries",
-        "query": {
-          "metric": "active_users",
-          "aggregate": "sum",
-          "groupBy": ["service"],
-          "filters": [
-            { "key": "service", "operator": "=", "value": "${service}" }
-          ]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "User Sessions",
-        "type": "timeseries",
-        "query": {
-          "metric": "user_sessions_total",
-          "aggregate": "sum",
-          "groupBy": ["service"],
-          "filters": [
-            { "key": "service", "operator": "=", "value": "${service}" }
-          ]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "API Calls per User",
-        "type": "timeseries",
-        "query": {
-          "metric": "api_calls_per_user",
-          "aggregate": "avg",
-          "groupBy": ["service"],
-          "filters": [
-            { "key": "service", "operator": "=", "value": "${service}" }
-          ]
-        },
-        "unit": "number"
-      },
-      {
-        "title": "Session Duration",
-        "type": "timeseries",
-        "query": {
-          "metric": "session_duration_seconds",
-          "aggregate": "avg",
-          "groupBy": ["service"],
-          "filters": [
-            { "key": "service", "operator": "=", "value": "${service}" }
-          ]
-        },
-        "unit": "seconds"
-      }
-    ],
-    "variables": [
-      {
-        "name": "service",
-        "label": "Service",
-        "type": "dropdown",
-        "default": "*",
-        "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service"]
-      }
-    ],
-    "layout": {
-      "type": "grid",
-      "columns": 12,
-      "gap": [16, 16]
-    },
-    "refresh": "30s",
-    "time": {
-      "from": "now-1h",
-      "to": "now"
-    }
-  }
+  "name": "bakery-ia-user-activity",
+  "title": "Bakery IA - User Activity",
+  "uploadedGrafana": false,
+  "uuid": "bakery-ia-user-01",
+  "version": "v4",
+  "collapsableRowsMigrated": true,
+  "layout": [
+    {
+      "x": 0,
+      "y": 0,
+      "w": 6,
+      "h": 3,
+      "i": "active-users",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 6,
+      "y": 0,
+      "w": 6,
+      "h": 3,
+      "i": "user-sessions",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 0,
+      "y": 3,
+      "w": 6,
+      "h": 3,
+      "i": "api-calls-per-user",
+      "moved": false,
+      "static": false
+    },
+    {
+      "x": 6,
+      "y": 3,
+      "w": 6,
+      "h": 3,
+      "i": "session-duration",
+      "moved": false,
+      "static": false
+    }
+  ],
+  "variables": {
+    "service": {
+      "id": "service-var",
+      "name": "service",
+      "description": "Filter by service name",
+      "type": "QUERY",
+      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'active_users' AND value != '' ORDER BY value",
+      "customValue": "",
+      "textboxValue": "",
+      "showALLOption": true,
+      "multiSelect": false,
+      "order": 1,
+      "modificationUUID": "",
+      "sort": "ASC",
+      "selectedValue": null
+    }
+  },
+  "widgets": [
+    {
+      "id": "active-users",
+      "title": "Active Users",
+      "description": "Number of active users by service",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "sum",
+              "aggregateAttribute": { "key": "active_users", "dataType": "int64", "type": "Gauge", "isColumn": false },
+              "timeAggregation": "latest",
+              "spaceAggregation": "sum",
+              "functions": [],
+              "filters": {
+                "items": [ { "key": { "key": "service.name", "dataType": "string", "type": "resource", "isColumn": false }, "op": "=", "value": "{{.service}}" } ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [ { "key": "service.name", "dataType": "string", "type": "resource", "isColumn": false } ],
+              "legend": "{{service.name}}",
+              "reduceTo": "sum"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "none"
+    },
+    {
+      "id": "user-sessions",
+      "title": "User Sessions",
+      "description": "Total user sessions by service",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "sum",
+              "aggregateAttribute": { "key": "user_sessions_total", "dataType": "int64", "type": "Counter", "isColumn": false },
+              "timeAggregation": "sum",
+              "spaceAggregation": "sum",
+              "functions": [],
+              "filters": {
+                "items": [ { "key": { "key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true }, "op": "=", "value": "{{.service}}" } ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [ { "key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true } ],
+              "legend": "{{serviceName}}",
+              "reduceTo": "sum"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "none"
+    },
+    {
+      "id": "api-calls-per-user",
+      "title": "API Calls per User",
+      "description": "Average API calls per user by service",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "avg",
+              "aggregateAttribute": { "key": "api_calls_per_user", "dataType": "float64", "type": "Gauge", "isColumn": false },
+              "timeAggregation": "avg",
+              "spaceAggregation": "avg",
+              "functions": [],
+              "filters": {
+                "items": [ { "key": { "key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true }, "op": "=", "value": "{{.service}}" } ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [ { "key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true } ],
+              "legend": "{{serviceName}}",
+              "reduceTo": "avg"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "none"
+    },
+    {
+      "id": "session-duration",
+      "title": "Session Duration",
+      "description": "Average session duration by service",
+      "isStacked": false,
+      "nullZeroValues": "zero",
+      "opacity": "1",
+      "panelTypes": "graph",
+      "query": {
+        "builder": {
+          "queryData": [
+            {
+              "dataSource": "metrics",
+              "queryName": "A",
+              "aggregateOperator": "avg",
+              "aggregateAttribute": { "key": "session_duration_seconds", "dataType": "float64", "type": "Gauge", "isColumn": false },
+              "timeAggregation": "avg",
+              "spaceAggregation": "avg",
+              "functions": [],
+              "filters": {
+                "items": [ { "key": { "key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true }, "op": "=", "value": "{{.service}}" } ],
+                "op": "AND"
+              },
+              "expression": "A",
+              "disabled": false,
+              "having": [],
+              "stepInterval": 60,
+              "limit": null,
+              "orderBy": [],
+              "groupBy": [ { "key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true } ],
+              "legend": "{{serviceName}}",
+              "reduceTo": "avg"
+            }
+          ],
+          "queryFormulas": []
+        },
+        "queryType": "builder"
+      },
+      "fillSpans": false,
+      "yAxisUnit": "seconds"
+    }
+  ]
 }
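The "{{.service}}" placeholders above are resolved by SigNoz at query time against the selected variable value. Purely as an illustration of that substitution (this helper is not part of the commit or of SigNoz), a preview of a widget with a concrete selection can be rendered like so:

    import json

    def render_variables(widget: dict, variables: dict) -> dict:
        # Illustrative only: substitute {{.name}} placeholders with selected values.
        text = json.dumps(widget)
        for name, value in variables.items():
            text = text.replace("{{." + name + "}}", str(value))
        return json.loads(text)

    widget = {"filters": {"items": [{"op": "=", "value": "{{.service}}"}]}}
    print(render_variables(widget, {"service": "auth-service"}))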
@@ -1,160 +1,61 @@
 """Main FastAPI application for AI Insights Service."""
 
-from fastapi import FastAPI, Response
-from fastapi.middleware.cors import CORSMiddleware
-from contextlib import asynccontextmanager
 import structlog
-import os
 
 from app.core.config import settings
 from app.core.database import init_db, close_db
 from app.api import insights
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
-
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "ai-insights"):
-    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
-    resource = Resource.create({"service.name": service_name})
-
-    otlp_exporter = OTLPSpanExporter(
-        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
-        insecure=True
-    )
-
-    provider = TracerProvider(resource=resource)
-    processor = BatchSpanProcessor(otlp_exporter)
-    provider.add_span_processor(processor)
-    trace.set_tracer_provider(provider)
-
-    return provider
-
-# Initialize tracing
-tracer_provider = setup_tracing("ai-insights")
-
-# Setup logging
-setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
+from shared.service_base import StandardFastAPIService
+
+# Initialize logger
 logger = structlog.get_logger()
 
-# Setup OpenTelemetry logging export if enabled
-logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}")
-if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
-    try:
-        logger.info("Attempting to setup OpenTelemetry logging")
-        from shared.monitoring.logs_exporter import setup_otel_logging
-        result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION)
-        if result:
-            logger.info("OpenTelemetry logs export enabled for ai-insights")
-        else:
-            logger.warning("OpenTelemetry logs export setup returned None")
-    except Exception as e:
-        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
-else:
-    logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
 
+class AIInsightsService(StandardFastAPIService):
+    """AI Insights Service with standardized monitoring setup"""
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """Lifespan event handler for startup and shutdown."""
-    # Startup
-    logger.info("Starting AI Insights Service", service=settings.SERVICE_NAME, version=settings.SERVICE_VERSION)
-    await init_db()
-    logger.info("Database initialized")
-
-    # Initialize system metrics collection
-    system_metrics = SystemMetricsCollector("ai-insights")
-    logger.info("System metrics collection started")
-
-    # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
-    logger.info("Metrics export configured via OpenTelemetry OTLP")
-
-    yield
-
-    # Shutdown
-    logger.info("Shutting down AI Insights Service")
-    await close_db()
-    logger.info("Database connections closed")
+    async def on_startup(self, app):
+        """Custom startup logic for AI Insights"""
+        # Initialize database
+        await init_db()
+        logger.info("Database initialized")
+
+        await super().on_startup(app)
+
+    async def on_shutdown(self, app):
+        """Custom shutdown logic for AI Insights"""
+        await super().on_shutdown(app)
+
+        # Close database
+        await close_db()
+        logger.info("Database connections closed")
 
-# Create FastAPI app
-app = FastAPI(
-    title="AI Insights Service",
-    description="Intelligent insights and recommendations for bakery operations",
-    version=settings.SERVICE_VERSION,
-    lifespan=lifespan
-)
+# Create service instance
+service = AIInsightsService(
+    service_name="ai-insights",
+    app_name="AI Insights Service",
+    description="Intelligent insights and recommendations for bakery operations",
+    version=settings.SERVICE_VERSION,
+    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
+    cors_origins=getattr(settings, 'ALLOWED_ORIGINS', ["*"]),
+    api_prefix=settings.API_V1_PREFIX,
+    enable_metrics=True,
+    enable_health_checks=True,
+    enable_tracing=True,
+    enable_cors=True
+)
 
-# Instrument FastAPI with OpenTelemetry
-FastAPIInstrumentor.instrument_app(app)
-
-# Instrument httpx for outgoing requests
-HTTPXClientInstrumentor().instrument()
-
-# Instrument Redis
-RedisInstrumentor().instrument()
-
-# Instrument SQLAlchemy
-SQLAlchemyInstrumentor().instrument()
-
-# Initialize metrics collector
-metrics_collector = MetricsCollector("ai-insights")
-
-# Add metrics middleware to track HTTP requests
-add_metrics_middleware(app, metrics_collector)
-
-# CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=settings.ALLOWED_ORIGINS,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# Include routers
-app.include_router(
+# Create FastAPI app
+app = service.create_app()
+
+# Add service-specific routers
+service.add_router(
     insights.router,
-    prefix=settings.API_V1_PREFIX,
     tags=["insights"]
 )
 
-@app.get("/")
-async def root():
-    """Root endpoint."""
-    return {
-        "service": settings.SERVICE_NAME,
-        "version": settings.SERVICE_VERSION,
-        "status": "running"
-    }
-
-@app.get("/health")
-async def health_check():
-    """Health check endpoint."""
-    return {
-        "status": "healthy",
-        "service": settings.SERVICE_NAME,
-        "version": settings.SERVICE_VERSION
-    }
-
-# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
-# The /metrics endpoint is not needed as metrics are pushed automatically
-
 if __name__ == "__main__":
     import uvicorn
 
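shared/service_base.py itself is not part of this diff, so the contract of StandardFastAPIService can only be inferred from the call sites above. A minimal sketch of what the base class presumably provides, with every name and parameter taken from how the services use it (the real implementation, which also wires up tracing, metrics, log export, and health checks, may differ):

    from contextlib import asynccontextmanager

    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware


    class StandardFastAPIService:
        """Hypothetical sketch inferred from call sites; not the actual shared/service_base.py."""

        def __init__(self, service_name, app_name, description, version,
                     log_level="INFO", cors_origins=None, api_prefix="",
                     enable_metrics=True, enable_health_checks=True,
                     enable_tracing=True, enable_cors=True):
            self.service_name = service_name
            self.app_name = app_name
            self.description = description
            self.version = version
            self.cors_origins = cors_origins or ["*"]
            self.api_prefix = api_prefix
            self.enable_cors = enable_cors
            self.app = None

        async def on_startup(self, app):
            # Base hook: subclasses run their own setup, then call super().on_startup(app).
            # The real class presumably starts telemetry and health checks here.
            pass

        async def on_shutdown(self, app):
            # Base hook for teardown; subclasses call super().on_shutdown(app) first.
            pass

        def create_app(self, debug=False):
            @asynccontextmanager
            async def lifespan(app):
                await self.on_startup(app)
                yield
                await self.on_shutdown(app)

            self.app = FastAPI(title=self.app_name, description=self.description,
                               version=self.version, debug=debug, lifespan=lifespan)
            if self.enable_cors:
                self.app.add_middleware(
                    CORSMiddleware, allow_origins=self.cors_origins,
                    allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
            return self.app

        def add_router(self, router, **kwargs):
            # Apply the configured api_prefix unless the caller overrides it.
            kwargs.setdefault("prefix", self.api_prefix)
            self.app.include_router(router, **kwargs)

This centralizes the per-service boilerplate (lifespan wiring, CORS, prefixing) that the old main.py files repeated by hand.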
@@ -4,90 +4,28 @@ Alert Processor Service v2.0
 Main FastAPI application with RabbitMQ consumer lifecycle management.
 """
 
-from fastapi import FastAPI, Response
-from fastapi.middleware.cors import CORSMiddleware
-from contextlib import asynccontextmanager
 import structlog
-import os
 
 from app.core.config import settings
 from app.consumer.event_consumer import EventConsumer
 from app.api import alerts, sse
 from shared.redis_utils import initialize_redis, close_redis
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
-
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "alert-processor"):
-    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
-    resource = Resource.create({"service.name": service_name})
-
-    otlp_exporter = OTLPSpanExporter(
-        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
-        insecure=True
-    )
-
-    provider = TracerProvider(resource=resource)
-    processor = BatchSpanProcessor(otlp_exporter)
-    provider.add_span_processor(processor)
-    trace.set_tracer_provider(provider)
-
-    return provider
-
-# Initialize tracing
-tracer_provider = setup_tracing("alert-processor")
-
-# Setup logging
-setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
-
-# Setup OpenTelemetry logging export if enabled
-if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
-    try:
-        from shared.monitoring.logs_exporter import setup_otel_logging
-        result = setup_otel_logging("alert-processor", settings.VERSION)
-        if result:
-            logger = structlog.get_logger()
-            logger.info("OpenTelemetry logs export enabled for alert-processor")
-        else:
-            logger = structlog.get_logger()
-            logger.warning("OpenTelemetry logs export setup returned None")
-    except Exception as e:
-        logger = structlog.get_logger()
-        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
-else:
-    logger = structlog.get_logger()
-    logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
+from shared.service_base import StandardFastAPIService
+
+# Initialize logger
+logger = structlog.get_logger()
 
 # Global consumer instance
 consumer: EventConsumer = None
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """
-    Application lifecycle manager.
-
-    Startup: Initialize Redis and RabbitMQ consumer
-    Shutdown: Close consumer and Redis connections
-    """
-    global consumer
-
-    logger.info("alert_processor_starting", version=settings.VERSION)
-
-    # Startup: Initialize Redis and start consumer
-    try:
+class AlertProcessorService(StandardFastAPIService):
+    """Alert Processor Service with standardized monitoring setup and RabbitMQ consumer"""
+
+    async def on_startup(self, app):
+        """Custom startup logic for Alert Processor"""
+        global consumer
+
         # Initialize Redis connection
         await initialize_redis(
             settings.REDIS_URL,
@@ -96,69 +34,48 @@ async def lifespan(app: FastAPI):
         )
         logger.info("redis_initialized")
 
+        # Start RabbitMQ consumer
         consumer = EventConsumer()
         await consumer.start()
-        logger.info("alert_processor_started")
-
-        # Initialize system metrics collection
-        system_metrics = SystemMetricsCollector("alert-processor")
-        logger.info("System metrics collection started")
-
-        # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
-        logger.info("Metrics export configured via OpenTelemetry OTLP")
-    except Exception as e:
-        logger.error("alert_processor_startup_failed", error=str(e))
-        raise
-
-    yield
-
-    # Shutdown: Stop consumer and close Redis
-    try:
-        if consumer:
-            await consumer.stop()
-        await close_redis()
-        logger.info("alert_processor_shutdown")
-    except Exception as e:
-        logger.error("alert_processor_shutdown_failed", error=str(e))
+        logger.info("rabbitmq_consumer_started")
+
+        await super().on_startup(app)
+
+    async def on_shutdown(self, app):
+        """Custom shutdown logic for Alert Processor"""
+        global consumer
+
+        await super().on_shutdown(app)
+
+        # Stop RabbitMQ consumer
+        if consumer:
+            await consumer.stop()
+            logger.info("rabbitmq_consumer_stopped")
+
+        # Close Redis
+        await close_redis()
+        logger.info("redis_closed")
 
-# Create FastAPI app
-app = FastAPI(
-    title="Alert Processor Service",
-    description="Event processing, enrichment, and alert management system",
-    version=settings.VERSION,
-    lifespan=lifespan,
-    debug=settings.DEBUG
-)
+# Create service instance
+service = AlertProcessorService(
+    service_name="alert-processor",
+    app_name="Alert Processor Service",
+    description="Event processing, enrichment, and alert management system",
+    version=settings.VERSION,
+    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
+    cors_origins=["*"],  # Configure appropriately for production
+    api_prefix="/api/v1",
+    enable_metrics=True,
+    enable_health_checks=True,
+    enable_tracing=True,
+    enable_cors=True
+)
 
-# Instrument FastAPI with OpenTelemetry
-FastAPIInstrumentor.instrument_app(app)
-
-# Instrument httpx for outgoing requests
-HTTPXClientInstrumentor().instrument()
-
-# Instrument Redis
-RedisInstrumentor().instrument()
-
-# Instrument SQLAlchemy
-SQLAlchemyInstrumentor().instrument()
-
-# Initialize metrics collector
-metrics_collector = MetricsCollector("alert-processor")
-
-# Add metrics middleware to track HTTP requests
-add_metrics_middleware(app, metrics_collector)
-
-# CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # Configure appropriately for production
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# Include routers
+# Create FastAPI app
+app = service.create_app(debug=settings.DEBUG)
+
+# Add service-specific routers
 app.include_router(
     alerts.router,
     prefix="/api/v1/tenants/{tenant_id}",
@@ -172,34 +89,6 @@ app.include_router(
 )
 
-@app.get("/health")
-async def health_check():
-    """
-    Health check endpoint.
-
-    Returns service status and version.
-    """
-    return {
-        "status": "healthy",
-        "service": settings.SERVICE_NAME,
-        "version": settings.VERSION
-    }
-
-@app.get("/")
-async def root():
-    """Root endpoint with service info"""
-    return {
-        "service": settings.SERVICE_NAME,
-        "version": settings.VERSION,
-        "description": "Event processing, enrichment, and alert management system"
-    }
-
-# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
-# The /metrics endpoint is not needed as metrics are pushed automatically
-
 if __name__ == "__main__":
     import uvicorn
 
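The only contract the lifecycle hooks above impose on EventConsumer is an async start()/stop() pair. A hypothetical stand-in honoring that contract (useful, for example, when exercising the startup and shutdown hooks in tests without a RabbitMQ broker; the real consumer in app.consumer.event_consumer is not shown in this diff):

    import asyncio


    class FakeEventConsumer:
        """Hypothetical test double for EventConsumer; matches only start()/stop()."""

        def __init__(self):
            self._task = None
            self._stopping = asyncio.Event()

        async def start(self):
            # The real consumer would open a RabbitMQ connection and begin consuming.
            self._task = asyncio.create_task(self._run())

        async def _run(self):
            await self._stopping.wait()

        async def stop(self):
            self._stopping.set()
            if self._task:
                await self._task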
@@ -3,192 +3,74 @@ Demo Session Service - Main Application
 Manages isolated demo sessions with ephemeral data
 """
 
-from fastapi import FastAPI, Request, Response
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
 import structlog
-from contextlib import asynccontextmanager
-import os
 
 from app.core import settings, DatabaseManager
 from app.api import demo_sessions, demo_accounts, demo_operations, internal
 from shared.redis_utils import initialize_redis, close_redis
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
-
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "demo-session"):
-    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
-    resource = Resource.create({"service.name": service_name})
-
-    otlp_exporter = OTLPSpanExporter(
-        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
-        insecure=True
-    )
-
-    provider = TracerProvider(resource=resource)
-    processor = BatchSpanProcessor(otlp_exporter)
-    provider.add_span_processor(processor)
-    trace.set_tracer_provider(provider)
-
-    return provider
-
-# Initialize tracing
-tracer_provider = setup_tracing("demo-session")
-
-# Setup logging
-setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
-
-# Setup OpenTelemetry logging export if enabled
-if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
-    try:
-        from shared.monitoring.logs_exporter import setup_otel_logging
-        result = setup_otel_logging("demo-session", settings.VERSION)
-        if result:
-            logger = structlog.get_logger()
-            logger.info("OpenTelemetry logs export enabled for demo-session")
-        else:
-            logger = structlog.get_logger()
-            logger.warning("OpenTelemetry logs export setup returned None")
-    except Exception as e:
-        logger = structlog.get_logger()
-        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
-else:
-    logger = structlog.get_logger()
-    logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
+from shared.service_base import StandardFastAPIService
+
+# Initialize logger
+logger = structlog.get_logger()
 
-# Initialize database
+# Initialize database manager
 db_manager = DatabaseManager()
 
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """Application lifespan handler"""
-    logger.info("Starting Demo Session Service", version=settings.VERSION)
-
-    # Initialize database
-    db_manager.initialize()
-
-    # Initialize Redis using shared implementation
-    await initialize_redis(
-        redis_url=settings.REDIS_URL,
-        db=0,
-        max_connections=50
-    )
-
-    # Initialize system metrics collection
-    system_metrics = SystemMetricsCollector("demo-session")
-    logger.info("System metrics collection started")
-
-    # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
-    logger.info("Metrics export configured via OpenTelemetry OTLP")
-
-    logger.info("Demo Session Service started successfully")
-
-    yield
-
-    # Cleanup on shutdown
-    await db_manager.close()
-    await close_redis()
-    logger.info("Demo Session Service stopped")
+class DemoSessionService(StandardFastAPIService):
+    """Demo Session Service with standardized monitoring setup"""
+
+    async def on_startup(self, app):
+        """Custom startup logic for Demo Session"""
+        # Initialize database
+        db_manager.initialize()
+        logger.info("Database initialized")
+
+        # Initialize Redis
+        await initialize_redis(
+            redis_url=settings.REDIS_URL,
+            db=0,
+            max_connections=50
+        )
+        logger.info("Redis initialized")
+
+        await super().on_startup(app)
+
+    async def on_shutdown(self, app):
+        """Custom shutdown logic for Demo Session"""
+        await super().on_shutdown(app)
+
+        # Cleanup
+        await db_manager.close()
+        await close_redis()
+        logger.info("Database and Redis connections closed")
 
-app = FastAPI(
-    title="Demo Session Service",
-    description="Manages isolated demo sessions for prospect users",
-    version=settings.VERSION,
-    lifespan=lifespan
-)
+# Create service instance
+service = DemoSessionService(
+    service_name="demo-session",
+    app_name="Demo Session Service",
+    description="Manages isolated demo sessions for prospect users",
+    version=settings.VERSION,
+    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
+    cors_origins=["*"],  # Configure appropriately for production
+    api_prefix="/api/v1",
+    enable_metrics=True,
+    enable_health_checks=True,
+    enable_tracing=True,
+    enable_cors=True
+)
 
-# Instrument FastAPI with OpenTelemetry
-FastAPIInstrumentor.instrument_app(app)
-
-# Instrument httpx for outgoing requests
-HTTPXClientInstrumentor().instrument()
-
-# Instrument Redis
-RedisInstrumentor().instrument()
-
-# Initialize metrics collector
-metrics_collector = MetricsCollector("demo-session")
-
-# Add metrics middleware to track HTTP requests
-add_metrics_middleware(app, metrics_collector)
-
-# CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-@app.exception_handler(Exception)
-async def global_exception_handler(request: Request, exc: Exception):
-    """Global exception handler"""
-    logger.error(
-        "Unhandled exception",
-        path=request.url.path,
-        method=request.method,
-        error=str(exc)
-    )
-    return JSONResponse(
-        status_code=500,
-        content={"detail": "Internal server error"}
-    )
-
-# Include routers
+# Create FastAPI app
+app = service.create_app(debug=settings.DEBUG)
+
+# Add service-specific routers
 app.include_router(demo_sessions.router)
 app.include_router(demo_accounts.router)
 app.include_router(demo_operations.router)
 app.include_router(internal.router)
 
-@app.get("/")
-async def root():
-    """Root endpoint"""
-    return {
-        "service": "demo-session",
-        "version": settings.VERSION,
-        "status": "running"
-    }
-
-@app.get("/health")
-async def health():
-    """Health check endpoint"""
-    from shared.redis_utils import get_redis_manager
-
-    redis_manager = await get_redis_manager()
-    redis_ok = await redis_manager.health_check()
-
-    return {
-        "status": "healthy" if redis_ok else "degraded",
-        "service": "demo-session",
-        "version": settings.VERSION,
-        "redis": "connected" if redis_ok else "disconnected"
-    }
-
-# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
-# The /metrics endpoint is not needed as metrics are pushed automatically
-
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(
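The hand-written /health endpoint removed above reported "degraded" when Redis was unreachable. If the standardized health checks ever need that Redis-aware behavior back, the same result can be reproduced with only the shared helpers already used in this file (a sketch; get_redis_manager() and health_check() are the calls visible in the removed code):

    from shared.redis_utils import get_redis_manager


    async def redis_health() -> dict:
        # Mirrors the removed endpoint: degrade rather than fail when Redis is down.
        redis_manager = await get_redis_manager()
        redis_ok = await redis_manager.health_check()
        return {
            "status": "healthy" if redis_ok else "degraded",
            "redis": "connected" if redis_ok else "disconnected",
        }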
@@ -1,14 +1,34 @@
 """
 Shared monitoring package for microservices
+
+Provides unified OpenTelemetry-based observability:
+- Traces: Distributed tracing
+- Metrics: System and application metrics
+- Logs: Structured logging
+
+All signals exported to SigNoz via OTLP.
 """
 
+# Core setup - START HERE
 from .logging import setup_logging
-from .metrics import setup_metrics_early, get_metrics_collector, MetricsCollector
-from .health_checks import (
-    HealthCheckManager,
-    FastAPIHealthChecker,
-    create_health_manager,
-    setup_fastapi_health_checks
+from .telemetry import (
+    setup_telemetry,
+    setup_telemetry_simple,
+    get_telemetry_status,
+    TelemetryProviders
+)
+
+# Configuration
+from .otel_config import OTelConfig, OTelEndpoints
+
+# Individual signal setup (used by telemetry.py)
+from .tracing import (
+    setup_tracing,
+    get_current_trace_id,
+    get_current_span_id,
+    add_trace_attributes,
+    add_trace_event,
+    record_exception
 )
 from .logs_exporter import (
     setup_otel_logging,
@@ -27,23 +47,51 @@ from .system_metrics import (
     setup_all_metrics
 )
 
+# Health checks
+from .health_checks import (
+    HealthCheckManager,
+    FastAPIHealthChecker,
+    create_health_manager,
+    setup_fastapi_health_checks
+)
+
 __all__ = [
+    # CORE - Start with these
     'setup_logging',
-    'setup_metrics_early',
-    'get_metrics_collector',
-    'MetricsCollector',
-    'HealthCheckManager',
-    'FastAPIHealthChecker',
-    'create_health_manager',
-    'setup_fastapi_health_checks',
+    'setup_telemetry',
+    'setup_telemetry_simple',
+    'get_telemetry_status',
+    'TelemetryProviders',
+
+    # Configuration
+    'OTelConfig',
+    'OTelEndpoints',
+
+    # Tracing
+    'setup_tracing',
+    'get_current_trace_id',
+    'get_current_span_id',
+    'add_trace_attributes',
+    'add_trace_event',
+    'record_exception',
+
+    # Logs
     'setup_otel_logging',
     'add_log_context',
     'get_current_trace_context',
     'StructlogOTELProcessor',
+
+    # Metrics
     'setup_otel_metrics',
     'OTelMetricsCollector',
     'create_dual_metrics_collector',
    'SystemMetricsCollector',
     'ApplicationMetricsCollector',
-    'setup_all_metrics'
+    'setup_all_metrics',
+
+    # Health checks
+    'HealthCheckManager',
+    'FastAPIHealthChecker',
+    'create_health_manager',
+    'setup_fastapi_health_checks',
 ]
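With this re-exported surface, a service can pull the whole observability stack from the package root. The signature of setup_telemetry is not shown anywhere in this diff, so the call below is an assumption based only on the exported names:

    from shared.monitoring import setup_logging, setup_telemetry

    setup_logging("auth-service", "INFO")
    # Assumed call shape; the diff only shows that setup_telemetry and
    # TelemetryProviders are exported, not their signatures.
    providers = setup_telemetry("auth-service", "1.0.0")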
@@ -1,6 +1,6 @@
 """
 OpenTelemetry Logs Integration for SigNoz
-Exports structured logs to SigNoz via OpenTelemetry Collector
+Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
 """
 
 import os
@@ -10,14 +10,21 @@ from typing import Optional
 from opentelemetry._logs import set_logger_provider
 from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
 from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
+from opentelemetry.sdk.resources import Resource
+
+# Try to import HTTP log exporter (logs always use HTTP)
 try:
     from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
+    HTTP_LOG_EXPORTER_AVAILABLE = True
 except ImportError:
     try:
         from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
+        HTTP_LOG_EXPORTER_AVAILABLE = True
     except ImportError:
         OTLPLogExporter = None
-from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
+        HTTP_LOG_EXPORTER_AVAILABLE = False
+
+from .otel_config import OTelConfig
 
 logger = structlog.get_logger()
 
@@ -31,13 +38,14 @@ def setup_otel_logging(
     """
     Setup OpenTelemetry logging to export logs to SigNoz.
 
-    This integrates with Python's standard logging to automatically
-    export all log records to SigNoz via the OTLP protocol.
+    Uses HTTP protocol (port 4318) for sending logs to SigNoz.
+    Integrates with Python's standard logging to automatically export
+    all log records to SigNoz via the OTLP HTTP protocol.
 
     Args:
         service_name: Name of the service (e.g., "auth-service")
         service_version: Version of the service
-        otel_endpoint: OpenTelemetry collector endpoint (default from env)
+        otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
         enable_console: Whether to also log to console (default: True)
 
     Returns:
@@ -47,7 +55,7 @@ def setup_otel_logging(
         from shared.monitoring.logs_exporter import setup_otel_logging
 
         # Setup during service initialization
-        setup_otel_logging("auth-service", "1.0.0")
+        handler = setup_otel_logging("auth-service", "1.0.0")
 
         # Now all standard logging calls will be exported to SigNoz
         import logging
@@ -56,7 +64,7 @@ def setup_otel_logging(
     """
 
     # Check if logging export is enabled
-    if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
+    if not OTelConfig.is_enabled("logs"):
         logger.info(
             "OpenTelemetry logs export disabled",
             service=service_name,
@@ -64,59 +72,36 @@ def setup_otel_logging(
         )
         return None
 
-    # Get OTLP endpoint from environment or parameter
-    # For logs, we need to use the HTTP endpoint (port 4318), not the gRPC endpoint (port 4317)
-    if otel_endpoint is None:
-        # Try logs-specific endpoint first, then fall back to general OTLP endpoint
-        otel_endpoint = os.getenv(
-            "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT",
-            os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
-        )
-
-    logger.info(f"Original OTLP endpoint for logs: {otel_endpoint}")
-
-    # If we got the tracing endpoint (4317), switch to logs endpoint (4318)
-    if otel_endpoint.endswith(":4317"):
-        logger.info("Converting tracing endpoint (4317) to logs endpoint (4318)")
-        otel_endpoint = otel_endpoint.replace(":4317", ":4318")
-
-    logger.info(f"Final OTLP endpoint for logs: {otel_endpoint}")
-
-    # Ensure endpoint has proper protocol prefix
-    if not otel_endpoint.startswith(("http://", "https://")):
-        # Default to HTTP for insecure connections
-        otel_endpoint = f"http://{otel_endpoint}"
-
-    # Ensure endpoint has /v1/logs path for HTTP
-    if not otel_endpoint.endswith("/v1/logs"):
-        otel_endpoint = f"{otel_endpoint}/v1/logs"
-
-    try:
-        # Check if OTLPLogExporter is available
-        if OTLPLogExporter is None:
-            logger.warning(
-                "OpenTelemetry HTTP OTLP exporter not available",
-                service=service_name,
-                reason="opentelemetry-exporter-otlp-proto-http package not installed"
-            )
-            return None
-
-        # Create resource with service information
-        resource = Resource(attributes={
-            SERVICE_NAME: service_name,
-            SERVICE_VERSION: service_version,
-            "deployment.environment": os.getenv("ENVIRONMENT", "development"),
-            "k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
-            "k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
-        })
+    # Check if HTTP log exporter is available
+    if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
+        logger.warning(
+            "OpenTelemetry HTTP log exporter not available",
+            service=service_name,
+            reason="opentelemetry-exporter-otlp-proto-http package not installed"
+        )
+        return None
+
+    try:
+        # Get endpoints from centralized config
+        endpoints = OTelConfig.get_endpoints()
+
+        # Use provided endpoint or get from config
+        if otel_endpoint:
+            http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
+        else:
+            http_endpoint = endpoints.logs_http
+
+        # Get resource attributes
+        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
+        resource = Resource(attributes=resource_attrs)
 
         # Configure logger provider
         logger_provider = LoggerProvider(resource=resource)
         set_logger_provider(logger_provider)
 
-        # Configure OTLP exporter for logs
+        # Configure OTLP HTTP exporter for logs
         otlp_exporter = OTLPLogExporter(
-            endpoint=otel_endpoint,
+            endpoint=http_endpoint,
             timeout=10
         )
@@ -135,9 +120,10 @@ def setup_otel_logging(
         root_logger.addHandler(otel_handler)
 
         logger.info(
-            "OpenTelemetry logs export configured",
+            "OpenTelemetry logs export configured successfully",
             service=service_name,
-            otel_endpoint=otel_endpoint,
+            http_endpoint=http_endpoint,
+            protocol="http",
             console_logging=enable_console
         )
@@ -147,8 +133,7 @@ def setup_otel_logging(
         logger.error(
             "Failed to setup OpenTelemetry logs export",
             service=service_name,
-            error=str(e),
-            reason="Will continue with standard logging only"
+            error=str(e)
         )
         return None
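The endpoint normalization that used to live inline (scheme prefix, 4317-to-4318 port switch, /v1/logs suffix) now sits behind OTelConfig._ensure_http_endpoint. Its body is not part of this diff, but the removed inline code implies logic along these lines (a sketch of the old behavior, not the new module):

    def ensure_http_endpoint(endpoint: str, path: str = "/v1/logs") -> str:
        """Sketch of the normalization the removed inline code performed."""
        # gRPC port 4317 -> HTTP port 4318
        if endpoint.endswith(":4317"):
            endpoint = endpoint.replace(":4317", ":4318")
        # Default to plain HTTP when no scheme is given
        if not endpoint.startswith(("http://", "https://")):
            endpoint = f"http://{endpoint}"
        # Signal-specific path suffix required by OTLP/HTTP
        if not endpoint.endswith(path):
            endpoint = f"{endpoint}{path}"
        return endpoint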
@@ -1,6 +1,6 @@
 """
 OpenTelemetry Metrics Integration for SigNoz
-Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
+Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
 """
 
 import os
@@ -9,8 +9,24 @@ from typing import Optional
 from opentelemetry import metrics
 from opentelemetry.sdk.metrics import MeterProvider
 from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
-from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
-from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
+from opentelemetry.sdk.resources import Resource
+
+# Import both gRPC and HTTP exporters
+try:
+    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
+    GRPC_AVAILABLE = True
+except ImportError:
+    GRPC_AVAILABLE = False
+    GrpcMetricExporter = None
+
+try:
+    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
+    HTTP_AVAILABLE = True
+except ImportError:
+    HTTP_AVAILABLE = False
+    HttpMetricExporter = None
+
+from .otel_config import OTelConfig
 
 logger = structlog.get_logger()
 
@@ -19,20 +35,21 @@ def setup_otel_metrics(
     service_name: str,
     service_version: str = "1.0.0",
     otel_endpoint: Optional[str] = None,
-    export_interval_millis: int = 60000  # Export every 60 seconds
+    export_interval_millis: int = 60000,  # Export every 60 seconds
+    protocol: Optional[str] = None  # "grpc" or "http", defaults to grpc
 ) -> Optional[MeterProvider]:
     """
     Setup OpenTelemetry metrics to export to SigNoz.
 
-    This creates a dual-export strategy:
-    - Prometheus exposition format at /metrics (for Prometheus scraping)
-    - OTLP push to SigNoz collector (for direct ingestion)
+    Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
+    Default protocol is gRPC for better performance.
 
     Args:
         service_name: Name of the service (e.g., "auth-service")
         service_version: Version of the service
-        otel_endpoint: OpenTelemetry collector endpoint (default from env)
-        export_interval_millis: How often to push metrics (default 60s)
+        otel_endpoint: Optional override for OTLP endpoint
+        export_interval_millis: How often to push metrics in milliseconds (default 60s)
+        protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"
 
     Returns:
         MeterProvider instance if successful, None otherwise
@@ -40,9 +57,12 @@ def setup_otel_metrics(
     Example:
         from shared.monitoring.metrics_exporter import setup_otel_metrics
 
-        # Setup during service initialization
+        # Setup with gRPC (default)
         meter_provider = setup_otel_metrics("auth-service", "1.0.0")
 
+        # Or with HTTP
+        meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")
+
         # Create meters for your metrics
         meter = meter_provider.get_meter(__name__)
         request_counter = meter.create_counter(
@@ -56,8 +76,7 @@ def setup_otel_metrics(
     """
 
     # Check if metrics export is enabled
-    enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
-    if not enable_otel_metrics:
+    if not OTelConfig.is_enabled("metrics"):
         logger.info(
             "OpenTelemetry metrics export disabled",
             service=service_name,
@@ -65,30 +84,64 @@ def setup_otel_metrics(
         )
         return None
 
-    # Get OTLP endpoint from environment or parameter
-    if otel_endpoint is None:
-        otel_endpoint = os.getenv(
-            "OTEL_EXPORTER_OTLP_ENDPOINT",
-            os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
-        )
-
-    # Ensure endpoint has /v1/metrics path for HTTP
-    if not otel_endpoint.endswith("/v1/metrics"):
-        otel_endpoint = f"{otel_endpoint}/v1/metrics"
+    # Determine protocol to use
+    if protocol is None:
+        protocol = OTelConfig.get_protocol("metrics")
+
+    # Validate protocol is available
+    if protocol == "grpc" and not GRPC_AVAILABLE:
|
logger.warning(
|
||||||
|
"gRPC exporter not available, falling back to HTTP",
|
||||||
|
service=service_name
|
||||||
|
)
|
||||||
|
protocol = "http"
|
||||||
|
elif protocol == "http" and not HTTP_AVAILABLE:
|
||||||
|
logger.warning(
|
||||||
|
"HTTP exporter not available, falling back to gRPC",
|
||||||
|
service=service_name
|
||||||
|
)
|
||||||
|
protocol = "grpc"
|
||||||
|
|
||||||
|
if protocol not in ["grpc", "http"]:
|
||||||
|
logger.error(
|
||||||
|
"Invalid protocol specified",
|
||||||
|
service=service_name,
|
||||||
|
protocol=protocol
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Create resource with service information
|
# Get endpoints from centralized config
|
||||||
resource = Resource(attributes={
|
endpoints = OTelConfig.get_endpoints()
|
||||||
SERVICE_NAME: service_name,
|
|
||||||
SERVICE_VERSION: service_version,
|
|
||||||
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
|
|
||||||
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
|
|
||||||
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
|
|
||||||
})
|
|
||||||
|
|
||||||
# Configure OTLP exporter for metrics
|
# Determine which endpoint to use
|
||||||
otlp_exporter = OTLPMetricExporter(
|
if otel_endpoint:
|
||||||
endpoint=otel_endpoint,
|
# User provided override
|
||||||
|
if protocol == "grpc":
|
||||||
|
endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
|
||||||
|
else:
|
||||||
|
endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
|
||||||
|
else:
|
||||||
|
# Use config-determined endpoint
|
||||||
|
if protocol == "grpc":
|
||||||
|
endpoint = endpoints.metrics_grpc
|
||||||
|
else:
|
||||||
|
endpoint = endpoints.metrics_http
|
||||||
|
|
||||||
|
# Get resource attributes
|
||||||
|
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
|
||||||
|
resource = Resource(attributes=resource_attrs)
|
||||||
|
|
||||||
|
# Configure OTLP exporter based on protocol
|
||||||
|
if protocol == "grpc":
|
||||||
|
otlp_exporter = GrpcMetricExporter(
|
||||||
|
endpoint=endpoint,
|
||||||
|
insecure=True, # Use secure=False in production with proper TLS
|
||||||
|
timeout=10
|
||||||
|
)
|
||||||
|
else: # http
|
||||||
|
otlp_exporter = HttpMetricExporter(
|
||||||
|
endpoint=endpoint,
|
||||||
timeout=10
|
timeout=10
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -108,9 +161,10 @@ def setup_otel_metrics(
|
|||||||
metrics.set_meter_provider(meter_provider)
|
metrics.set_meter_provider(meter_provider)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"OpenTelemetry metrics export configured",
|
"OpenTelemetry metrics export configured successfully",
|
||||||
service=service_name,
|
service=service_name,
|
||||||
otel_endpoint=otel_endpoint,
|
endpoint=endpoint,
|
||||||
|
protocol=protocol,
|
||||||
export_interval_seconds=export_interval_millis / 1000
|
export_interval_seconds=export_interval_millis / 1000
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -121,7 +175,7 @@ def setup_otel_metrics(
|
|||||||
"Failed to setup OpenTelemetry metrics export",
|
"Failed to setup OpenTelemetry metrics export",
|
||||||
service=service_name,
|
service=service_name,
|
||||||
error=str(e),
|
error=str(e),
|
||||||
reason="Will continue with Prometheus-only metrics"
|
protocol=protocol
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
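Since protocol resolution now happens inside setup_otel_metrics, a caller can switch transports with a single environment variable. A hedged sketch (the service name and metric below are illustrative, not part of this commit):

    import os
    from shared.monitoring.metrics_exporter import setup_otel_metrics

    # Per-signal override; everything else keeps the gRPC default
    os.environ["OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"] = "http"

    meter_provider = setup_otel_metrics("inventory-service", "2.3.0")
    if meter_provider:
        meter = meter_provider.get_meter(__name__)
        orders = meter.create_counter("orders_total", description="Orders processed", unit="1")
        orders.add(1, {"status": "ok"})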
shared/monitoring/otel_config.py (new file, 286 lines)

"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""

import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog

logger = structlog.get_logger()


@dataclass
class OTelEndpoints:
    """
    Container for OpenTelemetry endpoints.

    SigNoz uses different protocols for different signals:
    - Traces: gRPC (port 4317)
    - Metrics: gRPC (port 4317) or HTTP (port 4318)
    - Logs: HTTP (port 4318)
    """
    traces_grpc: str   # gRPC endpoint for traces (e.g., "host:4317")
    metrics_grpc: str  # gRPC endpoint for metrics (e.g., "host:4317")
    metrics_http: str  # HTTP endpoint for metrics (e.g., "http://host:4318/v1/metrics")
    logs_http: str     # HTTP endpoint for logs (e.g., "http://host:4318/v1/logs")


class OTelConfig:
    """
    Centralized configuration for OpenTelemetry exporters.

    This class manages endpoint URLs and ensures proper protocol usage:
    - gRPC endpoints: host:port (no protocol prefix)
    - HTTP endpoints: http://host:port/path (with protocol and path)
    """

    # Default base endpoint (can be overridden by environment variables)
    DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
    DEFAULT_GRPC_PORT = 4317
    DEFAULT_HTTP_PORT = 4318

    @classmethod
    def get_endpoints(cls) -> OTelEndpoints:
        """
        Get OpenTelemetry endpoints from environment variables with proper fallbacks.

        Environment variables (in order of precedence):
        1. OTEL_EXPORTER_OTLP_ENDPOINT - Base endpoint (gRPC format: host:port)
        2. OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - Specific traces endpoint
        3. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - Specific metrics endpoint
        4. OTEL_EXPORTER_OTLP_LOGS_ENDPOINT - Specific logs endpoint
        5. OTEL_COLLECTOR_ENDPOINT - Legacy variable (HTTP format)

        Returns:
            OTelEndpoints with all configured endpoints
        """
        # Get base endpoint from environment
        base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")

        if base_endpoint:
            # Clean and parse base endpoint
            base_grpc = cls._clean_grpc_endpoint(base_endpoint)
            base_http_host = cls._extract_host(base_endpoint)
        else:
            # Use default collector
            base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
            base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"

        # Get signal-specific endpoints (or use base endpoint)
        traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
        metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
        logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")

        # Build final endpoints
        traces_grpc = cls._clean_grpc_endpoint(traces_endpoint)
        metrics_grpc = cls._clean_grpc_endpoint(metrics_endpoint)

        # For metrics HTTP, convert gRPC endpoint to HTTP if needed
        metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")

        # For logs, use HTTP endpoint
        if logs_endpoint:
            logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
        else:
            logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")

        endpoints = OTelEndpoints(
            traces_grpc=traces_grpc,
            metrics_grpc=metrics_grpc,
            metrics_http=metrics_http,
            logs_http=logs_http
        )

        logger.info(
            "OpenTelemetry endpoints configured",
            traces_grpc=endpoints.traces_grpc,
            metrics_grpc=endpoints.metrics_grpc,
            metrics_http=endpoints.metrics_http,
            logs_http=endpoints.logs_http
        )

        return endpoints

    @staticmethod
    def _clean_grpc_endpoint(endpoint: str) -> str:
        """
        Clean endpoint for gRPC usage (remove protocol, paths).

        Args:
            endpoint: Raw endpoint string

        Returns:
            Cleaned endpoint in format "host:port"
        """
        # Remove protocol prefixes
        endpoint = endpoint.replace("http://", "").replace("https://", "")

        # Remove paths (gRPC doesn't use paths)
        if "/" in endpoint:
            endpoint = endpoint.split("/")[0]

        # Ensure it has a port
        if ":" not in endpoint:
            endpoint = f"{endpoint}:4317"

        return endpoint

    @staticmethod
    def _extract_host(endpoint: str) -> str:
        """
        Extract host and convert to HTTP endpoint.

        Args:
            endpoint: Raw endpoint string

        Returns:
            HTTP endpoint without path (e.g., "http://host:4318")
        """
        # Remove protocol if present
        clean = endpoint.replace("http://", "").replace("https://", "")

        # Remove path if present
        if "/" in clean:
            clean = clean.split("/")[0]

        # Extract host without port
        if ":" in clean:
            host = clean.split(":")[0]
        else:
            host = clean

        return f"http://{host}:4318"

    @staticmethod
    def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
        """
        Convert gRPC endpoint to HTTP endpoint with path.

        Args:
            grpc_endpoint: gRPC endpoint (e.g., "host:4317")
            path: HTTP path (e.g., "/v1/metrics")

        Returns:
            HTTP endpoint (e.g., "http://host:4318/v1/metrics")
        """
        # Extract host from gRPC endpoint
        if ":" in grpc_endpoint:
            host = grpc_endpoint.split(":")[0]
        else:
            host = grpc_endpoint

        # Build HTTP endpoint with port 4318
        return f"http://{host}:4318{path}"

    @staticmethod
    def _ensure_http_endpoint(endpoint: str, path: str) -> str:
        """
        Ensure endpoint is in HTTP format with proper path.

        Args:
            endpoint: Raw endpoint string
            path: Required path (e.g., "/v1/logs")

        Returns:
            HTTP endpoint with protocol and path
        """
        # Add protocol if missing
        if not endpoint.startswith(("http://", "https://")):
            endpoint = f"http://{endpoint}"

        # Ensure it has the correct port for HTTP
        if ":4317" in endpoint:
            endpoint = endpoint.replace(":4317", ":4318")
        elif ":4318" not in endpoint and ":" in endpoint:
            # Has a port but not the right one, replace it
            parts = endpoint.split(":")
            if len(parts) >= 2:
                # Remove existing port and path
                base = ":".join(parts[:-1])
                endpoint = f"{base}:4318"
        elif ":" not in endpoint.replace("http://", "").replace("https://", ""):
            # No port at all, add it
            endpoint = f"{endpoint}:4318"

        # Ensure path is present
        if not endpoint.endswith(path):
            # Remove any existing path first
            if "/" in endpoint.split("://")[1]:
                base = endpoint.split("://")[0] + "://" + endpoint.split("://")[1].split("/")[0]
                endpoint = base
            endpoint = f"{endpoint}{path}"

        return endpoint

    @classmethod
    def get_resource_attributes(
        cls,
        service_name: str,
        service_version: str = "1.0.0"
    ) -> dict:
        """
        Get common resource attributes for all OTEL signals.

        Args:
            service_name: Name of the service
            service_version: Version of the service

        Returns:
            Dictionary of resource attributes
        """
        return {
            "service.name": service_name,
            "service.version": service_version,
            "deployment.environment": os.getenv("ENVIRONMENT", "development"),
            "k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
            "k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
            "k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
        }

    @classmethod
    def is_enabled(cls, signal: str) -> bool:
        """
        Check if a specific telemetry signal is enabled.

        Args:
            signal: One of "traces", "metrics", "logs"

        Returns:
            True if signal is enabled, False otherwise
        """
        signal = signal.lower()

        if signal == "traces":
            return os.getenv("ENABLE_TRACING", "true").lower() == "true"
        elif signal == "metrics":
            return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
        elif signal == "logs":
            return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
        else:
            return False

    @classmethod
    def get_protocol(cls, signal: str) -> str:
        """
        Get the preferred protocol for a signal.

        Args:
            signal: One of "traces", "metrics", "logs"

        Returns:
            Protocol name ("grpc" or "http")
        """
        protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")

        # Signal-specific overrides
        if signal == "traces":
            return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
        elif signal == "metrics":
            return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
        elif signal == "logs":
            # Logs always use HTTP in our setup
            return "http"

        return protocol
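The endpoint helpers are pure string manipulation, so their behavior can be checked directly; the expected values below follow from the code above:

    from shared.monitoring.otel_config import OTelConfig

    assert OTelConfig._clean_grpc_endpoint("http://collector:4317/v1/traces") == "collector:4317"
    assert OTelConfig._clean_grpc_endpoint("collector") == "collector:4317"
    assert OTelConfig._grpc_to_http_endpoint("collector:4317", "/v1/metrics") == "http://collector:4318/v1/metrics"
    assert OTelConfig._ensure_http_endpoint("collector:4317", "/v1/logs") == "http://collector:4318/v1/logs"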
shared/monitoring/telemetry.py (new file, 271 lines)

"""
Unified OpenTelemetry Telemetry Setup

Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation

All signals are exported to SigNoz via OTLP.
"""

import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass

from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector

logger = structlog.get_logger()


@dataclass
class TelemetryProviders:
    """
    Container for all OpenTelemetry providers and collectors.

    Attributes:
        tracer_provider: Provider for distributed tracing
        meter_provider: Provider for metrics export
        logging_handler: Handler for structured logs
        system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
        app_metrics: Collector for application-level metrics (HTTP, DB)
    """
    tracer_provider: Optional[Any] = None
    meter_provider: Optional[Any] = None
    logging_handler: Optional[Any] = None
    system_metrics: Optional[SystemMetricsCollector] = None
    app_metrics: Optional[ApplicationMetricsCollector] = None


def setup_telemetry(
    app,
    service_name: str,
    service_version: str = "1.0.0",
    enable_traces: bool = True,
    enable_metrics: bool = True,
    enable_logs: bool = True,
    enable_system_metrics: bool = True,
    metrics_protocol: Optional[str] = None,  # "grpc" or "http", defaults to grpc
    export_interval_millis: int = 60000
) -> TelemetryProviders:
    """
    Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.

    This is the UNIFIED setup function that configures everything:
    - Distributed tracing (gRPC, port 4317)
    - Metrics export (gRPC by default, port 4317)
    - System metrics collection (CPU, memory, disk, network)
    - Application metrics (HTTP requests, DB queries)
    - Structured logs export (HTTP, port 4318)

    All signals use the centralized OTelConfig for endpoint management.

    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        enable_traces: Enable distributed tracing (default: True)
        enable_metrics: Enable metrics export to OTLP (default: True)
        enable_logs: Enable logs export to OTLP (default: True)
        enable_system_metrics: Enable system metrics collection (default: True, can be disabled via ENABLE_SYSTEM_METRICS env)
        metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
        export_interval_millis: How often to export metrics in milliseconds

    Returns:
        TelemetryProviders containing all initialized providers and collectors

    Example:
        from shared.monitoring.telemetry import setup_telemetry

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry(
            app,
            service_name="auth-service",
            service_version="1.0.0"
        )

        # All telemetry is now configured:
        # - Traces automatically captured for HTTP requests
        # - System metrics automatically collected
        # - Application metrics via providers.app_metrics
        # - Logs automatically correlated with traces
    """

    logger.info(
        "Setting up unified OpenTelemetry telemetry",
        service=service_name,
        version=service_version,
        traces=enable_traces,
        metrics=enable_metrics,
        logs=enable_logs,
        system_metrics=enable_system_metrics
    )

    providers = TelemetryProviders()

    # Setup distributed tracing
    if enable_traces and OTelConfig.is_enabled("traces"):
        try:
            providers.tracer_provider = setup_tracing(
                app,
                service_name=service_name,
                service_version=service_version
            )
            if providers.tracer_provider:
                logger.info("✓ Distributed tracing configured", service=service_name)
            else:
                logger.warning("✗ Distributed tracing setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))

    # Setup OTLP metrics export
    if enable_metrics and OTelConfig.is_enabled("metrics"):
        try:
            providers.meter_provider = setup_otel_metrics(
                service_name=service_name,
                service_version=service_version,
                protocol=metrics_protocol,
                export_interval_millis=export_interval_millis
            )
            if providers.meter_provider:
                logger.info("✓ OTLP metrics export configured", service=service_name)

                # Setup system and application metrics collectors
                if enable_system_metrics:
                    enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
                    if enable_system_env:
                        try:
                            providers.system_metrics, providers.app_metrics = setup_all_metrics(
                                service_name=service_name,
                                service_version=service_version,
                                meter_provider=providers.meter_provider
                            )
                            logger.info(
                                "✓ System and application metrics collectors initialized",
                                service=service_name,
                                system_metrics=["cpu", "memory", "disk", "network"],
                                app_metrics=["http_requests", "db_queries"]
                            )
                        except Exception as e:
                            logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
            else:
                logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))

    # Setup logs export
    if enable_logs and OTelConfig.is_enabled("logs"):
        try:
            providers.logging_handler = setup_otel_logging(
                service_name=service_name,
                service_version=service_version
            )
            if providers.logging_handler:
                logger.info("✓ Structured logs export configured", service=service_name)
            else:
                logger.warning("✗ Logs export setup returned None", service=service_name)
        except Exception as e:
            logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))

    # Log endpoint configuration summary
    try:
        endpoints = OTelConfig.get_endpoints()
        summary = {
            "service": service_name,
            "version": service_version,
            "traces": {
                "enabled": bool(providers.tracer_provider),
                "endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
            },
            "metrics": {
                "enabled": bool(providers.meter_provider),
                "endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
                "system_metrics": bool(providers.system_metrics),
                "app_metrics": bool(providers.app_metrics)
            },
            "logs": {
                "enabled": bool(providers.logging_handler),
                "endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
            }
        }
        logger.info("🎉 Telemetry setup complete", **summary)
    except Exception as e:
        logger.warning("Could not log endpoint summary", error=str(e))

    return providers


def setup_telemetry_simple(
    app,
    service_name: str,
    service_version: str = "1.0.0"
) -> TelemetryProviders:
    """
    Simplified telemetry setup with all defaults.

    Uses:
    - gRPC for traces (port 4317)
    - gRPC for metrics (port 4317)
    - HTTP for logs (port 4318)

    All settings are read from environment variables and OTelConfig.

    Args:
        app: FastAPI application instance
        service_name: Name of the service
        service_version: Version of the service

    Returns:
        TelemetryProviders containing all initialized providers

    Example:
        from shared.monitoring.telemetry import setup_telemetry_simple

        app = FastAPI(title="Auth Service")
        providers = setup_telemetry_simple(app, "auth-service")
    """
    return setup_telemetry(
        app=app,
        service_name=service_name,
        service_version=service_version
    )


def get_telemetry_status() -> Dict[str, Any]:
    """
    Get current telemetry configuration status.

    Returns:
        Dictionary with telemetry status information

    Example:
        from shared.monitoring.telemetry import get_telemetry_status

        status = get_telemetry_status()
        print(f"Tracing enabled: {status['traces']['enabled']}")
    """
    endpoints = OTelConfig.get_endpoints()

    return {
        "traces": {
            "enabled": OTelConfig.is_enabled("traces"),
            "protocol": "grpc",
            "endpoint": endpoints.traces_grpc
        },
        "metrics": {
            "enabled": OTelConfig.is_enabled("metrics"),
            "protocol": OTelConfig.get_protocol("metrics"),
            "grpc_endpoint": endpoints.metrics_grpc,
            "http_endpoint": endpoints.metrics_http
        },
        "logs": {
            "enabled": OTelConfig.is_enabled("logs"),
            "protocol": "http",
            "endpoint": endpoints.logs_http
        }
    }
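For a service that needs no per-signal tuning, the simple variant plus the status helper covers the common case. A minimal entrypoint sketch (the service title, name, and debug route are placeholders):

    from fastapi import FastAPI
    from shared.monitoring.telemetry import setup_telemetry_simple, get_telemetry_status

    app = FastAPI(title="Forecast Service")
    providers = setup_telemetry_simple(app, "forecast-service")

    @app.get("/telemetry/status")
    def telemetry_status():
        # Exposes the resolved endpoints and protocols for debugging
        return get_telemetry_status()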
shared/monitoring/tracing.py

@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
 Provides end-to-end request tracking across all services
 """

+import os
 import structlog
 from typing import Optional
 from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
+from opentelemetry.sdk.resources import Resource
 from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+
+# Core instrumentations (should always be available)
 from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+
+# Optional instrumentations (may not be installed in all services)
+try:
+    from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+    HTTPX_AVAILABLE = True
+except ImportError:
+    HTTPX_AVAILABLE = False
+
+try:
+    from opentelemetry.instrumentation.redis import RedisInstrumentor
+    REDIS_AVAILABLE = True
+except ImportError:
+    REDIS_AVAILABLE = False
+
+try:
+    from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+    SQLALCHEMY_AVAILABLE = True
+except ImportError:
+    SQLALCHEMY_AVAILABLE = False
+
+from .otel_config import OTelConfig

 logger = structlog.get_logger()

@@ -22,8 +43,8 @@ def setup_tracing(
     app,
     service_name: str,
     service_version: str = "1.0.0",
-    otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
-):
+    otel_endpoint: Optional[str] = None
+) -> Optional[TracerProvider]:
     """
     Setup OpenTelemetry distributed tracing for a FastAPI service.

@@ -33,35 +54,56 @@ def setup_tracing(
     - Redis operations
     - PostgreSQL/SQLAlchemy queries

+    Uses gRPC protocol (port 4317) for sending traces to SigNoz.
+
     Args:
         app: FastAPI application instance
         service_name: Name of the service (e.g., "auth-service")
         service_version: Version of the service
-        otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
+        otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
+
+    Returns:
+        TracerProvider instance if successful, None otherwise

     Example:
         from shared.monitoring.tracing import setup_tracing

         app = FastAPI(title="Auth Service")
-        setup_tracing(app, "auth-service")
+        tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
     """

+    # Check if tracing is enabled
+    if not OTelConfig.is_enabled("traces"):
+        logger.info(
+            "Distributed tracing disabled",
+            service=service_name,
+            reason="ENABLE_TRACING not set to 'true'"
+        )
+        return None
+
     try:
-        # Create resource with service information
-        resource = Resource(attributes={
-            SERVICE_NAME: service_name,
-            SERVICE_VERSION: service_version,
-            "deployment.environment": "production"
-        })
+        # Get endpoints from centralized config
+        endpoints = OTelConfig.get_endpoints()
+
+        # Use provided endpoint or get from config
+        if otel_endpoint:
+            # Clean user-provided endpoint for gRPC
+            grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
+        else:
+            grpc_endpoint = endpoints.traces_grpc
+
+        # Get resource attributes
+        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
+        resource = Resource(attributes=resource_attrs)

         # Configure tracer provider
         tracer_provider = TracerProvider(resource=resource)
         trace.set_tracer_provider(tracer_provider)

-        # Configure OTLP exporter to send to SigNoz
+        # Configure OTLP gRPC exporter for traces
         otlp_exporter = OTLPSpanExporter(
-            endpoint=otel_endpoint,
-            insecure=True  # Use TLS in production
+            endpoint=grpc_endpoint,
+            insecure=True  # Use secure=False in production with proper TLS
         )

         # Add span processor with batching for performance
@@ -75,40 +117,46 @@ def setup_tracing(
             excluded_urls="health,metrics"  # Don't trace health/metrics endpoints
         )

-        # Auto-instrument HTTPX (inter-service communication)
-        HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
+        # Auto-instrument HTTPX (inter-service communication) if available
+        if HTTPX_AVAILABLE:
+            try:
+                HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("HTTPX instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument HTTPX: {e}")

-        # Auto-instrument Redis
-        try:
-            RedisInstrumentor().instrument(tracer_provider=tracer_provider)
-        except Exception as e:
-            logger.warning(f"Failed to instrument Redis: {e}")
+        # Auto-instrument Redis if available
+        if REDIS_AVAILABLE:
+            try:
+                RedisInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("Redis instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument Redis: {e}")

-        # Auto-instrument PostgreSQL (psycopg2) - skip if not available
-        # Most services use asyncpg instead of psycopg2
-        # try:
-        #     Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
-        # except Exception as e:
-        #     logger.warning(f"Failed to instrument Psycopg2: {e}")
-
-        # Auto-instrument SQLAlchemy
-        try:
-            SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
-        except Exception as e:
-            logger.warning(f"Failed to instrument SQLAlchemy: {e}")
+        # Auto-instrument SQLAlchemy if available
+        if SQLALCHEMY_AVAILABLE:
+            try:
+                SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("SQLAlchemy instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument SQLAlchemy: {e}")

         logger.info(
-            "Distributed tracing configured",
+            "Distributed tracing configured successfully",
             service=service_name,
-            otel_endpoint=otel_endpoint
+            grpc_endpoint=grpc_endpoint,
+            protocol="grpc"
         )

+        return tracer_provider
+
     except Exception as e:
         logger.error(
             "Failed to setup tracing - continuing without it",
             service=service_name,
             error=str(e)
         )
+        return None


 def get_current_trace_id() -> Optional[str]:
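On top of the auto-instrumentation, manual spans still go through the standard OpenTelemetry API, and get_current_trace_id() can tie log lines to the active trace. A sketch (the span name and workload are illustrative):

    import hashlib
    from opentelemetry import trace
    from shared.monitoring.tracing import get_current_trace_id

    tracer = trace.get_tracer("auth-service")

    def hash_password(pw: str) -> str:
        # Child span of the surrounding FastAPI request span, if any
        with tracer.start_as_current_span("hash_password"):
            return hashlib.sha256(pw.encode()).hexdigest()

    trace_id = get_current_trace_id()  # Optional[str], per the signature above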
shared/service_base.py

@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.routing import APIRouter

-from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
-from shared.monitoring.metrics import setup_metrics_early
+from shared.monitoring import (
+    setup_logging,
+    setup_telemetry
+)
 from shared.monitoring.health_checks import setup_fastapi_health_checks
-from shared.monitoring.tracing import setup_tracing
 from shared.database.base import DatabaseManager

 if TYPE_CHECKING:
@@ -77,24 +78,13 @@ class BaseFastAPIService:
         # Initialize logging
         setup_logging(service_name, log_level)

-        # Setup OpenTelemetry logging export if enabled
-        if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
-            try:
-                setup_otel_logging(service_name, version)
-                self.logger = structlog.get_logger()
-                self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
-            except Exception as e:
-                self.logger = structlog.get_logger()
-                self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
-        else:
-            self.logger = structlog.get_logger()
+        self.logger = structlog.get_logger()

         # Will be set during app creation
         self.app: Optional[FastAPI] = None
-        self.metrics_collector = None
         self.health_manager = None
         self.alert_service = None
+        self.telemetry_providers = None  # Contains all OTEL providers and metrics collectors

     def create_app(self, **fastapi_kwargs) -> FastAPI:
         """
@@ -116,49 +106,25 @@ class BaseFastAPIService:
         # Create FastAPI app
         self.app = FastAPI(**config)

-        # Setup metrics BEFORE middleware and lifespan
-        if self.enable_metrics:
-            self.metrics_collector = setup_metrics_early(self.app, self.service_name)
-
-        # Setup OpenTelemetry metrics export if enabled
-        enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
-        if enable_otel_metrics:
-            try:
-                self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
-                if self.otel_meter_provider:
-                    self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
-
-                    # Setup system metrics collection (CPU, memory, disk, network)
-                    enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
-                    if enable_system_metrics:
-                        try:
-                            self.system_metrics, self.app_metrics = setup_all_metrics(
-                                self.service_name,
-                                self.version,
-                                self.otel_meter_provider
-                            )
-                            self.logger.info(f"System metrics collection enabled for {self.service_name}")
-                        except Exception as e:
-                            self.logger.warning(f"Failed to setup system metrics: {e}")
-            except Exception as e:
-                self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
-
-        # Setup distributed tracing
-        # Check both constructor flag and environment variable
-        tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
-
-        if tracing_enabled:
-            try:
-                otel_endpoint = os.getenv(
-                    "OTEL_COLLECTOR_ENDPOINT",
-                    "http://signoz-otel-collector.bakery-ia:4318"
-                )
-                setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
-                self.logger.info(f"Distributed tracing enabled for {self.service_name}")
-            except Exception as e:
-                self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
-        else:
-            self.logger.info(f"Distributed tracing disabled for {self.service_name}")
+        # Setup unified OpenTelemetry telemetry
+        # This single call configures:
+        # - Distributed tracing (gRPC, port 4317)
+        # - OTLP metrics export (gRPC, port 4317)
+        # - System metrics collection (CPU, memory, disk, network)
+        # - Application metrics (HTTP requests, DB queries)
+        # - Structured logs export (HTTP, port 4318)
+        try:
+            self.telemetry_providers = setup_telemetry(
+                app=self.app,
+                service_name=self.service_name,
+                service_version=self.version,
+                enable_traces=self.enable_tracing,
+                enable_metrics=self.enable_metrics,
+                enable_logs=True,  # Controlled by OTEL_LOGS_EXPORTER env var
+                enable_system_metrics=True  # Controlled by ENABLE_SYSTEM_METRICS env var
+            )
+        except Exception as e:
+            self.logger.warning("Failed to setup telemetry", error=str(e))

         # Setup lifespan
         self.app.router.lifespan_context = self._create_lifespan()
@@ -361,10 +327,6 @@ class BaseFastAPIService:
             method=request.method
         )

-        # Record error metric if available
-        if self.metrics_collector:
-            self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
-
         return JSONResponse(
             status_code=500,
             content={
@@ -409,7 +371,10 @@ class BaseFastAPIService:

     def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
         """
-        Register custom metrics for the service
+        Register custom OTEL metrics for the service.
+
+        Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
+        are automatically created by setup_telemetry(). Use this for additional custom metrics.

         Args:
             metrics_config: Dict with metric name as key and config as value
@@ -417,25 +382,36 @@ class BaseFastAPIService:
                 "user_registrations": {
                     "type": "counter",
                     "description": "Total user registrations",
-                    "labels": ["status"]
+                    "unit": "registrations"
                 }
             }
         """
-        if not self.metrics_collector:
-            self.logger.warning("Metrics collector not available")
+        if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
+            self.logger.warning("OTEL meter provider not available - metrics not registered")
             return

+        from opentelemetry.metrics import get_meter
+        meter = get_meter(self.service_name)
+
         for metric_name, config in metrics_config.items():
             metric_type = config.get("type", "counter")
             description = config.get("description", f"{metric_name} metric")
-            labels = config.get("labels", [])
+            unit = config.get("unit", "1")

-            if metric_type == "counter":
-                self.metrics_collector.register_counter(metric_name, description, labels=labels)
-            elif metric_type == "histogram":
-                self.metrics_collector.register_histogram(metric_name, description, labels=labels)
-            else:
-                self.logger.warning(f"Unsupported metric type: {metric_type}")
+            try:
+                if metric_type == "counter":
+                    meter.create_counter(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom counter: {metric_name}")
+                elif metric_type == "histogram":
+                    meter.create_histogram(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom histogram: {metric_name}")
+                elif metric_type == "gauge":
+                    meter.create_up_down_counter(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom gauge: {metric_name}")
+                else:
+                    self.logger.warning(f"Unsupported metric type: {metric_type}")
+            except Exception as e:
+                self.logger.error(f"Failed to register metric {metric_name}", error=str(e))

     def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
         """
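With the collector-based registration gone, custom metrics are declared through the same config dict and created on the OTEL meter. A hedged usage sketch on an already-initialized service; the histogram and gauge entries below are illustrative, not part of this commit:

    # `service` is a BaseFastAPIService subclass instance after create_app()
    service.register_custom_metrics({
        "user_registrations": {
            "type": "counter",
            "description": "Total user registrations",
            "unit": "registrations",
        },
        "checkout_latency": {
            "type": "histogram",
            "description": "Checkout handler latency",
            "unit": "ms",
        },
        "active_sessions": {
            "type": "gauge",  # created as an up-down counter, per the hunk above
            "description": "Currently active sessions",
            "unit": "sessions",
        },
    })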