Improve monitoring 5

Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -8,13 +8,12 @@ import json
 import structlog
 import resource
 import os
-from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse, Response
-import httpx
 import time
+from fastapi import Request, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.responses import StreamingResponse
+import httpx
 from shared.redis_utils import initialize_redis, close_redis, get_redis_client
-from typing import Dict, Any
+from shared.service_base import StandardFastAPIService
 from app.core.config import settings
 from app.middleware.request_id import RequestIDMiddleware
@@ -26,128 +25,84 @@ from app.middleware.subscription import SubscriptionMiddleware
 from app.middleware.demo_middleware import DemoMiddleware
 from app.middleware.read_only_mode import ReadOnlyModeMiddleware
 from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
-
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "gateway"):
-    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
-    # Create resource with service name
-    resource = Resource.create({"service.name": service_name})
-    # Configure OTLP exporter (sends to OpenTelemetry Collector)
-    otlp_exporter = OTLPSpanExporter(
-        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
-        insecure=True  # Use insecure connection for internal cluster communication
-    )
-    # Configure tracer provider
-    provider = TracerProvider(resource=resource)
-    processor = BatchSpanProcessor(otlp_exporter)
-    provider.add_span_processor(processor)
-    # Set global tracer provider
-    trace.set_tracer_provider(provider)
-    return provider
-
-# Initialize tracing
-tracer_provider = setup_tracing("gateway")
-
-# Setup logging
-setup_logging("gateway", settings.LOG_LEVEL)
+# Initialize logger
 logger = structlog.get_logger()

-# Check file descriptor limits and warn if too low
+# Check file descriptor limits
 try:
     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
     if soft_limit < 1024:
-        logger.warning(f"Low file descriptor limit detected: {soft_limit}. Gateway may experience 'too many open files' errors.")
-        logger.warning(f"Recommended: Increase limit with 'ulimit -n 4096' or higher for production.")
-        if soft_limit < 256:
-            logger.error(f"Critical: File descriptor limit ({soft_limit}) is too low for gateway operation!")
+        logger.warning(f"Low file descriptor limit detected: {soft_limit}")
     else:
         logger.info(f"File descriptor limit: {soft_limit} (sufficient)")
 except Exception as e:
     logger.debug(f"Could not check file descriptor limits: {e}")
-# Check and log current working directory and permissions
-try:
-    cwd = os.getcwd()
-    logger.info(f"Current working directory: {cwd}")
-    # Check if we can write to common log locations
-    test_locations = ["/var/log", "./logs", "."]
-    for location in test_locations:
-        try:
-            test_file = os.path.join(location, ".gateway_permission_test")
-            with open(test_file, 'w') as f:
-                f.write("test")
-            os.remove(test_file)
-            logger.info(f"Write permission confirmed for: {location}")
-        except Exception as e:
-            logger.warning(f"Cannot write to {location}: {e}")
-except Exception as e:
-    logger.debug(f"Could not check directory permissions: {e}")
-
-# Create FastAPI app
-app = FastAPI(
-    title="Bakery Forecasting API Gateway",
-    description="Central API Gateway for bakery forecasting microservices",
-    version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc",
-    redirect_slashes=False  # Disable automatic trailing slash redirects
-)
-
-# Instrument FastAPI with OpenTelemetry
-FastAPIInstrumentor.instrument_app(app)
-# Instrument httpx for outgoing requests
-HTTPXClientInstrumentor().instrument()
-# Instrument Redis (will be active once redis client is initialized)
-RedisInstrumentor().instrument()
-
-# Initialize metrics collector
-metrics_collector = MetricsCollector("gateway")
-# Add metrics middleware to track HTTP requests
-add_metrics_middleware(app, metrics_collector)
 # Redis client for SSE streaming
 redis_client = None

-# CORS middleware - Add first
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=settings.CORS_ORIGINS_LIST,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+class GatewayService(StandardFastAPIService):
+    """Gateway Service with standardized monitoring setup"""
+
+    async def on_startup(self, app):
+        """Custom startup logic for Gateway"""
+        global redis_client
+        # Initialize Redis
+        try:
+            await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
+            redis_client = await get_redis_client()
+            logger.info("Connected to Redis for SSE streaming")
+            # Add API rate limiting middleware with Redis client
+            app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
+            logger.info("API rate limiting middleware enabled")
+        except Exception as e:
+            logger.error(f"Failed to connect to Redis: {e}")
+
+        # Register custom metrics for gateway-specific operations
+        if self.telemetry_providers and self.telemetry_providers.app_metrics:
+            logger.info("Gateway-specific metrics tracking enabled")
+
+        await super().on_startup(app)
+
+    async def on_shutdown(self, app):
+        """Custom shutdown logic for Gateway"""
+        await super().on_shutdown(app)
+        # Close Redis
+        await close_redis()
+        logger.info("Redis connection closed")
+
+# Create service instance
+service = GatewayService(
+    service_name="gateway",
+    app_name="Bakery Forecasting API Gateway",
+    description="Central API Gateway for bakery forecasting microservices",
+    version="1.0.0",
+    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
+    cors_origins=settings.CORS_ORIGINS_LIST,
+    enable_metrics=True,
+    enable_health_checks=True,
+    enable_tracing=True,
+    enable_cors=True
+)

-# Custom middleware - Add in REVERSE order (last added = first executed)
+# Create FastAPI app
+app = service.create_app()
+
+# Add gateway-specific middleware (in REVERSE order of execution)
 # Execution order: RequestIDMiddleware -> DemoMiddleware -> AuthMiddleware -> ReadOnlyModeMiddleware -> SubscriptionMiddleware -> APIRateLimitMiddleware -> RateLimitMiddleware -> LoggingMiddleware
-app.add_middleware(LoggingMiddleware)  # Executes 8th (outermost)
-app.add_middleware(RateLimitMiddleware, calls_per_minute=300)  # Executes 7th - Simple rate limit
-# Note: APIRateLimitMiddleware will be added on startup with Redis client
-app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)  # Executes 5th
-app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)  # Executes 4th - Enforce read-only mode
-app.add_middleware(AuthMiddleware)  # Executes 3rd - Checks for demo context
-app.add_middleware(DemoMiddleware)  # Executes 2nd - Sets demo user context
-app.add_middleware(RequestIDMiddleware)  # Executes 1st (innermost) - Generates request ID for tracing
+app.add_middleware(LoggingMiddleware)
+app.add_middleware(RateLimitMiddleware, calls_per_minute=300)
+app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
+app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
+app.add_middleware(AuthMiddleware)
+app.add_middleware(DemoMiddleware)
+app.add_middleware(RequestIDMiddleware)
 # Include routers
 app.include_router(auth.router, prefix="/api/v1/auth", tags=["authentication"])
@@ -156,114 +111,18 @@ app.include_router(subscription.router, prefix="/api/v1", tags=["subscriptions"]
 app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"])
 app.include_router(nominatim.router, prefix="/api/v1/nominatim", tags=["location"])
 app.include_router(geocoding.router, prefix="/api/v1/geocoding", tags=["geocoding"])
-# app.include_router(poi_context.router, prefix="/api/v1/poi-context", tags=["poi-context"])  # Removed to implement tenant-based architecture
 app.include_router(pos.router, prefix="/api/v1/pos", tags=["pos"])
 app.include_router(demo.router, prefix="/api/v1", tags=["demo"])
@app.on_event("startup")
async def startup_event():
"""Application startup"""
global redis_client
logger.info("Starting API Gateway")
# Initialize shared Redis connection
try:
await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
redis_client = await get_redis_client()
logger.info("Connected to Redis for SSE streaming")
# Add API rate limiting middleware with Redis client
app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
logger.info("API rate limiting middleware enabled with subscription-based quotas")
except Exception as e:
logger.error(f"Failed to connect to Redis: {e}")
logger.warning("API rate limiting middleware will fail open (allow all requests)")
metrics_collector.register_counter(
"gateway_auth_requests_total",
"Total authentication requests"
)
metrics_collector.register_counter(
"gateway_auth_responses_total",
"Total authentication responses"
)
metrics_collector.register_counter(
"gateway_auth_errors_total",
"Total authentication errors"
)
metrics_collector.register_histogram(
"gateway_request_duration_seconds",
"Request duration in seconds"
)
logger.info("Metrics registered successfully")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("gateway")
logger.info("System metrics collection started")
logger.info("Metrics export configured via OpenTelemetry OTLP")
logger.info("API Gateway started successfully")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown"""
logger.info("Shutting down API Gateway")
# Close shared Redis connection
await close_redis()
# Clean up service discovery
# await service_discovery.cleanup()
logger.info("API Gateway shutdown complete")
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "api-gateway",
"version": "1.0.0",
"timestamp": time.time()
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
 # ================================================================
 # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS
 # ================================================================

 def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
-    """
-    Determine which Redis channels to subscribe to based on filters.
-
-    Args:
-        tenant_id: Tenant identifier
-        channel_filters: List of channel patterns (e.g., ["inventory.alerts", "*.notifications"])
-
-    Returns:
-        List of full channel names to subscribe to
-
-    Examples:
-        >>> _get_subscription_channels("abc", ["inventory.alerts"])
-        ["tenant:abc:inventory.alerts"]
-        >>> _get_subscription_channels("abc", ["*.alerts"])
-        ["tenant:abc:inventory.alerts", "tenant:abc:production.alerts", ...]
-        >>> _get_subscription_channels("abc", [])
-        ["tenant:abc:inventory.alerts", "tenant:abc:inventory.notifications", ...]
-    """
+    """Determine which Redis channels to subscribe to based on filters"""
     all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
     all_classes = ["alerts", "notifications"]
     channels = []

     if not channel_filters:
@@ -271,70 +130,49 @@ def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
         for domain in all_domains:
             for event_class in all_classes:
                 channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
-        # Also subscribe to recommendations (tenant-wide)
         channels.append(f"tenant:{tenant_id}:recommendations")
-        # Also subscribe to legacy channel for backward compatibility
-        channels.append(f"alerts:{tenant_id}")
+        channels.append(f"alerts:{tenant_id}")  # Legacy
         return channels

     # Parse filters and expand wildcards
     for filter_pattern in channel_filters:
         if filter_pattern == "*.*":
-            # All channels
             for domain in all_domains:
                 for event_class in all_classes:
                     channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
             channels.append(f"tenant:{tenant_id}:recommendations")
         elif filter_pattern.endswith(".*"):
-            # Domain wildcard (e.g., "inventory.*")
             domain = filter_pattern.split(".")[0]
             for event_class in all_classes:
                 channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
         elif filter_pattern.startswith("*."):
-            # Class wildcard (e.g., "*.alerts")
             event_class = filter_pattern.split(".")[1]
             if event_class == "recommendations":
                 channels.append(f"tenant:{tenant_id}:recommendations")
             else:
                 for domain in all_domains:
                     channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
         elif filter_pattern == "recommendations":
-            # Recommendations channel
             channels.append(f"tenant:{tenant_id}:recommendations")
         else:
-            # Specific channel (e.g., "inventory.alerts")
            channels.append(f"tenant:{tenant_id}:{filter_pattern}")

-    return list(set(channels))  # Remove duplicates
+    return list(set(channels))
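For reference, a minimal sketch of how the expansion above behaves ("abc" is a made-up tenant id, and ordering is unspecified because of the set()):

    # Hypothetical calls against _get_subscription_channels
    _get_subscription_channels("abc", ["inventory.alerts"])
    #   -> ["tenant:abc:inventory.alerts"]
    _get_subscription_channels("abc", ["*.alerts"])
    #   -> one "tenant:abc:<domain>.alerts" entry per domain
    _get_subscription_channels("abc", [])
    #   -> every domain/class channel, plus "tenant:abc:recommendations"
    #      and the legacy "alerts:abc" channel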
 async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
-    """
-    Load initial state from Redis cache based on channel filters.
-
-    Args:
-        redis_client: Redis client
-        tenant_id: Tenant identifier
-        channel_filters: List of channel patterns
-
-    Returns:
-        List of initial events
-    """
+    """Load initial state from Redis cache based on channel filters"""
     initial_events = []

     try:
         if not channel_filters:
-            # Load from legacy cache if no filters (backward compat)
+            # Legacy cache
             legacy_cache_key = f"active_alerts:{tenant_id}"
             cached_data = await redis_client.get(legacy_cache_key)
             if cached_data:
                 return json.loads(cached_data)

-            # Also try loading from new domain-specific caches
+            # New domain-specific caches
             all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
             all_classes = ["alerts", "notifications"]
@@ -343,10 +181,9 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
                     cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
                     cached_data = await redis_client.get(cache_key)
                     if cached_data:
-                        events = json.loads(cached_data)
-                        initial_events.extend(events)
+                        initial_events.extend(json.loads(cached_data))

-            # Load recommendations
+            # Recommendations
             recommendations_cache_key = f"active_events:{tenant_id}:recommendations"
             cached_data = await redis_client.get(recommendations_cache_key)
             if cached_data:
@@ -356,36 +193,29 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
             # Load based on specific filters
             for filter_pattern in channel_filters:
-                # Extract domain and class from filter
                 if "." in filter_pattern:
                     parts = filter_pattern.split(".")
                     domain = parts[0] if parts[0] != "*" else None
                     event_class = parts[1] if len(parts) > 1 and parts[1] != "*" else None

                     if domain and event_class:
-                        # Specific cache (e.g., "inventory.alerts")
                         cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
                         cached_data = await redis_client.get(cache_key)
                         if cached_data:
                             initial_events.extend(json.loads(cached_data))
                     elif domain and not event_class:
-                        # Domain wildcard (e.g., "inventory.*")
                         for ec in ["alerts", "notifications"]:
                             cache_key = f"active_events:{tenant_id}:{domain}.{ec}"
                             cached_data = await redis_client.get(cache_key)
                             if cached_data:
                                 initial_events.extend(json.loads(cached_data))
                     elif not domain and event_class:
-                        # Class wildcard (e.g., "*.alerts")
                         all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
                         for d in all_domains:
                             cache_key = f"active_events:{tenant_id}:{d}.{event_class}s"
                             cached_data = await redis_client.get(cache_key)
                             if cached_data:
                                 initial_events.extend(json.loads(cached_data))
                 elif filter_pattern == "recommendations":
                     cache_key = f"active_events:{tenant_id}:recommendations"
                     cached_data = await redis_client.get(cache_key)
@@ -400,27 +230,14 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
 def _determine_event_type(event_data: dict) -> str:
-    """
-    Determine SSE event type from event data.
-
-    Args:
-        event_data: Event data dictionary
-
-    Returns:
-        SSE event type: 'alert', 'notification', or 'recommendation'
-    """
-    # New event architecture uses 'event_class'
+    """Determine SSE event type from event data"""
     if 'event_class' in event_data:
-        return event_data['event_class']  # 'alert', 'notification', or 'recommendation'
+        return event_data['event_class']

-    # Legacy format uses 'item_type'
     if 'item_type' in event_data:
         if event_data['item_type'] == 'recommendation':
             return 'recommendation'
         else:
             return 'alert'

-    # Default to 'alert' for backward compatibility
     return 'alert'
@@ -432,42 +249,25 @@ def _determine_event_type(event_data: dict) -> str:
 async def events_stream(
     request: Request,
     tenant_id: str,
-    channels: str = None  # Comma-separated channel filters (e.g., "inventory.alerts,production.notifications")
+    channels: str = None
 ):
     """
     Server-Sent Events stream for real-time notifications with multi-channel support.

-    Authentication is handled by auth middleware via query param token.
-    User context is available in request.state.user (injected by middleware).
-
     Query Parameters:
         tenant_id: Tenant identifier (required)
         channels: Comma-separated channel filters (optional)
-
-    Examples:
-        - "inventory.alerts,production.notifications" - Specific channels
-        - "*.alerts" - All alert channels
-        - "inventory.*" - All inventory events
-        - None - All channels (default, backward compatible)
-
-    New channel pattern: tenant:{tenant_id}:{domain}.{class}
-    Examples:
-        - tenant:abc:inventory.alerts
-        - tenant:abc:production.notifications
-        - tenant:abc:recommendations
-
-    Legacy channel (backward compat): alerts:{tenant_id}
     """
     global redis_client

     if not redis_client:
         raise HTTPException(status_code=503, detail="SSE service unavailable")

-    # Extract user context from request state (set by auth middleware)
+    # Extract user context from request state
     user_context = request.state.user
     user_id = user_context.get('user_id')
     email = user_context.get('email')

-    # Validate tenant_id parameter
     if not tenant_id:
         raise HTTPException(status_code=400, detail="tenant_id query parameter is required")
@@ -479,79 +279,53 @@ async def events_stream(
     logger.info(f"SSE connection request for user {email}, tenant {tenant_id}, channels: {channel_filters or 'all'}")

     async def event_generator():
-        """Generate server-sent events from Redis pub/sub with multi-channel support"""
+        """Generate server-sent events from Redis pub/sub"""
         pubsub = None
         try:
-            # Create pubsub connection with resource monitoring
             pubsub = redis_client.pubsub()
             logger.debug(f"Created Redis pubsub connection for tenant: {tenant_id}")

-            # Monitor connection count
-            try:
-                connection_info = await redis_client.info('clients')
-                connected_clients = connection_info.get('connected_clients', 'unknown')
-                logger.debug(f"Redis connected clients: {connected_clients}")
-            except Exception:
-                # Don't fail if we can't get connection info
-                pass
-
-            # Determine which channels to subscribe to
+            # Determine channels
             subscription_channels = _get_subscription_channels(tenant_id, channel_filters)

-            # Subscribe to all determined channels
+            # Subscribe
             if subscription_channels:
                 await pubsub.subscribe(*subscription_channels)
                 logger.info(f"Subscribed to {len(subscription_channels)} channels for tenant {tenant_id}")
             else:
-                # Fallback to legacy channel if no channels specified
                 legacy_channel = f"alerts:{tenant_id}"
                 await pubsub.subscribe(legacy_channel)
-                logger.info(f"Subscribed to legacy channel: {legacy_channel}")

-            # Send initial connection event
+            # Connection event
             yield f"event: connection\n"
             yield f"data: {json.dumps({'type': 'connected', 'message': 'SSE connection established', 'channels': subscription_channels or ['all'], 'timestamp': time.time()})}\n\n"

-            # Fetch and send initial state from cache (domain-specific or legacy)
+            # Initial state
             initial_events = await _load_initial_state(redis_client, tenant_id, channel_filters)
             if initial_events:
                 logger.info(f"Sending {len(initial_events)} initial events to tenant {tenant_id}")
                 yield f"event: initial_state\n"
                 yield f"data: {json.dumps(initial_events)}\n\n"
-            else:
-                # Send empty initial state for compatibility
-                yield f"event: initial_state\n"
-                yield f"data: {json.dumps([])}\n\n"

             heartbeat_counter = 0
             while True:
-                # Check if client has disconnected
                 if await request.is_disconnected():
                     logger.info(f"SSE client disconnected for tenant: {tenant_id}")
                     break

                 try:
-                    # Get message from Redis with timeout
                     message = await asyncio.wait_for(pubsub.get_message(ignore_subscribe_messages=True), timeout=10.0)

                     if message and message['type'] == 'message':
-                        # Forward the event from Redis
                         event_data = json.loads(message['data'])
-                        # Determine event type for SSE
                         event_type = _determine_event_type(event_data)
-                        # Add channel metadata for frontend routing
                         event_data['_channel'] = message['channel'].decode('utf-8') if isinstance(message['channel'], bytes) else message['channel']

                         yield f"event: {event_type}\n"
                         yield f"data: {json.dumps(event_data)}\n\n"
-                        logger.debug(f"SSE event sent to tenant {tenant_id}: {event_type} - {event_data.get('title')}")

                 except asyncio.TimeoutError:
-                    # Send heartbeat every 10 timeouts (100 seconds)
                     heartbeat_counter += 1
                     if heartbeat_counter >= 10:
                         yield f"event: heartbeat\n"
@@ -563,24 +337,13 @@ async def events_stream(
         except Exception as e:
             logger.error(f"SSE error for tenant {tenant_id}: {e}", exc_info=True)
         finally:
-            try:
-                if pubsub:
-                    try:
-                        # Unsubscribe from all channels
-                        await pubsub.unsubscribe()
-                        logger.debug(f"Unsubscribed from Redis channels for tenant: {tenant_id}")
-                    except Exception as unsubscribe_error:
-                        logger.error(f"Failed to unsubscribe Redis pubsub for tenant {tenant_id}: {unsubscribe_error}")
-                    try:
-                        # Close pubsub connection
-                        await pubsub.close()
-                        logger.debug(f"Closed Redis pubsub connection for tenant: {tenant_id}")
-                    except Exception as close_error:
-                        logger.error(f"Failed to close Redis pubsub for tenant {tenant_id}: {close_error}")
-                logger.info(f"SSE connection closed for tenant: {tenant_id}")
-            except Exception as finally_error:
-                logger.error(f"Error in SSE cleanup for tenant {tenant_id}: {finally_error}")
+            if pubsub:
+                try:
+                    await pubsub.unsubscribe()
+                    await pubsub.close()
+                except Exception as e:
+                    logger.error(f"Error closing pubsub: {e}")
+            logger.info(f"SSE connection closed for tenant: {tenant_id}")

     return StreamingResponse(
         event_generator(),
@@ -593,55 +356,35 @@ async def events_stream(
         }
     )
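For context, a minimal client sketch for this stream, assuming the route is mounted at /api/v1/events and the auth middleware accepts a token query parameter (both are assumptions; the route decorator sits outside these hunks):

    import asyncio
    import httpx

    async def consume_events(base_url: str, tenant_id: str, token: str):
        # Hypothetical endpoint path and auth scheme - adjust to the real route
        params = {"tenant_id": tenant_id, "channels": "*.alerts", "token": token}
        async with httpx.AsyncClient(timeout=None) as client:
            async with client.stream("GET", f"{base_url}/api/v1/events", params=params) as resp:
                async for line in resp.aiter_lines():
                    # Frames arrive as "event: <type>" / "data: <json>" pairs
                    print(line)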
 # ================================================================
 # WEBSOCKET ROUTING FOR TRAINING SERVICE
 # ================================================================

 @app.websocket("/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live")
 async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
-    """
-    Simple WebSocket proxy with token verification only.
-    Validates the token and forwards the connection to the training service.
-    """
-    # Get token from query params
+    """WebSocket proxy with token verification for training service"""
     token = websocket.query_params.get("token")
     if not token:
-        logger.warning("WebSocket proxy rejected - missing token",
-                       job_id=job_id,
-                       tenant_id=tenant_id)
         await websocket.accept()
         await websocket.close(code=1008, reason="Authentication token required")
         return

     # Verify token
     from shared.auth.jwt_handler import JWTHandler
     jwt_handler = JWTHandler(settings.JWT_SECRET_KEY, settings.JWT_ALGORITHM)
     try:
         payload = jwt_handler.verify_token(token)
         if not payload or not payload.get('user_id'):
-            logger.warning("WebSocket proxy rejected - invalid token",
-                           job_id=job_id,
-                           tenant_id=tenant_id)
             await websocket.accept()
             await websocket.close(code=1008, reason="Invalid token")
             return
-        logger.info("WebSocket proxy - token verified",
-                    user_id=payload.get('user_id'),
-                    tenant_id=tenant_id,
-                    job_id=job_id)
     except Exception as e:
-        logger.warning("WebSocket proxy - token verification failed",
-                       job_id=job_id,
-                       error=str(e))
         await websocket.accept()
         await websocket.close(code=1008, reason="Token verification failed")
         return

-    # Accept the connection
     await websocket.accept()

     # Build WebSocket URL to training service
@@ -649,33 +392,24 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
     training_ws_url = training_service_base.replace('http://', 'ws://').replace('https://', 'wss://')
     training_ws_url = f"{training_ws_url}/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live?token={token}"

-    logger.info("Gateway proxying WebSocket to training service",
-                job_id=job_id,
-                training_ws_url=training_ws_url.replace(token, '***'))
-
     training_ws = None
     try:
-        # Connect to training service WebSocket
         import websockets
         from websockets.protocol import State

         training_ws = await websockets.connect(
             training_ws_url,
-            ping_interval=120,  # Send ping every 2 minutes (tolerates long training operations)
-            ping_timeout=60,    # Wait up to 1 minute for pong (graceful timeout)
-            close_timeout=60,   # Increase close timeout for graceful shutdown
+            ping_interval=120,
+            ping_timeout=60,
+            close_timeout=60,
             open_timeout=30
         )
-        logger.info("Gateway connected to training service WebSocket", job_id=job_id)

         async def forward_frontend_to_training():
-            """Forward messages from frontend to training service"""
             try:
                 while training_ws and training_ws.state == State.OPEN:
                     data = await websocket.receive()
                     if data.get("type") == "websocket.receive":
                         if "text" in data:
                             await training_ws.send(data["text"])
@@ -683,30 +417,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
                             await training_ws.send(data["bytes"])
                     elif data.get("type") == "websocket.disconnect":
                         break
-            except Exception as e:
-                logger.debug("Frontend to training forward ended", error=str(e))
+            except Exception:
+                pass

         async def forward_training_to_frontend():
-            """Forward messages from training service to frontend"""
-            message_count = 0
             try:
                 while training_ws and training_ws.state == State.OPEN:
                     message = await training_ws.recv()
                     await websocket.send_text(message)
-                    message_count += 1
-
-                    # Log every 10th message to track connectivity
-                    if message_count % 10 == 0:
-                        logger.debug("WebSocket proxy active",
-                                     job_id=job_id,
-                                     messages_forwarded=message_count)
-            except Exception as e:
-                logger.info("Training to frontend forward ended",
-                            job_id=job_id,
-                            messages_forwarded=message_count,
-                            error=str(e))
+            except Exception:
+                pass

-        # Run both forwarding tasks concurrently
         await asyncio.gather(
             forward_frontend_to_training(),
             forward_training_to_frontend(),
@@ -716,20 +437,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
     except Exception as e:
         logger.error("WebSocket proxy error", job_id=job_id, error=str(e))
     finally:
-        # Cleanup
         if training_ws and training_ws.state == State.OPEN:
             try:
                 await training_ws.close()
             except:
                 pass
         try:
             if not websocket.client_state.name == 'DISCONNECTED':
                 await websocket.close(code=1000, reason="Proxy closed")
         except:
             pass
-        logger.info("WebSocket proxy connection closed", job_id=job_id)
 if __name__ == "__main__":
     import uvicorn

View File

@@ -48,9 +48,9 @@ signoz:
     signoz_traces_ttl_duration_hrs: "168"
     signoz_metrics_ttl_duration_hrs: "168"
     signoz_logs_ttl_duration_hrs: "168"
-    # OpAMP Server Configuration
-    signoz_opamp_server_enabled: "true"
-    signoz_opamp_server_endpoint: "0.0.0.0:4320"
+    # OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
+    signoz_opamp_server_enabled: "false"
+    # signoz_opamp_server_endpoint: "0.0.0.0:4320"

   persistence:
     enabled: true
@@ -149,9 +149,10 @@ otelCollector:
     repository: signoz/signoz-otel-collector
     tag: v0.129.12  # Latest recommended version

-  # OpAMP Configuration - Enabled for dynamic configuration management
-  # Note: OpAMP allows remote configuration management via SigNoz backend
-  # This replaces the manual kubectl patch approach
+  # OpAMP Configuration - DISABLED for development
+  # OpAMP is designed for production with remote config management
+  # In dev, it causes gRPC instability and collector reloads
+  # We use static configuration instead

   # Init containers for the Otel Collector pod
   initContainers:
@@ -231,6 +232,9 @@ otelCollector:
         secretName: postgres-tls
     - name: postgres-tls-fixed
       emptyDir: {}
+    - name: varlogpods
+      hostPath:
+        path: /var/log/pods

   extraVolumeMounts:
     - name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
     - name: postgres-tls-fixed
      mountPath: /etc/postgres-tls
      readOnly: false
+    - name: varlogpods
+      mountPath: /var/log/pods
+      readOnly: true

-  # Enable OpAMP for dynamic configuration management
+  # Disable OpAMP - use static configuration only
+  # Use 'args' instead of 'extraArgs' to completely override the command
   command:
     name: /signoz-otel-collector
-  extraArgs:
+  args:
     - --config=/conf/otel-collector-config.yaml
-    - --manager-config=/conf/otel-collector-opamp-config.yaml
     - --feature-gates=-pkg.translator.prometheus.NormalizeName

   # OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
           allowed_origins:
             - "*"

+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
+            timestamp:
+              parse_from: attributes.time
+              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+          # Fix timestamp parsing - extract from the parsed time field
+          - type: move
+            from: attributes.time
+            to: attributes.timestamp
+          # Extract Kubernetes metadata from file path
+          - type: regex_parser
+            id: extract_metadata_from_filepath
+            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
+            parse_from: attributes["log.file.path"]
+          # Move metadata to resource attributes
+          - type: move
+            from: attributes.namespace
+            to: resource["k8s.namespace.name"]
+          - type: move
+            from: attributes.pod_name
+            to: resource["k8s.pod.name"]
+          - type: move
+            from: attributes.container_name
+            to: resource["k8s.container.name"]
+          - type: move
+            from: attributes.log
+            to: body
+      # Kubernetes Cluster Receiver - Collects cluster-level metrics
+      # Provides information about nodes, namespaces, pods, and other cluster resources
+      k8s_cluster:
+        collection_interval: 30s
+        node_conditions_to_report:
+          - Ready
+          - MemoryPressure
+          - DiskPressure
+          - PIDPressure
+          - NetworkUnavailable
+        allocatable_types_to_report:
+          - cpu
+          - memory
+          - pods

       # PostgreSQL receivers for database metrics
       # ENABLED: Monitor users configured and credentials stored in secrets
       # Collects metrics directly from PostgreSQL databases with proper TLS
@@ -538,6 +602,43 @@ otelCollector:
           password: ${env:RABBITMQ_PASSWORD}
         collection_interval: 30s

+      # Prometheus Receiver - Scrapes metrics from Kubernetes API
+      # Simplified configuration using only Kubernetes API metrics
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: 'kubernetes-nodes-cadvisor'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - action: labelmap
+                  regex: __meta_kubernetes_node_label_(.+)
+                - target_label: __address__
+                  replacement: kubernetes.default.svc:443
+                - source_labels: [__meta_kubernetes_node_name]
+                  regex: (.+)
+                  target_label: __metrics_path__
+                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            - job_name: 'kubernetes-apiserver'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: endpoints
+              relabel_configs:
+                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+                  action: keep
+                  regex: default;kubernetes;https
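            # For reference, the cadvisor job's relabeling above proxies node
            # scrapes through the API server; a hypothetical resulting target:
            #   https://kubernetes.default.svc:443/api/v1/nodes/worker-1/proxy/metrics/cadvisor
            # ("worker-1" is a placeholder node name)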
     processors:
       # Batch processor for better performance (optimized for high throughput)
       batch:
@@ -562,6 +663,25 @@ otelCollector:
           detectors: [env, system, docker]
           timeout: 5s

+      # Kubernetes attributes processor - CRITICAL for logs
+      # Extracts pod, namespace, container metadata from log attributes
+      k8sattributes:
+        auth_type: "serviceAccount"
+        passthrough: false
+        extract:
+          metadata:
+            - k8s.pod.name
+            - k8s.pod.uid
+            - k8s.deployment.name
+            - k8s.namespace.name
+            - k8s.node.name
+            - k8s.container.name
+          labels:
+            - tag_name: "app"
+            - tag_name: "pod-template-hash"
+          annotations:
+            - tag_name: "description"
       # SigNoz span metrics processor with delta aggregation (recommended)
       # Generates RED metrics (Rate, Error, Duration) from trace spans
       signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
                       postgresql/orchestrator, postgresql/pos, postgresql/procurement,
                       postgresql/production, postgresql/recipes, postgresql/sales,
                       postgresql/suppliers, postgresql/tenant, postgresql/training,
-                      redis, rabbitmq]
+                      redis, rabbitmq, k8s_cluster, prometheus]
           processors: [memory_limiter, batch, resourcedetection]
           exporters: [signozclickhousemetrics]
@@ -653,17 +773,38 @@ otelCollector:
           processors: [batch/meter]
           exporters: [signozclickhousemeter]

-        # Logs pipeline
+        # Logs pipeline - includes both OTLP and Kubernetes pod logs
         logs:
-          receivers: [otlp]
-          processors: [memory_limiter, batch, resourcedetection]
+          receivers: [otlp, filelog]
+          processors: [memory_limiter, batch, resourcedetection, k8sattributes]
           exporters: [clickhouselogsexporter]

 # Additional Configuration
 serviceAccount:
   create: true
   annotations: {}
-  name: ""
+  name: "signoz-otel-collector"
+
+# RBAC Configuration for Kubernetes monitoring
+# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
+rbac:
+  create: true
+  rules:
+    - apiGroups: [""]
+      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["apps"]
+      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["batch"]
+      resources: ["jobs", "cronjobs"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["extensions"]
+      resources: ["deployments", "daemonsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["metrics.k8s.io"]
+      resources: ["nodes", "pods"]
+      verbs: ["get", "list", "watch"]
# Security Context
securityContext:

View File

@@ -66,6 +66,11 @@ signoz:
     signoz_traces_ttl_duration_hrs: "720"
     signoz_metrics_ttl_duration_hrs: "720"
     signoz_logs_ttl_duration_hrs: "720"
+    # OpAMP Server Configuration
+    # WARNING: OpAMP can cause gRPC instability and collector reloads
+    # Only enable if you have a stable OpAMP backend server
+    signoz_opamp_server_enabled: "false"
+    # signoz_opamp_server_endpoint: "0.0.0.0:4320"

     # SMTP configuration for email alerts
     signoz_smtp_enabled: "true"
     signoz_smtp_host: "smtp.gmail.com"
@@ -247,17 +252,52 @@ otelCollector:
     tag: v0.129.12  # Updated to latest recommended version
     pullPolicy: IfNotPresent

+  # Init containers for the Otel Collector pod
+  initContainers:
+    fix-postgres-tls:
+      enabled: true
+      image:
+        registry: docker.io
+        repository: busybox
+        tag: 1.35
+        pullPolicy: IfNotPresent
+      command:
+        - sh
+        - -c
+        - |
+          echo "Fixing PostgreSQL TLS file permissions..."
+          cp /etc/postgres-tls-source/* /etc/postgres-tls/
+          chmod 600 /etc/postgres-tls/server-key.pem
+          chmod 644 /etc/postgres-tls/server-cert.pem
+          chmod 644 /etc/postgres-tls/ca-cert.pem
+          echo "PostgreSQL TLS permissions fixed"
+      volumeMounts:
+        - name: postgres-tls-source
+          mountPath: /etc/postgres-tls-source
+          readOnly: true
+        - name: postgres-tls-fixed
+          mountPath: /etc/postgres-tls
+          readOnly: false

   service:
     type: ClusterIP
     ports:
       - name: otlp-grpc
         port: 4317
+        targetPort: 4317
+        protocol: TCP
       - name: otlp-http
         port: 4318
+        targetPort: 4318
+        protocol: TCP
+      - name: prometheus
+        port: 8889
+        targetPort: 8889
+        protocol: TCP
       - name: metrics
         port: 8888
-      - name: healthcheck
-        port: 13133
+        targetPort: 8888
+        protocol: TCP
   resources:
     requests:
@@ -267,6 +307,50 @@ otelCollector:
       cpu: 2000m
       memory: 2Gi
+  # Additional environment variables for receivers
+  additionalEnvs:
+    POSTGRES_MONITOR_USER: "monitoring"
+    POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
+    REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
+    RABBITMQ_USER: "bakery"
+    RABBITMQ_PASSWORD: "forecast123"
+
+  # Mount TLS certificates for secure connections
+  extraVolumes:
+    - name: redis-tls
+      secret:
+        secretName: redis-tls-secret
+    - name: postgres-tls
+      secret:
+        secretName: postgres-tls
+    - name: postgres-tls-fixed
+      emptyDir: {}
+    - name: varlogpods
+      hostPath:
+        path: /var/log/pods
+
+  extraVolumeMounts:
+    - name: redis-tls
+      mountPath: /etc/redis-tls
+      readOnly: true
+    - name: postgres-tls
+      mountPath: /etc/postgres-tls-source
+      readOnly: true
+    - name: postgres-tls-fixed
+      mountPath: /etc/postgres-tls
+      readOnly: false
+    - name: varlogpods
+      mountPath: /var/log/pods
+      readOnly: true
+
+  # Enable OpAMP for dynamic configuration management
+  command:
+    name: /signoz-otel-collector
+  extraArgs:
+    - --config=/conf/otel-collector-config.yaml
+    - --manager-config=/conf/otel-collector-opamp-config.yaml
+    - --feature-gates=-pkg.translator.prometheus.NormalizeName

   # Full OTEL Collector Configuration
   config:
     # Connectors - bridge between pipelines
@@ -297,14 +381,358 @@ otelCollector:
             - "https://monitoring.bakewise.ai"
             - "https://*.bakewise.ai"

+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
+            timestamp:
+              parse_from: attributes.time
+              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+          # Fix timestamp parsing - extract from the parsed time field
+          - type: move
+            from: attributes.time
+            to: attributes.timestamp
+          # Extract Kubernetes metadata from file path
+          - type: regex_parser
+            id: extract_metadata_from_filepath
+            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
+            parse_from: attributes["log.file.path"]
+          # Move metadata to resource attributes
+          - type: move
+            from: attributes.namespace
+            to: resource["k8s.namespace.name"]
+          - type: move
+            from: attributes.pod_name
+            to: resource["k8s.pod.name"]
+          - type: move
+            from: attributes.container_name
+            to: resource["k8s.container.name"]
+          - type: move
+            from: attributes.log
+            to: body
+
+      # Kubernetes Cluster Receiver - Collects cluster-level metrics
+      # Provides information about nodes, namespaces, pods, and other cluster resources
+      k8s_cluster:
+        collection_interval: 30s
+        node_conditions_to_report:
+          - Ready
+          - MemoryPressure
+          - DiskPressure
+          - PIDPressure
+          - NetworkUnavailable
+        allocatable_types_to_report:
+          - cpu
+          - memory
+          - pods
       # Prometheus receiver for scraping metrics
       prometheus:
         config:
           scrape_configs:
-            - job_name: 'otel-collector'
+            - job_name: 'kubernetes-nodes-cadvisor'
               scrape_interval: 30s
-              static_configs:
-                - targets: ['localhost:8888']
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - action: labelmap
+                  regex: __meta_kubernetes_node_label_(.+)
+                - target_label: __address__
+                  replacement: kubernetes.default.svc:443
+                - source_labels: [__meta_kubernetes_node_name]
+                  regex: (.+)
+                  target_label: __metrics_path__
+                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            - job_name: 'kubernetes-apiserver'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: endpoints
+              relabel_configs:
+                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+                  action: keep
+                  regex: default;kubernetes;https
+      # Redis receiver for cache metrics
+      # ENABLED: Using existing credentials from redis-secrets with TLS
+      redis:
+        endpoint: redis-service.bakery-ia:6379
+        password: ${env:REDIS_PASSWORD}
+        collection_interval: 60s
+        transport: tcp
+        tls:
+          insecure_skip_verify: false
+          cert_file: /etc/redis-tls/redis-cert.pem
+          key_file: /etc/redis-tls/redis-key.pem
+          ca_file: /etc/redis-tls/ca-cert.pem
+        metrics:
+          redis.maxmemory:
+            enabled: true
+          redis.cmd.latency:
+            enabled: true
+
+      # RabbitMQ receiver via management API
+      # ENABLED: Using existing credentials from rabbitmq-secrets
+      rabbitmq:
+        endpoint: http://rabbitmq-service.bakery-ia:15672
+        username: ${env:RABBITMQ_USER}
+        password: ${env:RABBITMQ_PASSWORD}
+        collection_interval: 30s
+      # PostgreSQL receivers for database metrics
+      # Monitor all databases with proper TLS configuration
+      postgresql/auth:
+        endpoint: auth-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - auth_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/inventory:
+        endpoint: inventory-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - inventory_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/orders:
+        endpoint: orders-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orders_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/ai-insights:
+        endpoint: ai-insights-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - ai_insights_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/alert-processor:
+        endpoint: alert-processor-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - alert_processor_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/distribution:
+        endpoint: distribution-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - distribution_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/external:
+        endpoint: external-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - external_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/forecasting:
+        endpoint: forecasting-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - forecasting_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/notification:
+        endpoint: notification-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - notification_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/orchestrator:
+        endpoint: orchestrator-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orchestrator_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/pos:
+        endpoint: pos-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - pos_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/procurement:
+        endpoint: procurement-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - procurement_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/production:
+        endpoint: production-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - production_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/recipes:
+        endpoint: recipes-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - recipes_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/sales:
+        endpoint: sales-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - sales_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/suppliers:
+        endpoint: suppliers-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - suppliers_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/tenant:
+        endpoint: tenant-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - tenant_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/training:
+        endpoint: training-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - training_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
    processors:
      # High-performance batch processing (official recommendation)
@@ -326,7 +754,7 @@ otelCollector:
      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s
      # Add resource attributes
@@ -339,6 +767,26 @@ otelCollector:
            value: bakery-ia-prod
            action: upsert
      # Kubernetes attributes processor - CRITICAL for logs
      # Extracts pod, namespace, container metadata from log attributes
      k8sattributes:
        auth_type: "serviceAccount"
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
            - k8s.container.name
          labels:
            - tag_name: "app"
            - tag_name: "pod-template-hash"
            - tag_name: "version"
          annotations:
            - tag_name: "description"
      # SigNoz span metrics processor with delta aggregation (recommended)
      # Generates RED metrics (Rate, Error, Duration) from trace spans
      signozspanmetrics/delta:
@@ -354,9 +802,9 @@ otelCollector:
          - name: signoz.collector.id
    exporters:
      # ClickHouse exporter for traces
      clickhousetraces:
        datasource: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
@@ -364,8 +812,9 @@ otelCollector:
          max_interval: 30s
          max_elapsed_time: 300s
      # ClickHouse exporter for metrics
      signozclickhousemetrics:
        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
@@ -375,32 +824,32 @@ otelCollector:
      # ClickHouse exporter for meter data (usage metrics)
      signozclickhousemeter:
        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false
      # ClickHouse exporter for logs
      clickhouselogsexporter:
        dsn: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      # Metadata exporter for service metadata
      metadataexporter:
        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory
      # Debug exporter for debugging (optional)
      debug:
        verbosity: detailed
        sampling_initial: 5
        sampling_thereafter: 200
    service:
      extensions: [health_check, zpages]
@@ -411,9 +860,16 @@ otelCollector:
        processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection, resource]
        exporters: [clickhousetraces, metadataexporter, signozmeter]
      # Metrics pipeline - includes all infrastructure receivers
      metrics:
        receivers: [otlp,
                    postgresql/auth, postgresql/inventory, postgresql/orders,
                    postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
                    postgresql/external, postgresql/forecasting, postgresql/notification,
                    postgresql/orchestrator, postgresql/pos, postgresql/procurement,
                    postgresql/production, postgresql/recipes, postgresql/sales,
                    postgresql/suppliers, postgresql/tenant, postgresql/training,
                    redis, rabbitmq, k8s_cluster, prometheus]
        processors: [memory_limiter, batch, resourcedetection, resource]
        exporters: [signozclickhousemetrics]
@@ -423,10 +879,10 @@ otelCollector:
        processors: [batch/meter]
        exporters: [signozclickhousemeter]
      # Logs pipeline - includes both OTLP and Kubernetes pod logs
      logs:
        receivers: [otlp, filelog]
        processors: [memory_limiter, batch, resourcedetection, resource, k8sattributes]
        exporters: [clickhouselogsexporter]
  # HPA for OTEL Collector
@@ -455,6 +911,27 @@ serviceAccount:
  annotations: {}
  name: "signoz"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster receiver to access Kubernetes API
rbac:
  create: true
  rules:
    - apiGroups: [""]
      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["apps"]
      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["batch"]
      resources: ["jobs", "cronjobs"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["extensions"]
      resources: ["deployments", "daemonsets", "replicasets"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["metrics.k8s.io"]
      resources: ["nodes", "pods"]
      verbs: ["get", "list", "watch"]
# Security Context
securityContext:
  runAsNonRoot: true
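The eighteen postgresql receivers above differ only in the service name and database, which makes the block easy to drift out of sync when edited by hand. A minimal generation sketch, assuming PyYAML is available and the service list is maintained alongside the chart (both are assumptions, not part of this repository):

# Hypothetical helper: emits the repetitive postgresql receiver blocks above.
import yaml  # assumes PyYAML is installed

SERVICES = [
    "auth", "inventory", "orders", "ai-insights", "alert-processor",
    "distribution", "external", "forecasting", "notification", "orchestrator",
    "pos", "procurement", "production", "recipes", "sales", "suppliers",
    "tenant", "training",
]

def postgres_receivers() -> dict:
    receivers = {}
    for svc in SERVICES:
        receivers[f"postgresql/{svc}"] = {
            "endpoint": f"{svc}-db-service.bakery-ia:5432",
            "username": "${env:POSTGRES_MONITOR_USER}",
            "password": "${env:POSTGRES_MONITOR_PASSWORD}",
            # database names use underscores where service names use dashes
            "databases": [f"{svc.replace('-', '_')}_db"],
            "collection_interval": "60s",
            "tls": {
                "insecure": False,
                "cert_file": "/etc/postgres-tls/server-cert.pem",
                "key_file": "/etc/postgres-tls/server-key.pem",
                "ca_file": "/etc/postgres-tls/ca-cert.pem",
            },
        }
    return receivers

print(yaml.safe_dump(postgres_receivers(), sort_keys=False))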

View File

@@ -15,9 +15,13 @@ data:
  LOG_LEVEL: "INFO"
  # Observability Settings - SigNoz enabled
  # Note: Detailed OTEL configuration is in the OBSERVABILITY section below
  ENABLE_TRACING: "true"
  ENABLE_METRICS: "true"
  ENABLE_LOGS: "true"
  ENABLE_OTEL_METRICS: "true"
  ENABLE_SYSTEM_METRICS: "true"
  OTEL_LOGS_EXPORTER: "otlp"
  # Database initialization settings
  # IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -384,15 +388,44 @@ data:
  # ================================================================
  # OBSERVABILITY - SigNoz (Unified Monitoring)
  # ================================================================
  # OpenTelemetry Configuration - Direct to SigNoz OTel Collector
  #
  # ENDPOINT CONFIGURATION:
  # - OTEL_EXPORTER_OTLP_ENDPOINT: Base gRPC endpoint (host:port format, NO http:// prefix)
  #   Used by traces and metrics (gRPC) by default
  #   Format: "host:4317" (gRPC port)
  #
  # PROTOCOL USAGE:
  # - Traces: gRPC (port 4317) - High performance, low latency
  # - Metrics: gRPC (port 4317) - Efficient batch export
  # - Logs: HTTP (port 4318) - Required for OTLP log protocol
  #
  # The monitoring library automatically handles:
  # - Converting gRPC endpoint (4317) to HTTP endpoint (4318) for logs
  # - Adding proper paths (/v1/traces, /v1/metrics, /v1/logs)
  # - Protocol prefixes (http:// for HTTP, none for gRPC)
  #
  # Base OTLP endpoint (gRPC format - used by traces and metrics)
  OTEL_EXPORTER_OTLP_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
  # Protocol configuration (gRPC is recommended for better performance)
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
  # Optional: Signal-specific endpoint overrides (if different from base)
  # OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
  # OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
  # OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
  # Optional: Protocol overrides per signal
  # OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: "grpc"
  # OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: "grpc"
  # Note: Logs always use HTTP protocol regardless of this setting
  # Resource attributes (added to all telemetry signals)
  OTEL_SERVICE_NAME: "bakery-ia"
  OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
  # SigNoz service endpoints (for UI and API access)
  SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
  SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
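As a rough illustration of the gRPC-to-HTTP conversion the comments above describe, a self-contained sketch of deriving the OTLP/HTTP logs URL from the base gRPC endpoint (the function name is hypothetical; the real logic lives in the shared monitoring library):

# Hypothetical sketch of the gRPC -> HTTP endpoint conversion for logs.
def logs_http_endpoint(base_grpc_endpoint: str) -> str:
    """Derive the OTLP/HTTP logs URL: 'host:4317' -> 'http://host:4318/v1/logs'."""
    host, _, port = base_grpc_endpoint.rpartition(":")
    http_port = "4318" if port == "4317" else port
    return f"http://{host}:{http_port}/v1/logs"

assert logs_http_endpoint(
    "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
) == "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/logs"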

View File

@@ -1,104 +1,170 @@
{
  "description": "Alert monitoring and management dashboard",
  "tags": ["alerts", "monitoring", "management"],
  "name": "bakery-ia-alert-management",
  "title": "Bakery IA - Alert Management",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-alerts-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "active-alerts", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "alert-rate", "moved": false, "static": false}
  ],
  "variables": {
    "service": {
      "id": "service-var",
      "name": "service",
      "description": "Filter by service name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'alerts_active' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "active-alerts",
      "title": "Active Alerts",
      "description": "Number of currently active alerts",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "value",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "alerts_active", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [],
              "legend": "Active Alerts",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "alert-rate",
      "title": "Alert Rate",
      "description": "Rate of alerts over time",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "alerts_total", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "alerts/s"
    }
  ]
}
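Every entry in "layout" must reference an existing widget id and vice versa, or the dashboard renders with orphaned panels. A small stdlib-only sketch that catches such mismatches before import (the file name is an assumption):

# Sanity-check a SigNoz dashboard JSON: layout item ids and widget ids must match.
import json

with open("alert-management.json") as f:  # hypothetical file name
    dashboard = json.load(f)

layout_ids = {item["i"] for item in dashboard["layout"]}
widget_ids = {widget["id"] for widget in dashboard["widgets"]}

if layout_ids != widget_ids:
    raise SystemExit(f"mismatch: layout-only={layout_ids - widget_ids}, "
                     f"widget-only={widget_ids - layout_ids}")
print(f"OK: {len(widget_ids)} widgets, all placed in the layout")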

View File

@@ -1,102 +1,351 @@
{
  "description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
  "tags": ["api", "performance", "rest", "graphql"],
  "name": "bakery-ia-api-performance",
  "title": "Bakery IA - API Performance",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-api-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "request-volume", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "avg-response-time", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "p95-latency", "moved": false, "static": false}
  ],
  "variables": {
    "service": {
      "id": "service-var",
      "name": "service",
      "description": "Filter by API service",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'http_server_requests_seconds_count' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "request-volume",
      "title": "Request Volume",
      "description": "API request volume by service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{api.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "req/s"
    },
    {
      "id": "error-rate",
      "title": "Error Rate",
      "description": "API error rate by service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.api}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "5.."}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false},
                {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{api.name}} - {{status_code}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "req/s"
    },
    {
      "id": "avg-response-time",
      "title": "Average Response Time",
      "description": "Average API response time by endpoint",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "http_server_requests_seconds_sum", "dataType": "float64", "type": "Counter", "isColumn": false},
              "timeAggregation": "avg",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.api}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false},
                {"key": "endpoint", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{api.name}} - {{endpoint}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "seconds"
    },
    {
      "id": "p95-latency",
      "title": "P95 Latency",
      "description": "95th percentile latency by endpoint",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "histogram_quantile",
              "aggregateAttribute": {"key": "http_server_requests_seconds_bucket", "dataType": "float64", "type": "Histogram", "isColumn": false},
              "timeAggregation": "avg",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.api}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false},
                {"key": "endpoint", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{api.name}} - {{endpoint}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "seconds"
    }
  ]
}
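The P95 panel relies on histogram_quantile over cumulative _bucket series. A minimal sketch of the underlying interpolation, with made-up sample buckets (a simplification: the trailing +Inf bucket that Prometheus-style histograms carry is not modeled here):

# Minimal histogram_quantile-style estimate from cumulative buckets.
# Buckets are (upper_bound_seconds, cumulative_count); data is illustrative.
def histogram_quantile(q: float, buckets: list[tuple[float, float]]) -> float:
    total = buckets[-1][1]
    rank = q * total
    prev_bound, prev_count = 0.0, 0.0
    for bound, count in buckets:
        if count >= rank:
            # Linear interpolation inside the bucket, as Prometheus does.
            fraction = (rank - prev_count) / max(count - prev_count, 1e-9)
            return prev_bound + (bound - prev_bound) * fraction
        prev_bound, prev_count = bound, count
    return buckets[-1][0]

samples = [(0.05, 120.0), (0.1, 340.0), (0.25, 900.0), (0.5, 980.0), (1.0, 1000.0)]
print(f"p95 ~= {histogram_quantile(0.95, samples):.3f}s")  # ~0.406s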

View File

@@ -1,101 +1,333 @@
{
  "description": "Application performance monitoring dashboard using distributed traces and metrics",
  "tags": ["application", "performance", "traces", "apm"],
  "name": "bakery-ia-application-performance",
  "title": "Bakery IA - Application Performance (APM)",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-apm-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "latency-p99", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "request-rate", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "avg-duration", "moved": false, "static": false}
  ],
  "variables": {
    "service_name": {
      "id": "service-var",
      "name": "service_name",
      "description": "Filter by service name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(serviceName) FROM signoz_traces.distributed_signoz_index_v2 ORDER BY serviceName",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "latency-p99",
      "title": "P99 Latency",
      "description": "99th percentile latency for selected service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "p99",
              "aggregateAttribute": {"key": "duration_ns", "dataType": "float64", "type": "", "isColumn": true},
              "timeAggregation": "avg",
              "spaceAggregation": "p99",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "ms"
    },
    {
      "id": "request-rate",
      "title": "Request Rate",
      "description": "Requests per second for the service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "count",
              "aggregateAttribute": {"key": "", "dataType": "", "type": "", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "reqps"
    },
    {
      "id": "error-rate",
      "title": "Error Rate",
      "description": "Error rate percentage for the service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "count",
              "aggregateAttribute": {"key": "", "dataType": "", "type": "", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "STATUS_CODE_ERROR"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "reqps"
    },
    {
      "id": "avg-duration",
      "title": "Average Duration",
      "description": "Average request duration",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "duration_ns", "dataType": "float64", "type": "", "isColumn": true},
              "timeAggregation": "avg",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "ms"
    }
  ]
}
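The error-rate widget counts spans whose status_code is STATUS_CODE_ERROR. For reference, a sketch of how a service span ends up in that bucket with the OpenTelemetry Python SDK (the span name, attribute, and failure are illustrative, not taken from this codebase):

# Spans marked with StatusCode.ERROR are what the APM error-rate panel counts.
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode

tracer = trace.get_tracer("bakery-ia.example")  # illustrative instrumentation name

def process_order(order_id: str) -> None:
    with tracer.start_as_current_span("process_order") as span:
        span.set_attribute("order.id", order_id)
        try:
            raise ValueError("out of flour")  # simulated failure
        except ValueError as exc:
            span.record_exception(exc)
            span.set_status(Status(StatusCode.ERROR, str(exc)))
            raise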

View File

@@ -1,101 +1,425 @@
{
  "description": "Comprehensive database performance monitoring for PostgreSQL, Redis, and RabbitMQ",
  "tags": ["database", "postgresql", "redis", "rabbitmq", "performance"],
  "name": "bakery-ia-database-performance",
  "title": "Bakery IA - Database Performance",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-db-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "pg-connections", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "pg-db-size", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "redis-connected-clients", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "redis-memory", "moved": false, "static": false},
    {"x": 0, "y": 6, "w": 6, "h": 3, "i": "rabbitmq-messages", "moved": false, "static": false},
    {"x": 6, "y": 6, "w": 6, "h": 3, "i": "rabbitmq-consumers", "moved": false, "static": false}
  ],
  "variables": {
    "database": {
      "id": "database-var",
      "name": "database",
      "description": "Filter by PostgreSQL database name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['postgresql.database.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "pg-connections",
      "title": "PostgreSQL - Active Connections",
      "description": "Number of active PostgreSQL connections",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "postgresql.backends", "dataType": "float64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.database}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{postgresql.database.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "pg-db-size",
      "title": "PostgreSQL - Database Size",
      "description": "Size of PostgreSQL databases in bytes",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "postgresql.db_size", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.database}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{postgresql.database.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "bytes"
    },
    {
      "id": "redis-connected-clients",
      "title": "Redis - Connected Clients",
      "description": "Number of clients connected to Redis",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "redis.clients.connected", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "host.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{host.name}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "redis-memory",
      "title": "Redis - Memory Usage",
      "description": "Redis memory usage in bytes",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "redis.memory.used", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "host.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{host.name}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "bytes"
    },
    {
      "id": "rabbitmq-messages",
      "title": "RabbitMQ - Current Messages",
      "description": "Number of messages currently in RabbitMQ queues",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "rabbitmq.message.current", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "queue", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "Queue: {{queue}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "rabbitmq-consumers",
      "title": "RabbitMQ - Consumer Count",
      "description": "Number of consumers per queue",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "rabbitmq.consumer.count", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "queue", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "Queue: {{queue}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    }
  ]
}
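The database variable above is populated by a ClickHouse query against the SigNoz metrics tables. A rough sketch of running the same query by hand, assuming the clickhouse-driver package and the connection details implied by the collector DSNs earlier in this commit (both are assumptions):

# Assumption: clickhouse-driver is installed and the SigNoz ClickHouse
# instance is reachable with the credentials used by the collector exporters.
from clickhouse_driver import Client

client = Client(host="signoz-clickhouse", port=9000, user="admin",
                password="...")  # credential deliberately elided here
rows = client.execute(
    "SELECT DISTINCT(resource_attrs['postgresql.database.name']) AS value "
    "FROM signoz_metrics.distributed_time_series_v4_1day "
    "WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value"
)
print([r[0] for r in rows])  # databases currently reporting postgresql.db_size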

View File

@@ -1,105 +1,348 @@
{
  "description": "Comprehensive error tracking and analysis dashboard",
  "tags": ["errors", "exceptions", "tracking"],
  "name": "bakery-ia-error-tracking",
  "title": "Bakery IA - Error Tracking",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-errors-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "total-errors", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "http-5xx", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "http-4xx", "moved": false, "static": false}
  ],
  "variables": {
    "service": {
      "id": "service-var",
      "name": "service",
      "description": "Filter by service name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'error_total' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "total-errors",
      "title": "Total Errors",
      "description": "Total number of errors across all services",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "value",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "error_total", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "sum",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [],
              "legend": "Total Errors",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "error-rate",
      "title": "Error Rate",
      "description": "Error rate over time",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "error_total", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "errors/s"
    },
    {
      "id": "http-5xx",
      "title": "HTTP 5xx Errors",
      "description": "Server errors (5xx status codes)",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "sum",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "5.."}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true},
                {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{serviceName}} - {{status_code}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "number"
    },
    {
      "id": "http-4xx",
      "title": "HTTP 4xx Errors",
      "description": "Client errors (4xx status codes)",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "sum",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "4.."}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true},
                {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{serviceName}} - {{status_code}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "number"
    }
  ]
}
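Both error panels aggregate a monotonic counter; with timeAggregation "rate" the charted value is the per-second increase between adjacent samples. A tiny sketch of that computation, including the usual counter-reset guard (the sample values are made up):

# Per-second rate from two samples of a monotonic counter such as error_total.
def counter_rate(v0: float, t0: float, v1: float, t1: float) -> float:
    delta = v1 - v0
    if delta < 0:  # counter reset (e.g. pod restart): treat v1 as the increase
        delta = v1
    return delta / (t1 - t0)

print(counter_rate(v0=1200, t0=0, v1=1260, t1=60))  # 1.0 errors/s
print(counter_rate(v0=1200, t0=0, v1=30, t1=60))    # after a reset: 0.5 errors/s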

View File

@@ -1,105 +1,423 @@
{
  "description": "Comprehensive infrastructure monitoring dashboard for Bakery IA Kubernetes cluster",
  "tags": ["infrastructure", "kubernetes", "k8s", "system"],
  "name": "bakery-ia-infrastructure-monitoring",
  "title": "Bakery IA - Infrastructure Monitoring",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-infra-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "pod-count", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "pod-phase", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "container-restarts", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "node-condition", "moved": false, "static": false},
    {"x": 0, "y": 6, "w": 12, "h": 3, "i": "deployment-status", "moved": false, "static": false}
  ],
  "variables": {
    "namespace": {
      "id": "namespace-var",
      "name": "namespace",
      "description": "Filter by Kubernetes namespace",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'k8s.pod.phase' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": "bakery-ia"
    }
  },
  "widgets": [
    {
      "id": "pod-count",
      "title": "Total Pods",
      "description": "Total number of pods in the namespace",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "value",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "count",
              "aggregateAttribute": {"key": "k8s.pod.phase", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [],
              "legend": "Total Pods",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "pod-phase",
      "title": "Pod Phase Distribution",
      "description": "Pods by phase (Running, Pending, Failed, etc.)",
      "isStacked": true,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "k8s.pod.phase", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "phase", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{phase}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "container-restarts",
      "title": "Container Restarts",
      "description": "Container restart count over time",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "k8s.container.restarts", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "increase",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.pod.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.pod.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "node-condition",
      "title": "Node Conditions",
      "description": "Node condition status (Ready, MemoryPressure, DiskPressure, etc.)",
      "isStacked": true,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "k8s.node.condition_ready", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.node.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.node.name}} Ready",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "deployment-status",
      "title": "Deployment Status (Desired vs Available)",
      "description": "Deployment replicas: desired vs available",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "k8s.deployment.desired", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.deployment.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.deployment.name}} (desired)",
              "reduceTo": "avg"
            },
            {
              "dataSource": "metrics",
              "queryName": "B",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "k8s.deployment.available", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "B",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.deployment.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.deployment.name}} (available)",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    }
  ]
}
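The k8s.pod.phase gauge emitted by the k8s_cluster receiver encodes the pod phase as an integer. A small decoding sketch; the 1-5 mapping follows the receiver's documented encoding and is worth re-checking against the deployed collector version:

# k8s_cluster receiver encodes k8s.pod.phase as an integer gauge.
POD_PHASE = {1: "Pending", 2: "Running", 3: "Succeeded", 4: "Failed", 5: "Unknown"}

def decode_phase(value: int) -> str:
    return POD_PHASE.get(value, f"unrecognized ({value})")

print(decode_phase(2))  # Running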

View File

@@ -1,99 +1,333 @@
{ {
"dashboard": {
"title": "Bakery IA - Log Analysis",
"description": "Comprehensive log analysis and search dashboard", "description": "Comprehensive log analysis and search dashboard",
"tags": ["logs", "analysis", "search"], "tags": ["logs", "analysis", "search"],
"panels": [ "name": "bakery-ia-log-analysis",
"title": "Bakery IA - Log Analysis",
"uploadedGrafana": false,
"uuid": "bakery-ia-logs-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{ {
"title": "Log Volume", "x": 0,
"type": "timeseries", "y": 0,
"query": { "w": 6,
"metric": "log_lines_total", "h": 3,
"aggregate": "sum", "i": "log-volume",
"groupBy": ["service"], "moved": false,
"filters": [ "static": false
},
{ {
"key": "service", "x": 6,
"operator": "=", "y": 0,
"value": "${service}" "w": 6,
"h": 3,
"i": "error-logs",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-level",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-service",
"moved": false,
"static": false
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'log_lines_total' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "log-volume",
"title": "Log Volume",
"description": "Total log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": { "query": {
"metric": "log_lines_total", "builder": {
"aggregate": "sum", "queryData": [
"groupBy": ["service"],
"filters": [
{ {
"key": "service", "dataSource": "metrics",
"operator": "=", "queryName": "A",
"value": "${service}" "aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "error-logs",
"title": "Error Logs",
"description": "Error log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "level", "key": "level",
"operator": "=", "dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=",
"value": "error" "value": "error"
} }
] ],
"op": "AND"
}, },
"unit": "logs/s" "expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}} (errors)",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "logs-by-level",
"title": "Logs by Level",
"description": "Distribution of logs by severity level",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
} }
] ],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "level",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{level}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "logs-by-service",
"title": "Logs by Service",
"description": "Distribution of logs by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
]
}

View File

@@ -1,92 +1,295 @@
{
"description": "Comprehensive system health monitoring dashboard",
"tags": ["system", "health", "monitoring"],
"name": "bakery-ia-system-health",
"title": "Bakery IA - System Health",
"uploadedGrafana": false,
"uuid": "bakery-ia-health-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "system-availability",
"moved": false,
"static": false
},
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "health-score",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "cpu-usage",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "memory-usage",
"moved": false,
"static": false
}
],
"variables": {
"namespace": {
"id": "namespace-var",
"name": "namespace",
"description": "Filter by Kubernetes namespace",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'system_availability' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": "bakery-ia"
}
},
"widgets": [
{
"id": "system-availability",
"title": "System Availability",
"description": "Overall system availability percentage",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_availability",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "System Availability",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "health-score",
"title": "Service Health Score",
"description": "Overall service health score",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "service_health_score",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Health Score",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "cpu-usage",
"title": "CPU Usage",
"description": "System CPU usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_cpu_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "CPU Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "memory-usage",
"title": "Memory Usage",
"description": "System memory usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_memory_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Memory Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
}
]
}

View File

@@ -1,96 +1,323 @@
{
"description": "User activity and behavior monitoring dashboard",
"tags": ["user", "activity", "behavior"],
"name": "bakery-ia-user-activity",
"title": "Bakery IA - User Activity",
"uploadedGrafana": false,
"uuid": "bakery-ia-user-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "active-users",
"moved": false,
"static": false
},
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "user-sessions",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "api-calls-per-user",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "session-duration",
"moved": false,
"static": false
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'active_users' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "active-users",
"title": "Active Users",
"description": "Number of active users by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "active_users",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{service.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "user-sessions",
"title": "User Sessions",
"description": "Total user sessions by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "user_sessions_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "api-calls-per-user",
"title": "API Calls per User",
"description": "Average API calls per user by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "api_calls_per_user",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "session-duration",
"title": "Session Duration",
"description": "Average session duration by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "session_duration_seconds",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
}
]
}
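The three dashboards above share one convention: every builder-query filter references a dashboard variable through a {{.name}} placeholder, and each placeholder needs a matching entry in the top-level "variables" map. A small check script can catch drift between the two. This is not part of the commit; it is an illustrative sketch, and the file names are hypothetical.

import json
import re
import sys

# Hypothetical file names; the repository layout for these dashboards is not shown here.
DASHBOARDS = [
    "log-analysis.json",
    "system-health.json",
    "user-activity.json",
]

# Matches template placeholders such as {{.namespace}} or {{.service}}
placeholder = re.compile(r"\{\{\.(\w+)\}\}")

for path in DASHBOARDS:
    with open(path) as f:
        dash = json.load(f)
    declared = set(dash.get("variables", {}).keys())
    used = set(placeholder.findall(json.dumps(dash.get("widgets", []))))
    missing = used - declared
    if missing:
        sys.exit(f"{path}: placeholders without a variable definition: {sorted(missing)}")
    print(f"{path}: OK ({sorted(used)})")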

View File

@@ -1,160 +1,61 @@
"""Main FastAPI application for AI Insights Service.""" """Main FastAPI application for AI Insights Service."""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog import structlog
import os
from app.core.config import settings from app.core.config import settings
from app.core.database import init_db, close_db from app.core.database import init_db, close_db
from app.api import insights from app.api import insights
from shared.monitoring.logging import setup_logging from shared.service_base import StandardFastAPIService
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports # Initialize logger
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("ai-insights")
# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger() logger = structlog.get_logger()
# Setup OpenTelemetry logging export if enabled
logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}")
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
logger.info("Attempting to setup OpenTelemetry logging")
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION)
if result:
logger.info("OpenTelemetry logs export enabled for ai-insights")
else:
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
class AIInsightsService(StandardFastAPIService):
"""AI Insights Service with standardized monitoring setup"""
@asynccontextmanager async def on_startup(self, app):
async def lifespan(app: FastAPI): """Custom startup logic for AI Insights"""
"""Lifespan event handler for startup and shutdown.""" # Initialize database
# Startup
logger.info("Starting AI Insights Service", service=settings.SERVICE_NAME, version=settings.SERVICE_VERSION)
await init_db() await init_db()
logger.info("Database initialized") logger.info("Database initialized")
# Initialize system metrics collection await super().on_startup(app)
system_metrics = SystemMetricsCollector("ai-insights")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed async def on_shutdown(self, app):
logger.info("Metrics export configured via OpenTelemetry OTLP") """Custom shutdown logic for AI Insights"""
await super().on_shutdown(app)
yield # Close database
# Shutdown
logger.info("Shutting down AI Insights Service")
await close_db() await close_db()
logger.info("Database connections closed") logger.info("Database connections closed")
# Create FastAPI app # Create service instance
app = FastAPI( service = AIInsightsService(
title="AI Insights Service", service_name="ai-insights",
app_name="AI Insights Service",
description="Intelligent insights and recommendations for bakery operations", description="Intelligent insights and recommendations for bakery operations",
version=settings.SERVICE_VERSION, version=settings.SERVICE_VERSION,
lifespan=lifespan log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=getattr(settings, 'ALLOWED_ORIGINS', ["*"]),
api_prefix=settings.API_V1_PREFIX,
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
) )
# Instrument FastAPI with OpenTelemetry # Create FastAPI app
FastAPIInstrumentor.instrument_app(app) app = service.create_app()
# Instrument httpx for outgoing requests # Add service-specific routers
HTTPXClientInstrumentor().instrument() service.add_router(
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=settings.ALLOWED_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(
insights.router, insights.router,
prefix=settings.API_V1_PREFIX,
tags=["insights"] tags=["insights"]
) )
@app.get("/")
async def root():
"""Root endpoint."""
return {
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION,
"status": "running"
}
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn
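shared.service_base.StandardFastAPIService itself is not shown in this commit. From the call sites above, a minimal sketch of the contract these services rely on could look as follows; the constructor keywords and the hook ordering are inferred from usage and should be treated as assumptions, not the actual implementation.

from contextlib import asynccontextmanager

from fastapi import FastAPI


class StandardFastAPIService:
    """Sketch of the assumed base class: it owns logging/tracing/metrics setup
    and exposes on_startup/on_shutdown hooks plus create_app()/add_router()."""

    def __init__(self, service_name, app_name, description, version, **options):
        self.service_name = service_name
        self.app_name = app_name
        self.description = description
        self.version = version
        # e.g. log_level, cors_origins, api_prefix, enable_tracing, ...
        self.options = options
        self.app = None

    async def on_startup(self, app):
        # The real class presumably starts telemetry, system metrics, health checks.
        pass

    async def on_shutdown(self, app):
        pass

    def create_app(self, **fastapi_kwargs) -> FastAPI:
        @asynccontextmanager
        async def lifespan(app: FastAPI):
            await self.on_startup(app)
            yield
            await self.on_shutdown(app)

        self.app = FastAPI(
            title=self.app_name,
            description=self.description,
            version=self.version,
            lifespan=lifespan,
            **fastapi_kwargs,
        )
        return self.app

    def add_router(self, router, **kwargs):
        # Assumed to apply the configured API prefix for the service.
        self.app.include_router(router, prefix=self.options.get("api_prefix", ""), **kwargs)

Subclasses override the hooks and call super(), which is why startup order matters: service-specific resources (database, Redis, consumers) come up before the shared monitoring hook, and are torn down after it.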

View File

@@ -4,90 +4,28 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""
import structlog

from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.service_base import StandardFastAPIService

# Initialize logger
logger = structlog.get_logger()

# Global consumer instance
consumer: EventConsumer = None


class AlertProcessorService(StandardFastAPIService):
    """Alert Processor Service with standardized monitoring setup and RabbitMQ consumer"""

    async def on_startup(self, app):
        """Custom startup logic for Alert Processor"""
        global consumer

        # Initialize Redis connection
        await initialize_redis(
            settings.REDIS_URL,
@@ -96,69 +34,48 @@ async def lifespan(app: FastAPI):
        )
        logger.info("redis_initialized")

        # Start RabbitMQ consumer
        consumer = EventConsumer()
        await consumer.start()
        logger.info("rabbitmq_consumer_started")

        await super().on_startup(app)

    async def on_shutdown(self, app):
        """Custom shutdown logic for Alert Processor"""
        global consumer

        await super().on_shutdown(app)

        # Stop RabbitMQ consumer
        if consumer:
            await consumer.stop()
            logger.info("rabbitmq_consumer_stopped")

        # Close Redis
        await close_redis()
        logger.info("redis_closed")


# Create service instance
service = AlertProcessorService(
    service_name="alert-processor",
    app_name="Alert Processor Service",
    description="Event processing, enrichment, and alert management system",
    version=settings.VERSION,
    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
    cors_origins=["*"],  # Configure appropriately for production
    api_prefix="/api/v1",
    enable_metrics=True,
    enable_health_checks=True,
    enable_tracing=True,
    enable_cors=True
)

# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)

# Add service-specific routers
app.include_router(
    alerts.router,
    prefix="/api/v1/tenants/{tenant_id}",
@@ -172,34 +89,6 @@ app.include_router(
)

if __name__ == "__main__":
    import uvicorn

View File

@@ -3,192 +3,74 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""
import structlog

from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.service_base import StandardFastAPIService

# Initialize logger
logger = structlog.get_logger()

# Initialize database manager
db_manager = DatabaseManager()


class DemoSessionService(StandardFastAPIService):
    """Demo Session Service with standardized monitoring setup"""

    async def on_startup(self, app):
        """Custom startup logic for Demo Session"""
        # Initialize database
        db_manager.initialize()

        # Initialize Redis
        await initialize_redis(
            redis_url=settings.REDIS_URL,
            db=0,
            max_connections=50
        )

        await super().on_startup(app)

    async def on_shutdown(self, app):
        """Custom shutdown logic for Demo Session"""
        await super().on_shutdown(app)

        # Cleanup
        await db_manager.close()
        await close_redis()


# Create service instance
service = DemoSessionService(
    service_name="demo-session",
    app_name="Demo Session Service",
    description="Manages isolated demo sessions for prospect users",
    version=settings.VERSION,
    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
    cors_origins=["*"],  # Configure appropriately for production
    api_prefix="/api/v1",
    enable_metrics=True,
    enable_health_checks=True,
    enable_tracing=True,
    enable_cors=True
)

# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)

# Add service-specific routers
app.include_router(demo_sessions.router)
app.include_router(demo_accounts.router)
app.include_router(demo_operations.router)
app.include_router(internal.router)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(

View File

@@ -1,14 +1,34 @@
""" """
Shared monitoring package for microservices Shared monitoring package for microservices
Provides unified OpenTelemetry-based observability:
- Traces: Distributed tracing
- Metrics: System and application metrics
- Logs: Structured logging
All signals exported to SigNoz via OTLP.
""" """
# Core setup - START HERE
from .logging import setup_logging from .logging import setup_logging
from .metrics import setup_metrics_early, get_metrics_collector, MetricsCollector from .telemetry import (
from .health_checks import ( setup_telemetry,
HealthCheckManager, setup_telemetry_simple,
FastAPIHealthChecker, get_telemetry_status,
create_health_manager, TelemetryProviders
setup_fastapi_health_checks )
# Configuration
from .otel_config import OTelConfig, OTelEndpoints
# Individual signal setup (used by telemetry.py)
from .tracing import (
setup_tracing,
get_current_trace_id,
get_current_span_id,
add_trace_attributes,
add_trace_event,
record_exception
) )
from .logs_exporter import ( from .logs_exporter import (
setup_otel_logging, setup_otel_logging,
@@ -27,23 +47,51 @@ from .system_metrics import (
    setup_all_metrics
)

# Health checks
from .health_checks import (
    HealthCheckManager,
    FastAPIHealthChecker,
    create_health_manager,
    setup_fastapi_health_checks
)

__all__ = [
    # CORE - Start with these
    'setup_logging',
    'setup_telemetry',
    'setup_telemetry_simple',
    'get_telemetry_status',
    'TelemetryProviders',

    # Configuration
    'OTelConfig',
    'OTelEndpoints',

    # Tracing
    'setup_tracing',
    'get_current_trace_id',
    'get_current_span_id',
    'add_trace_attributes',
    'add_trace_event',
    'record_exception',

    # Logs
    'setup_otel_logging',
    'add_log_context',
    'get_current_trace_context',
    'StructlogOTELProcessor',

    # Metrics
    'setup_otel_metrics',
    'OTelMetricsCollector',
    'create_dual_metrics_collector',
    'SystemMetricsCollector',
    'ApplicationMetricsCollector',
    'setup_all_metrics',

    # Health checks
    'HealthCheckManager',
    'FastAPIHealthChecker',
    'create_health_manager',
    'setup_fastapi_health_checks',
]
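A typical consumer of the reorganized package would now go through setup_telemetry instead of wiring tracing, metrics, and logs individually. The exact signature lives in telemetry.py, which is not part of this excerpt, so the call below is a hedged sketch under that assumption.

from shared.monitoring import setup_logging, setup_telemetry

# Assumed call shape; telemetry.py defines the real parameters.
setup_logging("auth-service", "INFO")
providers = setup_telemetry(service_name="auth-service", service_version="1.0.0")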

View File

@@ -1,6 +1,6 @@
""" """
OpenTelemetry Logs Integration for SigNoz OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via OpenTelemetry Collector Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
""" """
import os import os
@@ -10,14 +10,21 @@ from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource

# Try to import HTTP log exporter (logs always use HTTP)
try:
    from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
    HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
    try:
        from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
        HTTP_LOG_EXPORTER_AVAILABLE = True
    except ImportError:
        OTLPLogExporter = None
        HTTP_LOG_EXPORTER_AVAILABLE = False

from .otel_config import OTelConfig

logger = structlog.get_logger()
@@ -31,13 +38,14 @@ def setup_otel_logging(
""" """
Setup OpenTelemetry logging to export logs to SigNoz. Setup OpenTelemetry logging to export logs to SigNoz.
This integrates with Python's standard logging to automatically Uses HTTP protocol (port 4318) for sending logs to SigNoz.
export all log records to SigNoz via the OTLP protocol. Integrates with Python's standard logging to automatically export
all log records to SigNoz via the OTLP HTTP protocol.
Args: Args:
service_name: Name of the service (e.g., "auth-service") service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env) otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
enable_console: Whether to also log to console (default: True) enable_console: Whether to also log to console (default: True)
Returns: Returns:
@@ -47,7 +55,7 @@ def setup_otel_logging(
        from shared.monitoring.logs_exporter import setup_otel_logging

        # Setup during service initialization
        handler = setup_otel_logging("auth-service", "1.0.0")

        # Now all standard logging calls will be exported to SigNoz
        import logging
@@ -56,7 +64,7 @@ def setup_otel_logging(
""" """
# Check if logging export is enabled # Check if logging export is enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp": if not OTelConfig.is_enabled("logs"):
logger.info( logger.info(
"OpenTelemetry logs export disabled", "OpenTelemetry logs export disabled",
service=service_name, service=service_name,
@@ -64,59 +72,36 @@ def setup_otel_logging(
        )
        return None

    # Check if HTTP log exporter is available
    if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
        logger.warning(
            "OpenTelemetry HTTP log exporter not available",
            service=service_name,
            reason="opentelemetry-exporter-otlp-proto-http package not installed"
        )
        return None

    try:
        # Get endpoints from centralized config
        endpoints = OTelConfig.get_endpoints()

        # Use provided endpoint or get from config
        if otel_endpoint:
            http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
        else:
            http_endpoint = endpoints.logs_http

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure logger provider
        logger_provider = LoggerProvider(resource=resource)
        set_logger_provider(logger_provider)

        # Configure OTLP HTTP exporter for logs
        otlp_exporter = OTLPLogExporter(
            endpoint=http_endpoint,
            timeout=10
        )
@@ -135,9 +120,10 @@ def setup_otel_logging(
        root_logger.addHandler(otel_handler)

        logger.info(
            "OpenTelemetry logs export configured successfully",
            service=service_name,
            http_endpoint=http_endpoint,
            protocol="http",
            console_logging=enable_console
        )
@@ -147,8 +133,7 @@ def setup_otel_logging(
        logger.error(
            "Failed to setup OpenTelemetry logs export",
            service=service_name,
            error=str(e)
        )
        return None
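Putting the pieces together, a service enables log export roughly as follows. Whether OTelConfig.is_enabled("logs") still keys off OTEL_LOGS_EXPORTER=otlp is an assumption carried over from the previous version of this module.

import logging

from shared.monitoring.logs_exporter import setup_otel_logging

# Gated by OTelConfig.is_enabled("logs"); assumed to mean OTEL_LOGS_EXPORTER=otlp.
handler = setup_otel_logging("auth-service", "1.0.0")
if handler is not None:
    logging.getLogger(__name__).info("this record is exported to SigNoz over HTTP")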

View File

@@ -1,6 +1,6 @@
""" """
OpenTelemetry Metrics Integration for SigNoz OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
""" """
import os import os
@@ -9,8 +9,24 @@ from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

# Import both gRPC and HTTP exporters
try:
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
    GRPC_AVAILABLE = True
except ImportError:
    GRPC_AVAILABLE = False
    GrpcMetricExporter = None

try:
    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
    HTTP_AVAILABLE = True
except ImportError:
    HTTP_AVAILABLE = False
    HttpMetricExporter = None

from .otel_config import OTelConfig

logger = structlog.get_logger()
@@ -19,20 +35,21 @@ def setup_otel_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    otel_endpoint: Optional[str] = None,
    export_interval_millis: int = 60000,  # Export every 60 seconds
    protocol: Optional[str] = None  # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
    """
    Setup OpenTelemetry metrics to export to SigNoz.

    Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
    Default protocol is gRPC for better performance.

    Args:
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        otel_endpoint: Optional override for OTLP endpoint
        export_interval_millis: How often to push metrics in milliseconds (default 60s)
        protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"

    Returns:
        MeterProvider instance if successful, None otherwise
@@ -40,9 +57,12 @@ def setup_otel_metrics(
    Example:
        from shared.monitoring.metrics_exporter import setup_otel_metrics

        # Setup with gRPC (default)
        meter_provider = setup_otel_metrics("auth-service", "1.0.0")

        # Or with HTTP
        meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")

        # Create meters for your metrics
        meter = meter_provider.get_meter(__name__)
        request_counter = meter.create_counter(
@@ -56,8 +76,7 @@ def setup_otel_metrics(
""" """
# Check if metrics export is enabled # Check if metrics export is enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true" if not OTelConfig.is_enabled("metrics"):
if not enable_otel_metrics:
logger.info( logger.info(
"OpenTelemetry metrics export disabled", "OpenTelemetry metrics export disabled",
service=service_name, service=service_name,
@@ -65,30 +84,64 @@ def setup_otel_metrics(
        )
        return None

    # Determine protocol to use
    if protocol is None:
        protocol = OTelConfig.get_protocol("metrics")

    # Validate protocol is available
    if protocol == "grpc" and not GRPC_AVAILABLE:
        logger.warning(
            "gRPC exporter not available, falling back to HTTP",
            service=service_name
        )
        protocol = "http"
    elif protocol == "http" and not HTTP_AVAILABLE:
        logger.warning(
            "HTTP exporter not available, falling back to gRPC",
            service=service_name
        )
        protocol = "grpc"

    if protocol not in ["grpc", "http"]:
        logger.error(
            "Invalid protocol specified",
            service=service_name,
            protocol=protocol
        )
        return None

    try:
        # Get endpoints from centralized config
        endpoints = OTelConfig.get_endpoints()

        # Determine which endpoint to use
        if otel_endpoint:
            # User provided override
            if protocol == "grpc":
                endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
            else:
                endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
        else:
            # Use config-determined endpoint
            if protocol == "grpc":
                endpoint = endpoints.metrics_grpc
            else:
                endpoint = endpoints.metrics_http

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure OTLP exporter based on protocol
        if protocol == "grpc":
            otlp_exporter = GrpcMetricExporter(
                endpoint=endpoint,
                insecure=True,  # Set insecure=False in production with proper TLS
                timeout=10
            )
        else:  # http
            otlp_exporter = HttpMetricExporter(
                endpoint=endpoint,
                timeout=10
            )
@@ -108,9 +161,10 @@ def setup_otel_metrics(
        metrics.set_meter_provider(meter_provider)

        logger.info(
            "OpenTelemetry metrics export configured successfully",
            service=service_name,
            endpoint=endpoint,
            protocol=protocol,
            export_interval_seconds=export_interval_millis / 1000
        )
@@ -121,7 +175,7 @@ def setup_otel_metrics(
"Failed to setup OpenTelemetry metrics export", "Failed to setup OpenTelemetry metrics export",
service=service_name, service=service_name,
error=str(e), error=str(e),
reason="Will continue with Prometheus-only metrics" protocol=protocol
) )
return None return None
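Once the provider is installed, metrics are recorded through the standard OpenTelemetry metrics API; the instrument names below are illustrative, not taken from this codebase.

from shared.monitoring.metrics_exporter import setup_otel_metrics

meter_provider = setup_otel_metrics("gateway", "1.0.0")  # gRPC by default
if meter_provider is not None:
    meter = meter_provider.get_meter("gateway.http")
    request_counter = meter.create_counter(
        "http_requests_total",
        unit="1",
        description="Total HTTP requests handled",
    )
    # Attributes become labels on the exported series in SigNoz
    request_counter.add(1, attributes={"route": "/health", "status": "200"})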

View File

@@ -0,0 +1,286 @@
"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""
import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog
logger = structlog.get_logger()
@dataclass
class OTelEndpoints:
"""
Container for OpenTelemetry endpoints.
SigNoz uses different protocols for different signals:
- Traces: gRPC (port 4317)
- Metrics: gRPC (port 4317) or HTTP (port 4318)
- Logs: HTTP (port 4318)
"""
traces_grpc: str # gRPC endpoint for traces (e.g., "host:4317")
metrics_grpc: str # gRPC endpoint for metrics (e.g., "host:4317")
metrics_http: str # HTTP endpoint for metrics (e.g., "http://host:4318/v1/metrics")
logs_http: str # HTTP endpoint for logs (e.g., "http://host:4318/v1/logs")
class OTelConfig:
"""
Centralized configuration for OpenTelemetry exporters.
This class manages endpoint URLs and ensures proper protocol usage:
- gRPC endpoints: host:port (no protocol prefix)
- HTTP endpoints: http://host:port/path (with protocol and path)
"""
# Default base endpoint (can be overridden by environment variables)
DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
DEFAULT_GRPC_PORT = 4317
DEFAULT_HTTP_PORT = 4318
@classmethod
def get_endpoints(cls) -> OTelEndpoints:
"""
Get OpenTelemetry endpoints from environment variables with proper fallbacks.
        Environment variables (signal-specific values override the base endpoint):
        1. OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - Traces-specific endpoint
        2. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - Metrics-specific endpoint
        3. OTEL_EXPORTER_OTLP_LOGS_ENDPOINT - Logs-specific endpoint
        4. OTEL_EXPORTER_OTLP_ENDPOINT - Base endpoint (gRPC format: host:port)
        Note: the legacy OTEL_COLLECTOR_ENDPOINT variable (HTTP format) is superseded by the above.
Returns:
OTelEndpoints with all configured endpoints
"""
# Get base endpoint from environment
base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
if base_endpoint:
# Clean and parse base endpoint
base_grpc = cls._clean_grpc_endpoint(base_endpoint)
base_http_host = cls._extract_host(base_endpoint)
else:
# Use default collector
base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"
# Get signal-specific endpoints (or use base endpoint)
traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")
# Build final endpoints
traces_grpc = cls._clean_grpc_endpoint(traces_endpoint)
metrics_grpc = cls._clean_grpc_endpoint(metrics_endpoint)
# For metrics HTTP, convert gRPC endpoint to HTTP if needed
metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")
# For logs, use HTTP endpoint
if logs_endpoint:
logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
else:
logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")
endpoints = OTelEndpoints(
traces_grpc=traces_grpc,
metrics_grpc=metrics_grpc,
metrics_http=metrics_http,
logs_http=logs_http
)
logger.info(
"OpenTelemetry endpoints configured",
traces_grpc=endpoints.traces_grpc,
metrics_grpc=endpoints.metrics_grpc,
metrics_http=endpoints.metrics_http,
logs_http=endpoints.logs_http
)
return endpoints
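    # Example resolution (hypothetical environment): with only
    # OTEL_EXPORTER_OTLP_ENDPOINT=otelcol:4317 set, get_endpoints() yields:
    #   traces_grpc  = "otelcol:4317"
    #   metrics_grpc = "otelcol:4317"
    #   metrics_http = "http://otelcol:4318/v1/metrics"
    #   logs_http    = "http://otelcol:4318/v1/logs"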
@staticmethod
def _clean_grpc_endpoint(endpoint: str) -> str:
"""
Clean endpoint for gRPC usage (remove protocol, paths).
Args:
endpoint: Raw endpoint string
Returns:
Cleaned endpoint in format "host:port"
"""
# Remove protocol prefixes
endpoint = endpoint.replace("http://", "").replace("https://", "")
# Remove paths (gRPC doesn't use paths)
if "/" in endpoint:
endpoint = endpoint.split("/")[0]
# Ensure it has a port
if ":" not in endpoint:
endpoint = f"{endpoint}:4317"
return endpoint
@staticmethod
def _extract_host(endpoint: str) -> str:
"""
Extract host and convert to HTTP endpoint.
Args:
endpoint: Raw endpoint string
Returns:
HTTP endpoint without path (e.g., "http://host:4318")
"""
# Remove protocol if present
clean = endpoint.replace("http://", "").replace("https://", "")
# Remove path if present
if "/" in clean:
clean = clean.split("/")[0]
# Extract host without port
if ":" in clean:
host = clean.split(":")[0]
else:
host = clean
return f"http://{host}:4318"
@staticmethod
def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
"""
Convert gRPC endpoint to HTTP endpoint with path.
Args:
grpc_endpoint: gRPC endpoint (e.g., "host:4317")
path: HTTP path (e.g., "/v1/metrics")
Returns:
HTTP endpoint (e.g., "http://host:4318/v1/metrics")
"""
# Extract host from gRPC endpoint
if ":" in grpc_endpoint:
host = grpc_endpoint.split(":")[0]
else:
host = grpc_endpoint
# Build HTTP endpoint with port 4318
return f"http://{host}:4318{path}"
@staticmethod
def _ensure_http_endpoint(endpoint: str, path: str) -> str:
"""
Ensure endpoint is in HTTP format with proper path.
Args:
endpoint: Raw endpoint string
path: Required path (e.g., "/v1/logs")
Returns:
HTTP endpoint with protocol and path
"""
# Add protocol if missing
if not endpoint.startswith(("http://", "https://")):
endpoint = f"http://{endpoint}"
# Ensure it has the correct port for HTTP
if ":4317" in endpoint:
endpoint = endpoint.replace(":4317", ":4318")
elif ":4318" not in endpoint and ":" in endpoint:
# Has a port but not the right one, replace it
parts = endpoint.split(":")
if len(parts) >= 2:
# Remove existing port and path
base = ":".join(parts[:-1])
endpoint = f"{base}:4318"
elif ":" not in endpoint.replace("http://", "").replace("https://", ""):
# No port at all, add it
endpoint = f"{endpoint}:4318"
# Ensure path is present
if not endpoint.endswith(path):
# Remove any existing path first
if "/" in endpoint.split("://")[1]:
base = endpoint.split("://")[0] + "://" + endpoint.split("://")[1].split("/")[0]
endpoint = base
endpoint = f"{endpoint}{path}"
return endpoint
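    # Worked examples for the three normalizers above (inputs are hypothetical):
    #   _clean_grpc_endpoint("http://otelcol:4317/v1/traces")  -> "otelcol:4317"
    #   _clean_grpc_endpoint("otelcol")                        -> "otelcol:4317"
    #   _grpc_to_http_endpoint("otelcol:4317", "/v1/metrics")  -> "http://otelcol:4318/v1/metrics"
    #   _ensure_http_endpoint("otelcol:4317", "/v1/logs")      -> "http://otelcol:4318/v1/logs"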
@classmethod
def get_resource_attributes(
cls,
service_name: str,
service_version: str = "1.0.0"
) -> dict:
"""
Get common resource attributes for all OTEL signals.
Args:
service_name: Name of the service
service_version: Version of the service
Returns:
Dictionary of resource attributes
"""
return {
"service.name": service_name,
"service.version": service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
"k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
}
@classmethod
def is_enabled(cls, signal: str) -> bool:
"""
Check if a specific telemetry signal is enabled.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
True if signal is enabled, False otherwise
"""
signal = signal.lower()
if signal == "traces":
return os.getenv("ENABLE_TRACING", "true").lower() == "true"
elif signal == "metrics":
return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
elif signal == "logs":
return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
else:
return False
@classmethod
def get_protocol(cls, signal: str) -> str:
"""
Get the preferred protocol for a signal.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
Protocol name ("grpc" or "http")
"""
protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")
# Signal-specific overrides
if signal == "traces":
return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
elif signal == "metrics":
return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
elif signal == "logs":
# Logs always use HTTP in our setup
return "http"
return protocol
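# Illustrative environment, matching is_enabled()/get_protocol() above:
#   ENABLE_TRACING=true                       -> is_enabled("traces") is True
#   ENABLE_OTEL_METRICS=true                  -> is_enabled("metrics") is True
#   OTEL_LOGS_EXPORTER=otlp                   -> is_enabled("logs") is True
#   OTEL_EXPORTER_OTLP_PROTOCOL=grpc          -> default protocol for traces/metrics
#   OTEL_EXPORTER_OTLP_METRICS_PROTOCOL=http  -> get_protocol("metrics") == "http"
# Logs always report "http" regardless of protocol overrides.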

View File

@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup
Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation
All signals are exported to SigNoz via OTLP.
"""
import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass
from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
logger = structlog.get_logger()
@dataclass
class TelemetryProviders:
"""
Container for all OpenTelemetry providers and collectors.
Attributes:
tracer_provider: Provider for distributed tracing
meter_provider: Provider for metrics export
logging_handler: Handler for structured logs
system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
app_metrics: Collector for application-level metrics (HTTP, DB)
"""
tracer_provider: Optional[Any] = None
meter_provider: Optional[Any] = None
logging_handler: Optional[Any] = None
system_metrics: Optional[SystemMetricsCollector] = None
app_metrics: Optional[ApplicationMetricsCollector] = None
def setup_telemetry(
app,
service_name: str,
service_version: str = "1.0.0",
enable_traces: bool = True,
enable_metrics: bool = True,
enable_logs: bool = True,
enable_system_metrics: bool = True,
metrics_protocol: Optional[str] = None, # "grpc" or "http", defaults to grpc
export_interval_millis: int = 60000
) -> TelemetryProviders:
"""
Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.
This is the UNIFIED setup function that configures everything:
- Distributed tracing (gRPC, port 4317)
- Metrics export (gRPC by default, port 4317)
- System metrics collection (CPU, memory, disk, network)
- Application metrics (HTTP requests, DB queries)
- Structured logs export (HTTP, port 4318)
All signals use the centralized OTelConfig for endpoint management.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
enable_traces: Enable distributed tracing (default: True)
enable_metrics: Enable metrics export to OTLP (default: True)
enable_logs: Enable logs export to OTLP (default: True)
enable_system_metrics: Enable system metrics collection (default: True, can be disabled via ENABLE_SYSTEM_METRICS env)
metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
export_interval_millis: How often to export metrics in milliseconds
Returns:
TelemetryProviders containing all initialized providers and collectors
Example:
from shared.monitoring.telemetry import setup_telemetry
app = FastAPI(title="Auth Service")
providers = setup_telemetry(
app,
service_name="auth-service",
service_version="1.0.0"
)
# All telemetry is now configured:
# - Traces automatically captured for HTTP requests
# - System metrics automatically collected
# - Application metrics via providers.app_metrics
# - Logs automatically correlated with traces
"""
logger.info(
"Setting up unified OpenTelemetry telemetry",
service=service_name,
version=service_version,
traces=enable_traces,
metrics=enable_metrics,
logs=enable_logs,
system_metrics=enable_system_metrics
)
providers = TelemetryProviders()
# Setup distributed tracing
if enable_traces and OTelConfig.is_enabled("traces"):
try:
providers.tracer_provider = setup_tracing(
app,
service_name=service_name,
service_version=service_version
)
if providers.tracer_provider:
logger.info("✓ Distributed tracing configured", service=service_name)
else:
logger.warning("✗ Distributed tracing setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))
# Setup OTLP metrics export
if enable_metrics and OTelConfig.is_enabled("metrics"):
try:
providers.meter_provider = setup_otel_metrics(
service_name=service_name,
service_version=service_version,
protocol=metrics_protocol,
export_interval_millis=export_interval_millis
)
if providers.meter_provider:
logger.info("✓ OTLP metrics export configured", service=service_name)
# Setup system and application metrics collectors
if enable_system_metrics:
enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_env:
try:
providers.system_metrics, providers.app_metrics = setup_all_metrics(
service_name=service_name,
service_version=service_version,
meter_provider=providers.meter_provider
)
logger.info(
"✓ System and application metrics collectors initialized",
service=service_name,
system_metrics=["cpu", "memory", "disk", "network"],
app_metrics=["http_requests", "db_queries"]
)
except Exception as e:
logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
else:
logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))
# Setup logs export
if enable_logs and OTelConfig.is_enabled("logs"):
try:
providers.logging_handler = setup_otel_logging(
service_name=service_name,
service_version=service_version
)
if providers.logging_handler:
logger.info("✓ Structured logs export configured", service=service_name)
else:
logger.warning("✗ Logs export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))
# Log endpoint configuration summary
try:
endpoints = OTelConfig.get_endpoints()
summary = {
"service": service_name,
"version": service_version,
"traces": {
"enabled": bool(providers.tracer_provider),
"endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
},
"metrics": {
"enabled": bool(providers.meter_provider),
"endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
"system_metrics": bool(providers.system_metrics),
"app_metrics": bool(providers.app_metrics)
},
"logs": {
"enabled": bool(providers.logging_handler),
"endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
}
}
logger.info("🎉 Telemetry setup complete", **summary)
except Exception as e:
logger.warning("Could not log endpoint summary", error=str(e))
return providers
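# Example: opt out of individual signals while keeping the rest (values illustrative):
#   providers = setup_telemetry(
#       app,
#       service_name="pos-service",
#       enable_logs=False,            # skip OTLP log export entirely
#       metrics_protocol="http",      # export metrics over HTTP/4318
#       export_interval_millis=30000
#   )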
def setup_telemetry_simple(
app,
service_name: str,
service_version: str = "1.0.0"
) -> TelemetryProviders:
"""
Simplified telemetry setup with all defaults.
Uses:
- gRPC for traces (port 4317)
- gRPC for metrics (port 4317)
- HTTP for logs (port 4318)
All settings are read from environment variables and OTelConfig.
Args:
app: FastAPI application instance
service_name: Name of the service
service_version: Version of the service
Returns:
TelemetryProviders containing all initialized providers
Example:
from shared.monitoring.telemetry import setup_telemetry_simple
app = FastAPI(title="Auth Service")
providers = setup_telemetry_simple(app, "auth-service")
"""
return setup_telemetry(
app=app,
service_name=service_name,
service_version=service_version
)
def get_telemetry_status() -> Dict[str, Any]:
"""
Get current telemetry configuration status.
Returns:
Dictionary with telemetry status information
Example:
from shared.monitoring.telemetry import get_telemetry_status
status = get_telemetry_status()
print(f"Tracing enabled: {status['traces']['enabled']}")
"""
endpoints = OTelConfig.get_endpoints()
return {
"traces": {
"enabled": OTelConfig.is_enabled("traces"),
"protocol": "grpc",
"endpoint": endpoints.traces_grpc
},
"metrics": {
"enabled": OTelConfig.is_enabled("metrics"),
"protocol": OTelConfig.get_protocol("metrics"),
"grpc_endpoint": endpoints.metrics_grpc,
"http_endpoint": endpoints.metrics_http
},
"logs": {
"enabled": OTelConfig.is_enabled("logs"),
"protocol": "http",
"endpoint": endpoints.logs_http
}
}
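# Minimal wiring sketch (the diagnostics route is an assumption, not part of this module):
#
#   from fastapi import FastAPI
#   from shared.monitoring.telemetry import setup_telemetry_simple, get_telemetry_status
#
#   app = FastAPI(title="Auth Service")
#   providers = setup_telemetry_simple(app, "auth-service")
#
#   @app.get("/debug/telemetry")  # hypothetical endpoint for operators
#   def telemetry_status():
#       return get_telemetry_status()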

View File

@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""
+import os
import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
+from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+# Optional instrumentations (may not be installed in all services)
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+try:
+    from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+    HTTPX_AVAILABLE = True
+except ImportError:
+    HTTPX_AVAILABLE = False
+try:
+    from opentelemetry.instrumentation.redis import RedisInstrumentor
+    REDIS_AVAILABLE = True
+except ImportError:
+    REDIS_AVAILABLE = False
+try:
+    from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+    SQLALCHEMY_AVAILABLE = True
+except ImportError:
+    SQLALCHEMY_AVAILABLE = False
+from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -22,8 +43,8 @@ def setup_tracing(
    app,
    service_name: str,
    service_version: str = "1.0.0",
-    otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
-):
+    otel_endpoint: Optional[str] = None
+) -> Optional[TracerProvider]:
    """
    Setup OpenTelemetry distributed tracing for a FastAPI service.
@@ -33,35 +54,56 @@ def setup_tracing(
    - Redis operations
    - PostgreSQL/SQLAlchemy queries
+    Uses gRPC protocol (port 4317) for sending traces to SigNoz.
    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
-        otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
+        otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
+    Returns:
+        TracerProvider instance if successful, None otherwise
    Example:
        from shared.monitoring.tracing import setup_tracing
        app = FastAPI(title="Auth Service")
-        setup_tracing(app, "auth-service")
+        tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
    """
+    # Check if tracing is enabled
+    if not OTelConfig.is_enabled("traces"):
+        logger.info(
+            "Distributed tracing disabled",
+            service=service_name,
+            reason="ENABLE_TRACING not set to 'true'"
+        )
+        return None
    try:
-        # Create resource with service information
-        resource = Resource(attributes={
-            SERVICE_NAME: service_name,
-            SERVICE_VERSION: service_version,
-            "deployment.environment": "production"
-        })
+        # Get endpoints from centralized config
+        endpoints = OTelConfig.get_endpoints()
+        # Use provided endpoint or get from config
+        if otel_endpoint:
+            # Clean user-provided endpoint for gRPC
+            grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
+        else:
+            grpc_endpoint = endpoints.traces_grpc
+        # Get resource attributes
+        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
+        resource = Resource(attributes=resource_attrs)
        # Configure tracer provider
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)
-        # Configure OTLP exporter to send to SigNoz
+        # Configure OTLP gRPC exporter for traces
        otlp_exporter = OTLPSpanExporter(
-            endpoint=otel_endpoint,
-            insecure=True  # Use TLS in production
+            endpoint=grpc_endpoint,
+            insecure=True  # set insecure=False in production with proper TLS
        )
        # Add span processor with batching for performance
@@ -75,40 +117,46 @@ def setup_tracing(
            excluded_urls="health,metrics"  # Don't trace health/metrics endpoints
        )
-        # Auto-instrument HTTPX (inter-service communication)
-        HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
+        # Auto-instrument HTTPX (inter-service communication) if available
+        if HTTPX_AVAILABLE:
+            try:
+                HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("HTTPX instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument HTTPX: {e}")
-        # Auto-instrument Redis
-        try:
-            RedisInstrumentor().instrument(tracer_provider=tracer_provider)
-        except Exception as e:
-            logger.warning(f"Failed to instrument Redis: {e}")
+        # Auto-instrument Redis if available
+        if REDIS_AVAILABLE:
+            try:
+                RedisInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("Redis instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument Redis: {e}")
-        # Auto-instrument PostgreSQL (psycopg2) - skip if not available
-        # Most services use asyncpg instead of psycopg2
-        # try:
-        #     Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
-        # except Exception as e:
-        #     logger.warning(f"Failed to instrument Psycopg2: {e}")
-        # Auto-instrument SQLAlchemy
-        try:
-            SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
-        except Exception as e:
-            logger.warning(f"Failed to instrument SQLAlchemy: {e}")
+        # Auto-instrument SQLAlchemy if available
+        if SQLALCHEMY_AVAILABLE:
+            try:
+                SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("SQLAlchemy instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument SQLAlchemy: {e}")
        logger.info(
-            "Distributed tracing configured",
+            "Distributed tracing configured successfully",
            service=service_name,
-            otel_endpoint=otel_endpoint
+            grpc_endpoint=grpc_endpoint,
+            protocol="grpc"
        )
+        return tracer_provider
    except Exception as e:
        logger.error(
            "Failed to setup tracing - continuing without it",
            service=service_name,
            error=str(e)
        )
+        return None
def get_current_trace_id() -> Optional[str]:
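For services that need an explicit collector override, a hedged usage sketch of the new signature (the endpoint value and service name are illustrative):

    from fastapi import FastAPI
    from shared.monitoring.tracing import setup_tracing

    app = FastAPI(title="Orders Service")
    # Accepts host:port or a full URL; OTelConfig._clean_grpc_endpoint() normalizes it
    tracer_provider = setup_tracing(app, "orders-service", "1.0.0", otel_endpoint="otelcol:4317")
    if tracer_provider is None:
        pass  # tracing disabled via ENABLE_TRACING, or setup failed; the service keeps running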

View File

@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
-from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
-from shared.monitoring.metrics import setup_metrics_early
+from shared.monitoring import (
+    setup_logging,
+    setup_telemetry
+)
from shared.monitoring.health_checks import setup_fastapi_health_checks
-from shared.monitoring.tracing import setup_tracing
from shared.database.base import DatabaseManager
if TYPE_CHECKING:
@@ -77,24 +78,13 @@ class BaseFastAPIService:
        # Initialize logging
        setup_logging(service_name, log_level)
-        # Setup OpenTelemetry logging export if enabled
-        if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
-            try:
-                setup_otel_logging(service_name, version)
-                self.logger = structlog.get_logger()
-                self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
-            except Exception as e:
-                self.logger = structlog.get_logger()
-                self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
-        else:
-            self.logger = structlog.get_logger()
+        self.logger = structlog.get_logger()
        # Will be set during app creation
        self.app: Optional[FastAPI] = None
-        self.metrics_collector = None
        self.health_manager = None
        self.alert_service = None
+        self.telemetry_providers = None  # Contains all OTEL providers and metrics collectors
    def create_app(self, **fastapi_kwargs) -> FastAPI:
        """
@@ -116,49 +106,25 @@ class BaseFastAPIService:
        # Create FastAPI app
        self.app = FastAPI(**config)
-        # Setup metrics BEFORE middleware and lifespan
-        if self.enable_metrics:
-            self.metrics_collector = setup_metrics_early(self.app, self.service_name)
-        # Setup OpenTelemetry metrics export if enabled
-        enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
-        if enable_otel_metrics:
-            try:
-                self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
-                if self.otel_meter_provider:
-                    self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
-                    # Setup system metrics collection (CPU, memory, disk, network)
-                    enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
-                    if enable_system_metrics:
-                        try:
-                            self.system_metrics, self.app_metrics = setup_all_metrics(
-                                self.service_name,
-                                self.version,
-                                self.otel_meter_provider
-                            )
-                            self.logger.info(f"System metrics collection enabled for {self.service_name}")
-                        except Exception as e:
-                            self.logger.warning(f"Failed to setup system metrics: {e}")
-            except Exception as e:
-                self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
-        # Setup distributed tracing
-        # Check both constructor flag and environment variable
-        tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
-        if tracing_enabled:
-            try:
-                otel_endpoint = os.getenv(
-                    "OTEL_COLLECTOR_ENDPOINT",
-                    "http://signoz-otel-collector.bakery-ia:4318"
-                )
-                setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
-                self.logger.info(f"Distributed tracing enabled for {self.service_name}")
-            except Exception as e:
-                self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
-        else:
-            self.logger.info(f"Distributed tracing disabled for {self.service_name}")
+        # Setup unified OpenTelemetry telemetry
+        # This single call configures:
+        # - Distributed tracing (gRPC, port 4317)
+        # - OTLP metrics export (gRPC, port 4317)
+        # - System metrics collection (CPU, memory, disk, network)
+        # - Application metrics (HTTP requests, DB queries)
+        # - Structured logs export (HTTP, port 4318)
+        try:
+            self.telemetry_providers = setup_telemetry(
+                app=self.app,
+                service_name=self.service_name,
+                service_version=self.version,
+                enable_traces=self.enable_tracing,
+                enable_metrics=self.enable_metrics,
+                enable_logs=True,  # Controlled by OTEL_LOGS_EXPORTER env var
+                enable_system_metrics=True  # Controlled by ENABLE_SYSTEM_METRICS env var
+            )
+        except Exception as e:
+            self.logger.warning("Failed to setup telemetry", error=str(e))
        # Setup lifespan
        self.app.router.lifespan_context = self._create_lifespan()
@@ -361,10 +327,6 @@ class BaseFastAPIService:
                method=request.method
            )
-            # Record error metric if available
-            if self.metrics_collector:
-                self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
            return JSONResponse(
                status_code=500,
                content={
@@ -409,7 +371,10 @@ class BaseFastAPIService:
    def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
        """
-        Register custom metrics for the service
+        Register custom OTEL metrics for the service.
+        Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
+        are automatically created by setup_telemetry(). Use this for additional custom metrics.
        Args:
            metrics_config: Dict with metric name as key and config as value
@@ -417,25 +382,36 @@ class BaseFastAPIService:
                "user_registrations": {
                    "type": "counter",
                    "description": "Total user registrations",
-                    "labels": ["status"]
+                    "unit": "registrations"
                }
            }
        """
-        if not self.metrics_collector:
-            self.logger.warning("Metrics collector not available")
+        if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
+            self.logger.warning("OTEL meter provider not available - metrics not registered")
            return
+        from opentelemetry.metrics import get_meter
+        meter = get_meter(self.service_name)
        for metric_name, config in metrics_config.items():
            metric_type = config.get("type", "counter")
            description = config.get("description", f"{metric_name} metric")
-            labels = config.get("labels", [])
+            unit = config.get("unit", "1")
-            if metric_type == "counter":
-                self.metrics_collector.register_counter(metric_name, description, labels=labels)
-            elif metric_type == "histogram":
-                self.metrics_collector.register_histogram(metric_name, description, labels=labels)
-            else:
-                self.logger.warning(f"Unsupported metric type: {metric_type}")
+            try:
+                if metric_type == "counter":
+                    meter.create_counter(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom counter: {metric_name}")
+                elif metric_type == "histogram":
+                    meter.create_histogram(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom histogram: {metric_name}")
+                elif metric_type == "gauge":
+                    meter.create_up_down_counter(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom gauge: {metric_name}")
+                else:
+                    self.logger.warning(f"Unsupported metric type: {metric_type}")
+            except Exception as e:
+                self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
    def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
        """