Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -8,13 +8,12 @@ import json
import structlog
import resource
import os
from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse, Response
import httpx
import time
from fastapi import Request, HTTPException, WebSocket, WebSocketDisconnect
from fastapi.responses import StreamingResponse
import httpx
from shared.redis_utils import initialize_redis, close_redis, get_redis_client
from typing import Dict, Any
from shared.service_base import StandardFastAPIService
from app.core.config import settings
from app.middleware.request_id import RequestIDMiddleware
@@ -26,128 +25,84 @@ from app.middleware.subscription import SubscriptionMiddleware
from app.middleware.demo_middleware import DemoMiddleware
from app.middleware.read_only_mode import ReadOnlyModeMiddleware
from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "gateway"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
# Create resource with service name
resource = Resource.create({"service.name": service_name})
# Configure OTLP exporter (sends to OpenTelemetry Collector)
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True # Use insecure connection for internal cluster communication
)
# Configure tracer provider
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
# Set global tracer provider
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("gateway")
# Setup logging
setup_logging("gateway", settings.LOG_LEVEL)
# Initialize logger
logger = structlog.get_logger()
# Check file descriptor limits and warn if too low
# Check file descriptor limits
try:
soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
if soft_limit < 1024:
logger.warning(f"Low file descriptor limit detected: {soft_limit}. Gateway may experience 'too many open files' errors.")
logger.warning(f"Recommended: Increase limit with 'ulimit -n 4096' or higher for production.")
if soft_limit < 256:
logger.error(f"Critical: File descriptor limit ({soft_limit}) is too low for gateway operation!")
logger.warning(f"Low file descriptor limit detected: {soft_limit}")
else:
logger.info(f"File descriptor limit: {soft_limit} (sufficient)")
except Exception as e:
logger.debug(f"Could not check file descriptor limits: {e}")
# Check and log current working directory and permissions
try:
cwd = os.getcwd()
logger.info(f"Current working directory: {cwd}")
# Check if we can write to common log locations
test_locations = ["/var/log", "./logs", "."]
for location in test_locations:
try:
test_file = os.path.join(location, ".gateway_permission_test")
with open(test_file, 'w') as f:
f.write("test")
os.remove(test_file)
logger.info(f"Write permission confirmed for: {location}")
except Exception as e:
logger.warning(f"Cannot write to {location}: {e}")
except Exception as e:
logger.debug(f"Could not check directory permissions: {e}")
# Create FastAPI app
app = FastAPI(
title="Bakery Forecasting API Gateway",
description="Central API Gateway for bakery forecasting microservices",
version="1.0.0",
docs_url="/docs",
redoc_url="/redoc",
redirect_slashes=False # Disable automatic trailing slash redirects
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis (will be active once redis client is initialized)
RedisInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("gateway")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# Redis client for SSE streaming
redis_client = None
# CORS middleware - Add first
app.add_middleware(
CORSMiddleware,
allow_origins=settings.CORS_ORIGINS_LIST,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
class GatewayService(StandardFastAPIService):
"""Gateway Service with standardized monitoring setup"""
async def on_startup(self, app):
"""Custom startup logic for Gateway"""
global redis_client
# Initialize Redis
try:
await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
redis_client = await get_redis_client()
logger.info("Connected to Redis for SSE streaming")
# Add API rate limiting middleware with Redis client
app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
logger.info("API rate limiting middleware enabled")
except Exception as e:
logger.error(f"Failed to connect to Redis: {e}")
# Register custom metrics for gateway-specific operations
if self.telemetry_providers and self.telemetry_providers.app_metrics:
logger.info("Gateway-specific metrics tracking enabled")
await super().on_startup(app)
async def on_shutdown(self, app):
"""Custom shutdown logic for Gateway"""
await super().on_shutdown(app)
# Close Redis
await close_redis()
logger.info("Redis connection closed")
# Create service instance
service = GatewayService(
service_name="gateway",
app_name="Bakery Forecasting API Gateway",
description="Central API Gateway for bakery forecasting microservices",
version="1.0.0",
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=settings.CORS_ORIGINS_LIST,
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Custom middleware - Add in REVERSE order (last added = first executed)
# Create FastAPI app
app = service.create_app()
# Add gateway-specific middleware (in REVERSE order of execution)
# Execution order: RequestIDMiddleware -> DemoMiddleware -> AuthMiddleware -> ReadOnlyModeMiddleware -> SubscriptionMiddleware -> APIRateLimitMiddleware -> RateLimitMiddleware -> LoggingMiddleware
app.add_middleware(LoggingMiddleware) # Executes 8th (outermost)
app.add_middleware(RateLimitMiddleware, calls_per_minute=300) # Executes 7th - Simple rate limit
# Note: APIRateLimitMiddleware will be added on startup with Redis client
app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL) # Executes 5th
app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL) # Executes 4th - Enforce read-only mode
app.add_middleware(AuthMiddleware) # Executes 3rd - Checks for demo context
app.add_middleware(DemoMiddleware) # Executes 2nd - Sets demo user context
app.add_middleware(RequestIDMiddleware) # Executes 1st (innermost) - Generates request ID for tracing
app.add_middleware(LoggingMiddleware)
app.add_middleware(RateLimitMiddleware, calls_per_minute=300)
app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
app.add_middleware(AuthMiddleware)
app.add_middleware(DemoMiddleware)
app.add_middleware(RequestIDMiddleware)
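# Illustrative note (not part of the gateway): Starlette applies middleware in
# reverse registration order, so the class added last becomes the outermost layer
# and sees the request first. A minimal, self-contained sketch of that behaviour:
def _middleware_order_demo():
    """Throwaway app showing that the last-added middleware runs first."""
    from fastapi import FastAPI
    from starlette.middleware.base import BaseHTTPMiddleware

    class Inner(BaseHTTPMiddleware):  # added first -> closest to the route handler
        async def dispatch(self, request, call_next):
            print("inner: before")
            response = await call_next(request)
            print("inner: after")
            return response

    class Outer(BaseHTTPMiddleware):  # added last -> outermost layer
        async def dispatch(self, request, call_next):
            print("outer: before")
            response = await call_next(request)
            print("outer: after")
            return response

    demo = FastAPI()
    demo.add_middleware(Inner)
    demo.add_middleware(Outer)
    # A request prints: outer: before, inner: before, inner: after, outer: after
    return demo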
# Include routers
app.include_router(auth.router, prefix="/api/v1/auth", tags=["authentication"])
@@ -156,114 +111,18 @@ app.include_router(subscription.router, prefix="/api/v1", tags=["subscriptions"]
app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"])
app.include_router(nominatim.router, prefix="/api/v1/nominatim", tags=["location"])
app.include_router(geocoding.router, prefix="/api/v1/geocoding", tags=["geocoding"])
# app.include_router(poi_context.router, prefix="/api/v1/poi-context", tags=["poi-context"]) # Removed to implement tenant-based architecture
app.include_router(pos.router, prefix="/api/v1/pos", tags=["pos"])
app.include_router(demo.router, prefix="/api/v1", tags=["demo"])
@app.on_event("startup")
async def startup_event():
"""Application startup"""
global redis_client
logger.info("Starting API Gateway")
# Initialize shared Redis connection
try:
await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
redis_client = await get_redis_client()
logger.info("Connected to Redis for SSE streaming")
# Add API rate limiting middleware with Redis client
app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
logger.info("API rate limiting middleware enabled with subscription-based quotas")
except Exception as e:
logger.error(f"Failed to connect to Redis: {e}")
logger.warning("API rate limiting middleware will fail open (allow all requests)")
metrics_collector.register_counter(
"gateway_auth_requests_total",
"Total authentication requests"
)
metrics_collector.register_counter(
"gateway_auth_responses_total",
"Total authentication responses"
)
metrics_collector.register_counter(
"gateway_auth_errors_total",
"Total authentication errors"
)
metrics_collector.register_histogram(
"gateway_request_duration_seconds",
"Request duration in seconds"
)
logger.info("Metrics registered successfully")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("gateway")
logger.info("System metrics collection started")
logger.info("Metrics export configured via OpenTelemetry OTLP")
logger.info("API Gateway started successfully")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown"""
logger.info("Shutting down API Gateway")
# Close shared Redis connection
await close_redis()
# Clean up service discovery
# await service_discovery.cleanup()
logger.info("API Gateway shutdown complete")
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "api-gateway",
"version": "1.0.0",
"timestamp": time.time()
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
# ================================================================
# SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS
# ================================================================
def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
"""
Determine which Redis channels to subscribe to based on filters.
Args:
tenant_id: Tenant identifier
channel_filters: List of channel patterns (e.g., ["inventory.alerts", "*.notifications"])
Returns:
List of full channel names to subscribe to
Examples:
>>> _get_subscription_channels("abc", ["inventory.alerts"])
["tenant:abc:inventory.alerts"]
>>> _get_subscription_channels("abc", ["*.alerts"])
["tenant:abc:inventory.alerts", "tenant:abc:production.alerts", ...]
>>> _get_subscription_channels("abc", [])
["tenant:abc:inventory.alerts", "tenant:abc:inventory.notifications", ...]
"""
"""Determine which Redis channels to subscribe to based on filters"""
all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
all_classes = ["alerts", "notifications"]
channels = []
if not channel_filters:
@@ -271,70 +130,49 @@ def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
for domain in all_domains:
for event_class in all_classes:
channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
# Also subscribe to recommendations (tenant-wide)
channels.append(f"tenant:{tenant_id}:recommendations")
# Also subscribe to legacy channel for backward compatibility
channels.append(f"alerts:{tenant_id}")
channels.append(f"alerts:{tenant_id}") # Legacy
return channels
# Parse filters and expand wildcards
for filter_pattern in channel_filters:
if filter_pattern == "*.*":
# All channels
for domain in all_domains:
for event_class in all_classes:
channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
channels.append(f"tenant:{tenant_id}:recommendations")
elif filter_pattern.endswith(".*"):
# Domain wildcard (e.g., "inventory.*")
domain = filter_pattern.split(".")[0]
for event_class in all_classes:
channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
elif filter_pattern.startswith("*."):
# Class wildcard (e.g., "*.alerts")
event_class = filter_pattern.split(".")[1]
if event_class == "recommendations":
channels.append(f"tenant:{tenant_id}:recommendations")
else:
for domain in all_domains:
channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
elif filter_pattern == "recommendations":
# Recommendations channel
channels.append(f"tenant:{tenant_id}:recommendations")
else:
# Specific channel (e.g., "inventory.alerts")
channels.append(f"tenant:{tenant_id}:{filter_pattern}")
return list(set(channels)) # Remove duplicates
return list(set(channels))
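# Illustrative expansions (restating the examples from the removed docstring;
# the set() de-duplication means the returned order is not guaranteed):
#   _get_subscription_channels("abc", ["inventory.alerts"])
#       -> ["tenant:abc:inventory.alerts"]
#   _get_subscription_channels("abc", ["*.alerts"])
#       -> one "tenant:abc:<domain>.alerts" entry per domain
#   _get_subscription_channels("abc", [])
#       -> every "tenant:abc:<domain>.<class>" channel plus
#          "tenant:abc:recommendations" and the legacy "alerts:abc"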
async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
"""
Load initial state from Redis cache based on channel filters.
Args:
redis_client: Redis client
tenant_id: Tenant identifier
channel_filters: List of channel patterns
Returns:
List of initial events
"""
"""Load initial state from Redis cache based on channel filters"""
initial_events = []
try:
if not channel_filters:
# Load from legacy cache if no filters (backward compat)
# Legacy cache
legacy_cache_key = f"active_alerts:{tenant_id}"
cached_data = await redis_client.get(legacy_cache_key)
if cached_data:
return json.loads(cached_data)
# Also try loading from new domain-specific caches
# New domain-specific caches
all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
all_classes = ["alerts", "notifications"]
@@ -343,10 +181,9 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
cached_data = await redis_client.get(cache_key)
if cached_data:
events = json.loads(cached_data)
initial_events.extend(events)
initial_events.extend(json.loads(cached_data))
# Load recommendations
# Recommendations
recommendations_cache_key = f"active_events:{tenant_id}:recommendations"
cached_data = await redis_client.get(recommendations_cache_key)
if cached_data:
@@ -356,36 +193,29 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
# Load based on specific filters
for filter_pattern in channel_filters:
# Extract domain and class from filter
if "." in filter_pattern:
parts = filter_pattern.split(".")
domain = parts[0] if parts[0] != "*" else None
event_class = parts[1] if len(parts) > 1 and parts[1] != "*" else None
if domain and event_class:
# Specific cache (e.g., "inventory.alerts")
cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
cached_data = await redis_client.get(cache_key)
if cached_data:
initial_events.extend(json.loads(cached_data))
elif domain and not event_class:
# Domain wildcard (e.g., "inventory.*")
for ec in ["alerts", "notifications"]:
cache_key = f"active_events:{tenant_id}:{domain}.{ec}"
cached_data = await redis_client.get(cache_key)
if cached_data:
initial_events.extend(json.loads(cached_data))
elif not domain and event_class:
# Class wildcard (e.g., "*.alerts")
all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
for d in all_domains:
cache_key = f"active_events:{tenant_id}:{d}.{event_class}s"
cached_data = await redis_client.get(cache_key)
if cached_data:
initial_events.extend(json.loads(cached_data))
elif filter_pattern == "recommendations":
cache_key = f"active_events:{tenant_id}:recommendations"
cached_data = await redis_client.get(cache_key)
@@ -400,27 +230,14 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
def _determine_event_type(event_data: dict) -> str:
"""
Determine SSE event type from event data.
Args:
event_data: Event data dictionary
Returns:
SSE event type: 'alert', 'notification', or 'recommendation'
"""
# New event architecture uses 'event_class'
"""Determine SSE event type from event data"""
if 'event_class' in event_data:
return event_data['event_class'] # 'alert', 'notification', or 'recommendation'
# Legacy format uses 'item_type'
return event_data['event_class']
if 'item_type' in event_data:
if event_data['item_type'] == 'recommendation':
return 'recommendation'
else:
return 'alert'
# Default to 'alert' for backward compatibility
return 'alert'
@@ -432,42 +249,25 @@ def _determine_event_type(event_data: dict) -> str:
async def events_stream(
request: Request,
tenant_id: str,
channels: str = None # Comma-separated channel filters (e.g., "inventory.alerts,production.notifications")
channels: str = None
):
"""
Server-Sent Events stream for real-time notifications with multi-channel support.
Authentication is handled by auth middleware via query param token.
User context is available in request.state.user (injected by middleware).
Query Parameters:
tenant_id: Tenant identifier (required)
channels: Comma-separated channel filters (optional)
Examples:
- "inventory.alerts,production.notifications" - Specific channels
- "*.alerts" - All alert channels
- "inventory.*" - All inventory events
- None - All channels (default, backward compatible)
New channel pattern: tenant:{tenant_id}:{domain}.{class}
Examples:
- tenant:abc:inventory.alerts
- tenant:abc:production.notifications
- tenant:abc:recommendations
Legacy channel (backward compat): alerts:{tenant_id}
"""
global redis_client
if not redis_client:
raise HTTPException(status_code=503, detail="SSE service unavailable")
# Extract user context from request state (set by auth middleware)
# Extract user context from request state
user_context = request.state.user
user_id = user_context.get('user_id')
email = user_context.get('email')
# Validate tenant_id parameter
if not tenant_id:
raise HTTPException(status_code=400, detail="tenant_id query parameter is required")
@@ -479,79 +279,53 @@ async def events_stream(
logger.info(f"SSE connection request for user {email}, tenant {tenant_id}, channels: {channel_filters or 'all'}")
async def event_generator():
"""Generate server-sent events from Redis pub/sub with multi-channel support"""
"""Generate server-sent events from Redis pub/sub"""
pubsub = None
try:
# Create pubsub connection with resource monitoring
pubsub = redis_client.pubsub()
logger.debug(f"Created Redis pubsub connection for tenant: {tenant_id}")
# Monitor connection count
try:
connection_info = await redis_client.info('clients')
connected_clients = connection_info.get('connected_clients', 'unknown')
logger.debug(f"Redis connected clients: {connected_clients}")
except Exception:
# Don't fail if we can't get connection info
pass
# Determine which channels to subscribe to
# Determine channels
subscription_channels = _get_subscription_channels(tenant_id, channel_filters)
# Subscribe to all determined channels
# Subscribe
if subscription_channels:
await pubsub.subscribe(*subscription_channels)
logger.info(f"Subscribed to {len(subscription_channels)} channels for tenant {tenant_id}")
else:
# Fallback to legacy channel if no channels specified
legacy_channel = f"alerts:{tenant_id}"
await pubsub.subscribe(legacy_channel)
logger.info(f"Subscribed to legacy channel: {legacy_channel}")
# Send initial connection event
# Connection event
yield f"event: connection\n"
yield f"data: {json.dumps({'type': 'connected', 'message': 'SSE connection established', 'channels': subscription_channels or ['all'], 'timestamp': time.time()})}\n\n"
# Fetch and send initial state from cache (domain-specific or legacy)
# Initial state
initial_events = await _load_initial_state(redis_client, tenant_id, channel_filters)
if initial_events:
logger.info(f"Sending {len(initial_events)} initial events to tenant {tenant_id}")
yield f"event: initial_state\n"
yield f"data: {json.dumps(initial_events)}\n\n"
else:
# Send empty initial state for compatibility
yield f"event: initial_state\n"
yield f"data: {json.dumps([])}\n\n"
yield f"event: initial_state\n"
yield f"data: {json.dumps(initial_events)}\n\n"
heartbeat_counter = 0
while True:
# Check if client has disconnected
if await request.is_disconnected():
logger.info(f"SSE client disconnected for tenant: {tenant_id}")
break
try:
# Get message from Redis with timeout
message = await asyncio.wait_for(pubsub.get_message(ignore_subscribe_messages=True), timeout=10.0)
if message and message['type'] == 'message':
# Forward the event from Redis
event_data = json.loads(message['data'])
# Determine event type for SSE
event_type = _determine_event_type(event_data)
# Add channel metadata for frontend routing
event_data['_channel'] = message['channel'].decode('utf-8') if isinstance(message['channel'], bytes) else message['channel']
yield f"event: {event_type}\n"
yield f"data: {json.dumps(event_data)}\n\n"
logger.debug(f"SSE event sent to tenant {tenant_id}: {event_type} - {event_data.get('title')}")
except asyncio.TimeoutError:
# Send heartbeat every 10 timeouts (100 seconds)
heartbeat_counter += 1
if heartbeat_counter >= 10:
yield f"event: heartbeat\n"
@@ -563,24 +337,13 @@ async def events_stream(
except Exception as e:
logger.error(f"SSE error for tenant {tenant_id}: {e}", exc_info=True)
finally:
try:
if pubsub:
try:
# Unsubscribe from all channels
await pubsub.unsubscribe()
logger.debug(f"Unsubscribed from Redis channels for tenant: {tenant_id}")
except Exception as unsubscribe_error:
logger.error(f"Failed to unsubscribe Redis pubsub for tenant {tenant_id}: {unsubscribe_error}")
try:
# Close pubsub connection
await pubsub.close()
logger.debug(f"Closed Redis pubsub connection for tenant: {tenant_id}")
except Exception as close_error:
logger.error(f"Failed to close Redis pubsub for tenant {tenant_id}: {close_error}")
logger.info(f"SSE connection closed for tenant: {tenant_id}")
except Exception as finally_error:
logger.error(f"Error in SSE cleanup for tenant {tenant_id}: {finally_error}")
if pubsub:
try:
await pubsub.unsubscribe()
await pubsub.close()
except Exception as e:
logger.error(f"Error closing pubsub: {e}")
logger.info(f"SSE connection closed for tenant: {tenant_id}")
return StreamingResponse(
event_generator(),
@@ -593,55 +356,35 @@ async def events_stream(
}
)
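# Consumption sketch (illustrative only): how a client could read this SSE stream
# with httpx. The route path and query parameter names below are assumptions,
# since the route decorator sits outside this hunk; adjust them to the real values.
async def _sse_client_example(base_url: str, tenant_id: str, token: str) -> None:
    """Minimal httpx client that prints event/data frames from the SSE endpoint."""
    import httpx

    params = {
        "tenant_id": tenant_id,
        "channels": "inventory.alerts,*.notifications",  # optional channel filters
        "token": token,  # auth middleware reads the token from the query string
    }
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("GET", f"{base_url}/api/v1/events", params=params) as response:
            event_type = None
            async for line in response.aiter_lines():
                if line.startswith("event: "):
                    event_type = line[len("event: "):]
                elif line.startswith("data: "):
                    print(f"{event_type}: {line[len('data: '):]}")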
# ================================================================
# WEBSOCKET ROUTING FOR TRAINING SERVICE
# ================================================================
@app.websocket("/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live")
async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
"""
Simple WebSocket proxy with token verification only.
Validates the token and forwards the connection to the training service.
"""
# Get token from query params
"""WebSocket proxy with token verification for training service"""
token = websocket.query_params.get("token")
if not token:
logger.warning("WebSocket proxy rejected - missing token",
job_id=job_id,
tenant_id=tenant_id)
await websocket.accept()
await websocket.close(code=1008, reason="Authentication token required")
return
# Verify token
from shared.auth.jwt_handler import JWTHandler
jwt_handler = JWTHandler(settings.JWT_SECRET_KEY, settings.JWT_ALGORITHM)
try:
payload = jwt_handler.verify_token(token)
if not payload or not payload.get('user_id'):
logger.warning("WebSocket proxy rejected - invalid token",
job_id=job_id,
tenant_id=tenant_id)
await websocket.accept()
await websocket.close(code=1008, reason="Invalid token")
return
logger.info("WebSocket proxy - token verified",
user_id=payload.get('user_id'),
tenant_id=tenant_id,
job_id=job_id)
except Exception as e:
logger.warning("WebSocket proxy - token verification failed",
job_id=job_id,
error=str(e))
await websocket.accept()
await websocket.close(code=1008, reason="Token verification failed")
return
# Accept the connection
await websocket.accept()
# Build WebSocket URL to training service
@@ -649,33 +392,24 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
training_ws_url = training_service_base.replace('http://', 'ws://').replace('https://', 'wss://')
training_ws_url = f"{training_ws_url}/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live?token={token}"
logger.info("Gateway proxying WebSocket to training service",
job_id=job_id,
training_ws_url=training_ws_url.replace(token, '***'))
training_ws = None
try:
# Connect to training service WebSocket
import websockets
from websockets.protocol import State
training_ws = await websockets.connect(
training_ws_url,
ping_interval=120, # Send ping every 2 minutes (tolerates long training operations)
ping_timeout=60, # Wait up to 1 minute for pong (graceful timeout)
close_timeout=60, # Increase close timeout for graceful shutdown
ping_interval=120,
ping_timeout=60,
close_timeout=60,
open_timeout=30
)
logger.info("Gateway connected to training service WebSocket", job_id=job_id)
async def forward_frontend_to_training():
"""Forward messages from frontend to training service"""
try:
while training_ws and training_ws.state == State.OPEN:
data = await websocket.receive()
if data.get("type") == "websocket.receive":
if "text" in data:
await training_ws.send(data["text"])
@@ -683,30 +417,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
await training_ws.send(data["bytes"])
elif data.get("type") == "websocket.disconnect":
break
except Exception as e:
logger.debug("Frontend to training forward ended", error=str(e))
except Exception:
pass
async def forward_training_to_frontend():
"""Forward messages from training service to frontend"""
message_count = 0
try:
while training_ws and training_ws.state == State.OPEN:
message = await training_ws.recv()
await websocket.send_text(message)
message_count += 1
except Exception:
pass
# Log every 10th message to track connectivity
if message_count % 10 == 0:
logger.debug("WebSocket proxy active",
job_id=job_id,
messages_forwarded=message_count)
except Exception as e:
logger.info("Training to frontend forward ended",
job_id=job_id,
messages_forwarded=message_count,
error=str(e))
# Run both forwarding tasks concurrently
await asyncio.gather(
forward_frontend_to_training(),
forward_training_to_frontend(),
@@ -716,20 +437,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
except Exception as e:
logger.error("WebSocket proxy error", job_id=job_id, error=str(e))
finally:
# Cleanup
if training_ws and training_ws.state == State.OPEN:
try:
await training_ws.close()
except:
pass
try:
if not websocket.client_state.name == 'DISCONNECTED':
await websocket.close(code=1000, reason="Proxy closed")
except:
pass
logger.info("WebSocket proxy connection closed", job_id=job_id)
if __name__ == "__main__":
import uvicorn

View File

@@ -48,9 +48,9 @@ signoz:
signoz_traces_ttl_duration_hrs: "168"
signoz_metrics_ttl_duration_hrs: "168"
signoz_logs_ttl_duration_hrs: "168"
# OpAMP Server Configuration
signoz_opamp_server_enabled: "true"
signoz_opamp_server_endpoint: "0.0.0.0:4320"
# OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
persistence:
enabled: true
@@ -149,9 +149,10 @@ otelCollector:
repository: signoz/signoz-otel-collector
tag: v0.129.12 # Latest recommended version
# OpAMP Configuration - Enabled for dynamic configuration management
# Note: OpAMP allows remote configuration management via SigNoz backend
# This replaces the manual kubectl patch approach
# OpAMP Configuration - DISABLED for development
# OpAMP is designed for production with remote config management
# In dev, it causes gRPC instability and collector reloads
# We use static configuration instead
# Init containers for the Otel Collector pod
initContainers:
@@ -231,6 +232,9 @@ otelCollector:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
# Enable OpAMP for dynamic configuration management
# Disable OpAMP - use static configuration only
# Use 'args' instead of 'extraArgs' to completely override the command
command:
name: /signoz-otel-collector
extraArgs:
args:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
allowed_origins:
- "*"
# Filelog receiver for Kubernetes pod logs
# Collects container stdout/stderr from /var/log/pods
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
# Exclude SigNoz's own logs to avoid recursive collection
- /var/log/pods/bakery-ia_signoz-*/*/*.log
include_file_path: true
include_file_name: false
operators:
# Parse CRI-O / containerd log format
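# Example (illustrative): a containerd/CRI log line such as
#   2025-01-09T22:14:12.123456789Z stdout F {"level":"info","msg":"ready"}
# is split by the regex below into time, stream ("stdout"), logtag ("F")
# and log (the remaining message payload)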
- type: regex_parser
regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# Fix timestamp parsing - extract from the parsed time field
- type: move
from: attributes.time
to: attributes.timestamp
# Extract Kubernetes metadata from file path
- type: regex_parser
id: extract_metadata_from_filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
parse_from: attributes["log.file.path"]
# Move metadata to resource attributes
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.log
to: body
# Kubernetes Cluster Receiver - Collects cluster-level metrics
# Provides information about nodes, namespaces, pods, and other cluster resources
k8s_cluster:
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- pods
# PostgreSQL receivers for database metrics
# ENABLED: monitoring user is configured and credentials are stored in secrets
# Collects metrics directly from PostgreSQL databases with proper TLS
@@ -538,6 +602,43 @@ otelCollector:
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# Prometheus Receiver - Scrapes metrics from Kubernetes API
# Simplified configuration using only Kubernetes API metrics
prometheus:
config:
scrape_configs:
- job_name: 'kubernetes-nodes-cadvisor'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
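# Net effect (illustrative): a node named "worker-1" would be scraped at
# https://kubernetes.default.svc:443/api/v1/nodes/worker-1/proxy/metrics/cadvisor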
- job_name: 'kubernetes-apiserver'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
processors:
# Batch processor for better performance (optimized for high throughput)
batch:
@@ -562,6 +663,25 @@ otelCollector:
detectors: [env, system, docker]
timeout: 5s
# Kubernetes attributes processor - CRITICAL for logs
# Extracts pod, namespace, container metadata from log attributes
k8sattributes:
auth_type: "serviceAccount"
passthrough: false
extract:
metadata:
- k8s.pod.name
- k8s.pod.uid
- k8s.deployment.name
- k8s.namespace.name
- k8s.node.name
- k8s.container.name
labels:
- tag_name: "app"
- tag_name: "pod-template-hash"
annotations:
- tag_name: "description"
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq]
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection]
exporters: [signozclickhousemetrics]
@@ -653,17 +773,38 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, k8sattributes]
exporters: [clickhouselogsexporter]
# Additional Configuration
serviceAccount:
create: true
annotations: {}
name: ""
name: "signoz-otel-collector"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
rbac:
create: true
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
resources: ["deployments", "daemonsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
# Security Context
securityContext:

View File

@@ -66,6 +66,11 @@ signoz:
signoz_traces_ttl_duration_hrs: "720"
signoz_metrics_ttl_duration_hrs: "720"
signoz_logs_ttl_duration_hrs: "720"
# OpAMP Server Configuration
# WARNING: OpAMP can cause gRPC instability and collector reloads
# Only enable if you have a stable OpAMP backend server
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
# SMTP configuration for email alerts
signoz_smtp_enabled: "true"
signoz_smtp_host: "smtp.gmail.com"
@@ -247,17 +252,52 @@ otelCollector:
tag: v0.129.12 # Updated to latest recommended version
pullPolicy: IfNotPresent
# Init containers for the Otel Collector pod
initContainers:
fix-postgres-tls:
enabled: true
image:
registry: docker.io
repository: busybox
tag: 1.35
pullPolicy: IfNotPresent
command:
- sh
- -c
- |
echo "Fixing PostgreSQL TLS file permissions..."
cp /etc/postgres-tls-source/* /etc/postgres-tls/
chmod 600 /etc/postgres-tls/server-key.pem
chmod 644 /etc/postgres-tls/server-cert.pem
chmod 644 /etc/postgres-tls/ca-cert.pem
echo "PostgreSQL TLS permissions fixed"
volumeMounts:
- name: postgres-tls-source
mountPath: /etc/postgres-tls-source
readOnly: true
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
service:
type: ClusterIP
ports:
- name: otlp-grpc
port: 4317
targetPort: 4317
protocol: TCP
- name: otlp-http
port: 4318
targetPort: 4318
protocol: TCP
- name: prometheus
port: 8889
targetPort: 8889
protocol: TCP
- name: metrics
port: 8888
- name: healthcheck
port: 13133
targetPort: 8888
protocol: TCP
resources:
requests:
@@ -267,6 +307,50 @@ otelCollector:
cpu: 2000m
memory: 2Gi
# Additional environment variables for receivers
additionalEnvs:
POSTGRES_MONITOR_USER: "monitoring"
POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
RABBITMQ_USER: "bakery"
RABBITMQ_PASSWORD: "forecast123"
# Mount TLS certificates for secure connections
extraVolumes:
- name: redis-tls
secret:
secretName: redis-tls-secret
- name: postgres-tls
secret:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
mountPath: /etc/redis-tls
readOnly: true
- name: postgres-tls
mountPath: /etc/postgres-tls-source
readOnly: true
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
# Enable OpAMP for dynamic configuration management
command:
name: /signoz-otel-collector
extraArgs:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# Full OTEL Collector Configuration
config:
# Connectors - bridge between pipelines
@@ -297,14 +381,358 @@ otelCollector:
- "https://monitoring.bakewise.ai"
- "https://*.bakewise.ai"
# Filelog receiver for Kubernetes pod logs
# Collects container stdout/stderr from /var/log/pods
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
# Exclude SigNoz's own logs to avoid recursive collection
- /var/log/pods/bakery-ia_signoz-*/*/*.log
include_file_path: true
include_file_name: false
operators:
# Parse CRI-O / containerd log format
- type: regex_parser
regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# Fix timestamp parsing - extract from the parsed time field
- type: move
from: attributes.time
to: attributes.timestamp
# Extract Kubernetes metadata from file path
- type: regex_parser
id: extract_metadata_from_filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
parse_from: attributes["log.file.path"]
# Move metadata to resource attributes
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.log
to: body
# Kubernetes Cluster Receiver - Collects cluster-level metrics
# Provides information about nodes, namespaces, pods, and other cluster resources
k8s_cluster:
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- pods
# Prometheus receiver for scraping metrics
prometheus:
config:
scrape_configs:
- job_name: 'otel-collector'
- job_name: 'kubernetes-nodes-cadvisor'
scrape_interval: 30s
static_configs:
- targets: ['localhost:8888']
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Redis receiver for cache metrics
# ENABLED: Using existing credentials from redis-secrets with TLS
redis:
endpoint: redis-service.bakery-ia:6379
password: ${env:REDIS_PASSWORD}
collection_interval: 60s
transport: tcp
tls:
insecure_skip_verify: false
cert_file: /etc/redis-tls/redis-cert.pem
key_file: /etc/redis-tls/redis-key.pem
ca_file: /etc/redis-tls/ca-cert.pem
metrics:
redis.maxmemory:
enabled: true
redis.cmd.latency:
enabled: true
# RabbitMQ receiver via management API
# ENABLED: Using existing credentials from rabbitmq-secrets
rabbitmq:
endpoint: http://rabbitmq-service.bakery-ia:15672
username: ${env:RABBITMQ_USER}
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# PostgreSQL receivers for database metrics
# Monitor all databases with proper TLS configuration
postgresql/auth:
endpoint: auth-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- auth_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/inventory:
endpoint: inventory-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- inventory_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/orders:
endpoint: orders-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- orders_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/ai-insights:
endpoint: ai-insights-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- ai_insights_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/alert-processor:
endpoint: alert-processor-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- alert_processor_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/distribution:
endpoint: distribution-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- distribution_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/external:
endpoint: external-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- external_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/forecasting:
endpoint: forecasting-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- forecasting_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/notification:
endpoint: notification-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- notification_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/orchestrator:
endpoint: orchestrator-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- orchestrator_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/pos:
endpoint: pos-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- pos_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/procurement:
endpoint: procurement-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- procurement_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/production:
endpoint: production-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- production_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/recipes:
endpoint: recipes-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- recipes_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/sales:
endpoint: sales-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- sales_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/suppliers:
endpoint: suppliers-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- suppliers_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/tenant:
endpoint: tenant-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- tenant_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/training:
endpoint: training-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- training_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
processors:
# High-performance batch processing (official recommendation)
@@ -326,7 +754,7 @@ otelCollector:
# Resource detection for K8s
resourcedetection:
detectors: [env, system, docker, kubernetes]
detectors: [env, system, docker]
timeout: 5s
# Add resource attributes
@@ -339,6 +767,26 @@ otelCollector:
value: bakery-ia-prod
action: upsert
# Kubernetes attributes processor - CRITICAL for logs
# Extracts pod, namespace, container metadata from log attributes
k8sattributes:
auth_type: "serviceAccount"
passthrough: false
extract:
metadata:
- k8s.pod.name
- k8s.pod.uid
- k8s.deployment.name
- k8s.namespace.name
- k8s.node.name
- k8s.container.name
labels:
- tag_name: "app"
- tag_name: "pod-template-hash"
- tag_name: "version"
annotations:
- tag_name: "description"
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -354,9 +802,9 @@ otelCollector:
- name: signoz.collector.id
exporters:
# Export to SigNoz ClickHouse
# ClickHouse exporter for traces
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
datasource: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_traces
timeout: 10s
retry_on_failure:
enabled: true
@@ -364,8 +812,9 @@ otelCollector:
max_interval: 30s
max_elapsed_time: 300s
# ClickHouse exporter for metrics
signozclickhousemetrics:
endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
timeout: 10s
retry_on_failure:
enabled: true
@@ -375,32 +824,32 @@ otelCollector:
# ClickHouse exporter for meter data (usage metrics)
signozclickhousemeter:
dsn: "tcp://clickhouse:9000/?database=signoz_meter"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_meter"
timeout: 45s
sending_queue:
enabled: false
# ClickHouse exporter for logs
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/?database=signoz_logs
dsn: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_logs
timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# Metadata exporter for service metadata
metadataexporter:
dsn: "tcp://clickhouse:9000/?database=signoz_metadata"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metadata"
timeout: 10s
cache:
provider: in_memory
# Debug exporter for debugging (replaces deprecated logging exporter)
# Debug exporter (optional)
debug:
verbosity: detailed
sampling_initial: 2
sampling_thereafter: 500
sampling_initial: 5
sampling_thereafter: 200
service:
extensions: [health_check, zpages]
@@ -411,9 +860,16 @@ otelCollector:
processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection, resource]
exporters: [clickhousetraces, metadataexporter, signozmeter]
# Metrics pipeline
# Metrics pipeline - includes all infrastructure receivers
metrics:
receivers: [otlp, prometheus]
receivers: [otlp,
postgresql/auth, postgresql/inventory, postgresql/orders,
postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
postgresql/external, postgresql/forecasting, postgresql/notification,
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [signozclickhousemetrics]
@@ -423,10 +879,10 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, resource, k8sattributes]
exporters: [clickhouselogsexporter]
# HPA for OTEL Collector
@@ -455,6 +911,27 @@ serviceAccount:
annotations: {}
name: "signoz"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster receiver to access Kubernetes API
rbac:
create: true
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
resources: ["deployments", "daemonsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
# Security Context
securityContext:
runAsNonRoot: true

View File

@@ -15,9 +15,13 @@ data:
LOG_LEVEL: "INFO"
# Observability Settings - SigNoz enabled
# Note: Detailed OTEL configuration is in the OBSERVABILITY section below
ENABLE_TRACING: "true"
ENABLE_METRICS: "true"
ENABLE_LOGS: "true"
ENABLE_OTEL_METRICS: "true"
ENABLE_SYSTEM_METRICS: "true"
OTEL_LOGS_EXPORTER: "otlp"
# Database initialization settings
# IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -384,15 +388,44 @@ data:
# ================================================================
# OBSERVABILITY - SigNoz (Unified Monitoring)
# ================================================================
# OpenTelemetry Configuration - Direct to SigNoz
# IMPORTANT: gRPC endpoints should NOT include http:// prefix
# OpenTelemetry Configuration - Direct to SigNoz OTel Collector
#
# ENDPOINT CONFIGURATION:
# - OTEL_EXPORTER_OTLP_ENDPOINT: Base gRPC endpoint (host:port format, NO http:// prefix)
# Used by traces and metrics (gRPC) by default
# Format: "host:4317" (gRPC port)
#
# PROTOCOL USAGE:
# - Traces: gRPC (port 4317) - High performance, low latency
# - Metrics: gRPC (port 4317) - Efficient batch export
# - Logs: HTTP (port 4318) - Required for OTLP log protocol
#
# The monitoring library automatically handles:
# - Converting gRPC endpoint (4317) to HTTP endpoint (4318) for logs
# - Adding proper paths (/v1/traces, /v1/metrics, /v1/logs)
# - Protocol prefixes (http:// for HTTP, none for gRPC)
#
# Base OTLP endpoint (gRPC format - used by traces and metrics)
OTEL_EXPORTER_OTLP_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
# Protocol configuration (gRPC is recommended for better performance)
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
# Optional: Signal-specific endpoint overrides (if different from base)
# OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
# OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
# OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
# Optional: Protocol overrides per signal
# OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: "grpc"
# OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: "grpc"
# Note: Logs always use HTTP protocol regardless of this setting
# Resource attributes (added to all telemetry signals)
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
OTEL_LOGS_EXPORTER: "otlp"
# SigNoz Endpoints (v0.106.0+ unified service)
# SigNoz service endpoints (for UI and API access)
SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
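A minimal sketch of the endpoint handling described above, assuming only the convention stated in the comments (gRPC base endpoint on 4317; logs derived as an HTTP endpoint on 4318 with the /v1/logs path). This is illustrative and not the monitoring library's actual code:

import os
from typing import Dict, Optional

def derive_otlp_endpoints(base: Optional[str] = None) -> Dict[str, str]:
    """Illustrative helper: derive per-signal OTLP endpoints from the gRPC base."""
    base = base or os.getenv(
        "OTEL_EXPORTER_OTLP_ENDPOINT",
        "signoz-otel-collector.bakery-ia.svc.cluster.local:4317",
    )
    host = base.rsplit(":", 1)[0]  # strip the gRPC port
    return {
        "traces": base,                         # gRPC, port 4317
        "metrics": base,                        # gRPC, port 4317
        "logs": f"http://{host}:4318/v1/logs",  # HTTP, port 4318
    }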

View File

@@ -1,104 +1,170 @@
{
"dashboard": {
"title": "Bakery IA - Alert Management",
"description": "Alert monitoring and management dashboard",
"tags": ["alerts", "monitoring", "management"],
"panels": [
{
"title": "Active Alerts",
"type": "stat",
"query": {
"metric": "alerts_active",
"aggregate": "sum",
"filters": [
{
"key": "severity",
"operator": "=",
"value": "${severity}"
},
{
"key": "status",
"operator": "=",
"value": "firing"
}
]
},
"unit": "number"
},
{
"title": "Alert Rate",
"type": "timeseries",
"query": {
"metric": "alerts_total",
"aggregate": "rate",
"filters": [
{
"key": "severity",
"operator": "=",
"value": "${severity}"
}
]
},
"unit": "alerts/s"
},
{
"title": "Alerts by Severity",
"type": "pie",
"query": {
"metric": "alerts_total",
"aggregate": "sum",
"groupBy": ["severity"],
"filters": [
{
"key": "severity",
"operator": "=",
"value": "${severity}"
}
]
}
},
{
"title": "Alerts by Status",
"type": "pie",
"query": {
"metric": "alerts_total",
"aggregate": "sum",
"groupBy": ["status"],
"filters": [
{
"key": "status",
"operator": "=",
"value": "${status}"
}
]
}
}
],
"variables": [
{
"name": "severity",
"label": "Severity",
"type": "dropdown",
"default": "*",
"values": ["*", "critical", "high", "medium", "low"]
},
{
"name": "status",
"label": "Status",
"type": "dropdown",
"default": "*",
"values": ["*", "firing", "resolved", "acknowledged"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Alert monitoring and management dashboard",
"tags": ["alerts", "monitoring", "management"],
"name": "bakery-ia-alert-management",
"title": "Bakery IA - Alert Management",
"uploadedGrafana": false,
"uuid": "bakery-ia-alerts-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "active-alerts",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "alert-rate",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'alerts_active' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "active-alerts",
"title": "Active Alerts",
"description": "Number of currently active alerts",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "alerts_active",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Active Alerts",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "alert-rate",
"title": "Alert Rate",
"description": "Rate of alerts over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "alerts_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "alerts/s"
}
]
}
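
The v4 dashboards above pair a top-level "layout" list with a "widgets" list: each layout entry's "i" must name an existing widget "id", otherwise that panel is never placed. A small sanity check along these lines (illustrative, not part of the repository) can catch such mismatches before importing the JSON into SigNoz:

import json
import sys

def check_dashboard(path: str) -> list[str]:
    """Report layout/widget id mismatches in a v4 dashboard JSON file."""
    with open(path) as fh:
        dash = json.load(fh)
    layout_ids = {item["i"] for item in dash.get("layout", [])}
    widget_ids = {w["id"] for w in dash.get("widgets", [])}
    problems = [f"layout references missing widget '{i}'" for i in sorted(layout_ids - widget_ids)]
    problems += [f"widget '{i}' has no layout entry" for i in sorted(widget_ids - layout_ids)]
    return problems

if __name__ == "__main__":
    issues = check_dashboard(sys.argv[1])
    print("\n".join(issues) or "dashboard layout and widgets are consistent")
    sys.exit(1 if issues else 0)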

View File

@@ -1,102 +1,351 @@
{
"dashboard": {
"title": "Bakery IA - API Performance",
"description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
"tags": ["api", "performance", "rest", "graphql"],
"panels": [
{
"title": "Request Volume",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["api"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
}
]
},
"unit": "req/s"
},
{
"title": "Error Rate",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["api", "status"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
},
{
"key": "status",
"operator": "=~",
"value": "5.."
}
]
},
"unit": "req/s"
},
{
"title": "Average Response Time",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_sum",
"aggregate": "avg",
"groupBy": ["api", "endpoint"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
}
]
},
"unit": "seconds"
},
{
"title": "P95 Latency",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_bucket",
"aggregate": "histogram_quantile",
"quantile": 0.95,
"groupBy": ["api", "endpoint"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
}
]
},
"unit": "seconds"
}
],
"variables": [
{
"name": "api",
"label": "API Service",
"type": "dropdown",
"default": "*",
"values": ["*", "gateway-api", "auth-api", "inventory-api", "production-api", "forecasting-api", "procurement-api"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
"tags": ["api", "performance", "rest", "graphql"],
"name": "bakery-ia-api-performance",
"title": "Bakery IA - API Performance",
"uploadedGrafana": false,
"uuid": "bakery-ia-api-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "request-volume",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "error-rate",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "avg-response-time",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "p95-latency",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by API service",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'http_server_requests_seconds_count' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "request-volume",
"title": "Request Volume",
"description": "API request volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{api.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "req/s"
},
{
"id": "error-rate",
"title": "Error Rate",
"description": "API error rate by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.api}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=~",
"value": "5.."
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
{
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{api.name}} - {{status_code}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "req/s"
},
{
"id": "avg-response-time",
"title": "Average Response Time",
"description": "Average API response time by endpoint",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "http_server_requests_seconds_sum",
"dataType": "float64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.api}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
{
"key": "endpoint",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{api.name}} - {{endpoint}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
},
{
"id": "p95-latency",
"title": "P95 Latency",
"description": "95th percentile latency by endpoint",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "histogram_quantile",
"aggregateAttribute": {
"key": "http_server_requests_seconds_bucket",
"dataType": "float64",
"type": "Histogram",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.api}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
{
"key": "endpoint",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{api.name}} - {{endpoint}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
}
]
}
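
The "service" variable above is a QUERY variable: SigNoz resolves its dropdown values by running the ClickHouse statement in queryValue. To preview what the dropdown will contain, the same statement can be run directly; the snippet below is a sketch that assumes the clickhouse-driver package and a reachable ClickHouse host (the host name here is a placeholder), while the SQL is copied verbatim from the variable definition.

from clickhouse_driver import Client

VARIABLE_QUERY = (
    "SELECT DISTINCT(resource_attrs['service.name']) as value "
    "FROM signoz_metrics.distributed_time_series_v4_1day "
    "WHERE metric_name = 'http_server_requests_seconds_count' AND value != '' "
    "ORDER BY value"
)

def preview_variable_values(host: str) -> list[str]:
    """Run the dashboard variable query and return the values the dropdown would show."""
    client = Client(host=host)
    return [row[0] for row in client.execute(VARIABLE_QUERY)]

if __name__ == "__main__":
    # Placeholder host name; point this at the ClickHouse instance backing SigNoz.
    for value in preview_variable_values("signoz-clickhouse.bakery-ia.svc.cluster.local"):
        print(value)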

View File

@@ -1,101 +1,333 @@
{
"dashboard": {
"title": "Bakery IA - Application Performance",
"description": "Application performance monitoring dashboard for Bakery IA microservices",
"tags": ["application", "performance", "apm"],
"panels": [
{
"title": "Request Rate",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "req/s"
},
{
"title": "Error Rate",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service", "status"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "status",
"operator": "=~",
"value": "5.."
}
]
},
"unit": "req/s"
},
{
"title": "Average Response Time",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_sum",
"aggregate": "avg",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "seconds"
},
{
"title": "Throughput",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "rate",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "req/s"
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "forecasting-service", "inventory-service", "production-service", "procurement-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Application performance monitoring dashboard using distributed traces and metrics",
"tags": ["application", "performance", "traces", "apm"],
"name": "bakery-ia-application-performance",
"title": "Bakery IA - Application Performance (APM)",
"uploadedGrafana": false,
"uuid": "bakery-ia-apm-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "latency-p99",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-30m",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "request-rate",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "error-rate",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "avg-duration",
"moved": false,
"static": false
}
}
}
],
"variables": {
"service_name": {
"id": "service-var",
"name": "service_name",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(serviceName) FROM signoz_traces.distributed_signoz_index_v2 ORDER BY serviceName",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "latency-p99",
"title": "P99 Latency",
"description": "99th percentile latency for selected service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "p99",
"aggregateAttribute": {
"key": "duration_ns",
"dataType": "float64",
"type": "",
"isColumn": true
},
"timeAggregation": "avg",
"spaceAggregation": "p99",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "ms"
},
{
"id": "request-rate",
"title": "Request Rate",
"description": "Requests per second for the service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "count",
"aggregateAttribute": {
"key": "",
"dataType": "",
"type": "",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "reqps"
},
{
"id": "error-rate",
"title": "Error Rate",
"description": "Error rate percentage for the service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "count",
"aggregateAttribute": {
"key": "",
"dataType": "",
"type": "",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "STATUS_CODE_ERROR"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "reqps"
},
{
"id": "avg-duration",
"title": "Average Duration",
"description": "Average request duration",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "duration_ns",
"dataType": "float64",
"type": "",
"isColumn": true
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "ms"
}
]
}

View File

@@ -1,101 +1,425 @@
{
"dashboard": {
"title": "Bakery IA - Database Performance",
"description": "Comprehensive database performance monitoring for PostgreSQL and Redis",
"tags": ["database", "postgresql", "redis", "performance"],
"panels": [
{
"title": "Database Connections",
"type": "timeseries",
"query": {
"metric": "pg_stat_activity_count",
"aggregate": "sum",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
}
]
},
"unit": "number"
},
{
"title": "Active Queries",
"type": "timeseries",
"query": {
"metric": "pg_stat_activity_count",
"aggregate": "sum",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
},
{
"key": "state",
"operator": "=",
"value": "active"
}
]
},
"unit": "number"
},
{
"title": "Database Size",
"type": "timeseries",
"query": {
"metric": "pg_database_size_bytes",
"aggregate": "sum",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
}
]
},
"unit": "bytes"
},
{
"title": "Query Execution Time",
"type": "timeseries",
"query": {
"metric": "pg_stat_statements_total_time",
"aggregate": "avg",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
}
]
},
"unit": "seconds"
}
],
"variables": [
{
"name": "database",
"label": "Database",
"type": "dropdown",
"default": "*",
"values": ["*", "postgresql", "redis"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive database performance monitoring for PostgreSQL, Redis, and RabbitMQ",
"tags": ["database", "postgresql", "redis", "rabbitmq", "performance"],
"name": "bakery-ia-database-performance",
"title": "Bakery IA - Database Performance",
"uploadedGrafana": false,
"uuid": "bakery-ia-db-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "pg-connections",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "pg-db-size",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "redis-connected-clients",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "redis-memory",
"moved": false,
"static": false
},
{
"x": 0,
"y": 6,
"w": 6,
"h": 3,
"i": "rabbitmq-messages",
"moved": false,
"static": false
},
{
"x": 6,
"y": 6,
"w": 6,
"h": 3,
"i": "rabbitmq-consumers",
"moved": false,
"static": false
}
}
}
],
"variables": {
"database": {
"id": "database-var",
"name": "database",
"description": "Filter by PostgreSQL database name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['postgresql.database.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "pg-connections",
"title": "PostgreSQL - Active Connections",
"description": "Number of active PostgreSQL connections",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "postgresql.backends",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.database}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{postgresql.database.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "pg-db-size",
"title": "PostgreSQL - Database Size",
"description": "Size of PostgreSQL databases in bytes",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "postgresql.db_size",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.database}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{postgresql.database.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "bytes"
},
{
"id": "redis-connected-clients",
"title": "Redis - Connected Clients",
"description": "Number of clients connected to Redis",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "redis.clients.connected",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "host.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{host.name}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "redis-memory",
"title": "Redis - Memory Usage",
"description": "Redis memory usage in bytes",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "redis.memory.used",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "host.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{host.name}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "bytes"
},
{
"id": "rabbitmq-messages",
"title": "RabbitMQ - Current Messages",
"description": "Number of messages currently in RabbitMQ queues",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "rabbitmq.message.current",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "queue",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "Queue: {{queue}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "rabbitmq-consumers",
"title": "RabbitMQ - Consumer Count",
"description": "Number of consumers per queue",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "rabbitmq.consumer.count",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "queue",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "Queue: {{queue}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
]
}

View File

@@ -1,105 +1,348 @@
{
"dashboard": {
"title": "Bakery IA - Error Tracking",
"description": "Comprehensive error tracking and analysis dashboard",
"tags": ["errors", "exceptions", "tracking"],
"panels": [
{
"title": "Total Errors",
"type": "stat",
"query": {
"metric": "error_total",
"aggregate": "sum",
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "Error Rate",
"type": "timeseries",
"query": {
"metric": "error_total",
"aggregate": "rate",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "errors/s"
},
{
"title": "HTTP 5xx Errors",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service", "status"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "status",
"operator": "=~",
"value": "5.."
}
]
},
"unit": "number"
},
{
"title": "HTTP 4xx Errors",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service", "status"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "status",
"operator": "=~",
"value": "4.."
}
]
},
"unit": "number"
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive error tracking and analysis dashboard",
"tags": ["errors", "exceptions", "tracking"],
"name": "bakery-ia-error-tracking",
"title": "Bakery IA - Error Tracking",
"uploadedGrafana": false,
"uuid": "bakery-ia-errors-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "total-errors",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "error-rate",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "http-5xx",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "http-4xx",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'error_total' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "total-errors",
"title": "Total Errors",
"description": "Total number of errors across all services",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "error_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Total Errors",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "error-rate",
"title": "Error Rate",
"description": "Error rate over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "error_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "errors/s"
},
{
"id": "http-5xx",
"title": "HTTP 5xx Errors",
"description": "Server errors (5xx status codes)",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=~",
"value": "5.."
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
{
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{serviceName}} - {{status_code}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "number"
},
{
"id": "http-4xx",
"title": "HTTP 4xx Errors",
"description": "Client errors (4xx status codes)",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=~",
"value": "4.."
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
{
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{serviceName}} - {{status_code}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "number"
}
]
}

View File

@@ -1,105 +1,423 @@
{
"dashboard": {
"title": "Bakery IA - Infrastructure Monitoring",
"description": "Comprehensive infrastructure monitoring dashboard for Bakery IA system",
"tags": ["infrastructure", "system", "kubernetes"],
"panels": [
{
"title": "CPU Usage",
"type": "timeseries",
"query": {
"metric": "container_cpu_usage_seconds_total",
"aggregate": "sum",
"groupBy": ["namespace"],
"filters": [
"description": "Comprehensive infrastructure monitoring dashboard for Bakery IA Kubernetes cluster",
"tags": ["infrastructure", "kubernetes", "k8s", "system"],
"name": "bakery-ia-infrastructure-monitoring",
"title": "Bakery IA - Infrastructure Monitoring",
"uploadedGrafana": false,
"uuid": "bakery-ia-infra-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "pod-count",
"moved": false,
"static": false
},
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "pod-phase",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "container-restarts",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "node-condition",
"moved": false,
"static": false
},
{
"x": 0,
"y": 6,
"w": 12,
"h": 3,
"i": "deployment-status",
"moved": false,
"static": false
}
],
"variables": {
"namespace": {
"id": "namespace-var",
"name": "namespace",
"description": "Filter by Kubernetes namespace",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'k8s.pod.phase' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": "bakery-ia"
}
},
"widgets": [
{
"id": "pod-count",
"title": "Total Pods",
"description": "Total number of pods in the namespace",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "count",
"aggregateAttribute": {
"key": "k8s.pod.phase",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Total Pods",
"reduceTo": "sum"
}
]
],
"queryFormulas": []
},
"unit": "percent",
"yAxis": {
"min": 0,
"max": 100
}
"queryType": "builder"
},
{
"title": "Memory Usage",
"type": "timeseries",
"query": {
"metric": "container_memory_working_set_bytes",
"aggregate": "sum",
"groupBy": ["namespace"],
"filters": [
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "pod-phase",
"title": "Pod Phase Distribution",
"description": "Pods by phase (Running, Pending, Failed, etc.)",
"isStacked": true,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "k8s.pod.phase",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "phase",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{phase}}",
"reduceTo": "sum"
}
]
],
"queryFormulas": []
},
"unit": "bytes"
"queryType": "builder"
},
{
"title": "Network Traffic",
"type": "timeseries",
"query": {
"metric": "container_network_receive_bytes_total",
"aggregate": "sum",
"groupBy": ["namespace"],
"filters": [
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "container-restarts",
"title": "Container Restarts",
"description": "Container restart count over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "k8s.container.restarts",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "increase",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.pod.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.pod.name}}",
"reduceTo": "sum"
}
]
],
"queryFormulas": []
},
"unit": "bytes"
"queryType": "builder"
},
{
"title": "Pod Status",
"type": "stat",
"query": {
"metric": "kube_pod_status_phase",
"aggregate": "count",
"groupBy": ["phase"],
"filters": [
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "node-condition",
"title": "Node Conditions",
"description": "Node condition status (Ready, MemoryPressure, DiskPressure, etc.)",
"isStacked": true,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "k8s.node.condition_ready",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.node.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.node.name}} Ready",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "deployment-status",
"title": "Deployment Status (Desired vs Available)",
"description": "Deployment replicas: desired vs available",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "k8s.deployment.desired",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.deployment.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.deployment.name}} (desired)",
"reduceTo": "avg"
},
{
"key": "phase",
"operator": "=",
"value": "Running"
"dataSource": "metrics",
"queryName": "B",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "k8s.deployment.available",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "B",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.deployment.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.deployment.name}} (available)",
"reduceTo": "avg"
}
]
],
"queryFormulas": []
},
"unit": "number"
}
],
"variables": [
{
"name": "namespace",
"label": "Namespace",
"type": "dropdown",
"default": "bakery-ia",
"values": ["bakery-ia", "default", "kube-system"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
}
}
]
}

View File

@@ -1,99 +1,333 @@
{
"dashboard": {
"title": "Bakery IA - Log Analysis",
"description": "Comprehensive log analysis and search dashboard",
"tags": ["logs", "analysis", "search"],
"panels": [
{
"title": "Log Volume",
"type": "timeseries",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "logs/s"
},
{
"title": "Error Logs",
"type": "timeseries",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "level",
"operator": "=",
"value": "error"
}
]
},
"unit": "logs/s"
},
{
"title": "Logs by Level",
"type": "pie",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["level"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
}
},
{
"title": "Logs by Service",
"type": "pie",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
}
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive log analysis and search dashboard",
"tags": ["logs", "analysis", "search"],
"name": "bakery-ia-log-analysis",
"title": "Bakery IA - Log Analysis",
"uploadedGrafana": false,
"uuid": "bakery-ia-logs-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "log-volume",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "error-logs",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-level",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-service",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'log_lines_total' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "log-volume",
"title": "Log Volume",
"description": "Total log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "error-logs",
"title": "Error Logs",
"description": "Error log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "level",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=",
"value": "error"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}} (errors)",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "logs-by-level",
"title": "Logs by Level",
"description": "Distribution of logs by severity level",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "level",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{level}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "logs-by-service",
"title": "Logs by Service",
"description": "Distribution of logs by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
]
}

View File

@@ -1,92 +1,295 @@
{
"dashboard": {
"title": "Bakery IA - System Health",
"description": "Comprehensive system health monitoring dashboard",
"tags": ["system", "health", "monitoring"],
"panels": [
{
"title": "System Availability",
"type": "stat",
"query": {
"metric": "system_availability",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "percent"
},
{
"title": "Service Health Score",
"type": "stat",
"query": {
"metric": "service_health_score",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "number"
},
{
"title": "CPU Usage",
"type": "timeseries",
"query": {
"metric": "system_cpu_usage",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "percent"
},
{
"title": "Memory Usage",
"type": "timeseries",
"query": {
"metric": "system_memory_usage",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "percent"
}
],
"variables": [
{
"name": "namespace",
"label": "Namespace",
"type": "dropdown",
"default": "bakery-ia",
"values": ["bakery-ia", "default"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive system health monitoring dashboard",
"tags": ["system", "health", "monitoring"],
"name": "bakery-ia-system-health",
"title": "Bakery IA - System Health",
"uploadedGrafana": false,
"uuid": "bakery-ia-health-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "system-availability",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "health-score",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "cpu-usage",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "memory-usage",
"moved": false,
"static": false
}
}
],
"variables": {
"namespace": {
"id": "namespace-var",
"name": "namespace",
"description": "Filter by Kubernetes namespace",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'system_availability' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": "bakery-ia"
}
},
"widgets": [
{
"id": "system-availability",
"title": "System Availability",
"description": "Overall system availability percentage",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_availability",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "System Availability",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "health-score",
"title": "Service Health Score",
"description": "Overall service health score",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "service_health_score",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Health Score",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "cpu-usage",
"title": "CPU Usage",
"description": "System CPU usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_cpu_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "CPU Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "memory-usage",
"title": "Memory Usage",
"description": "System memory usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_memory_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Memory Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
}
]
}

View File

@@ -1,96 +1,323 @@
{
"dashboard": {
"title": "Bakery IA - User Activity",
"description": "User activity and behavior monitoring dashboard",
"tags": ["user", "activity", "behavior"],
"panels": [
{
"title": "Active Users",
"type": "timeseries",
"query": {
"metric": "active_users",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "User Sessions",
"type": "timeseries",
"query": {
"metric": "user_sessions_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "API Calls per User",
"type": "timeseries",
"query": {
"metric": "api_calls_per_user",
"aggregate": "avg",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "Session Duration",
"type": "timeseries",
"query": {
"metric": "session_duration_seconds",
"aggregate": "avg",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "seconds"
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "User activity and behavior monitoring dashboard",
"tags": ["user", "activity", "behavior"],
"name": "bakery-ia-user-activity",
"title": "Bakery IA - User Activity",
"uploadedGrafana": false,
"uuid": "bakery-ia-user-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "active-users",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "user-sessions",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "api-calls-per-user",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "session-duration",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'active_users' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "active-users",
"title": "Active Users",
"description": "Number of active users by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "active_users",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{service.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "user-sessions",
"title": "User Sessions",
"description": "Total user sessions by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "user_sessions_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "api-calls-per-user",
"title": "API Calls per User",
"description": "Average API calls per user by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "api_calls_per_user",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "session-duration",
"title": "Session Duration",
"description": "Average session duration by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "session_duration_seconds",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
}
]
}
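The panels above query active_users, user_sessions_total, api_calls_per_user and session_duration_seconds, so each service has to publish them over OTLP. Below is a minimal sketch of emitting two of them with the OpenTelemetry metrics API; active_user_count() and the attribute values are illustrative placeholders, and the actual instrumentation points in the services are an assumption, not part of this commit.

# Hedged sketch: emitting the metrics this dashboard queries.
# Assumes a MeterProvider is already configured (e.g. via setup_otel_metrics);
# active_user_count() is a placeholder for however the service tracks users.
from opentelemetry import metrics

meter = metrics.get_meter("user-activity")

# Counter behind the "User Sessions" panel; serviceName mirrors the groupBy key.
session_counter = meter.create_counter(
    "user_sessions_total",
    description="Total user sessions",
)

def _observe_active_users(options):
    # Gauge behind the "Active Users" panel; service.name normally comes from
    # the OTLP resource rather than per-datapoint attributes.
    return [metrics.Observation(active_user_count())]

meter.create_observable_gauge(
    "active_users",
    callbacks=[_observe_active_users],
    description="Number of currently active users",
)

# Somewhere in the session-creation path:
session_counter.add(1, {"serviceName": "gateway"})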

View File

@@ -1,160 +1,61 @@
"""Main FastAPI application for AI Insights Service."""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.core.database import init_db, close_db
from app.api import insights
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("ai-insights")
# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Initialize logger
logger = structlog.get_logger()
# Setup OpenTelemetry logging export if enabled
logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}")
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
logger.info("Attempting to setup OpenTelemetry logging")
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION)
if result:
logger.info("OpenTelemetry logs export enabled for ai-insights")
else:
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
class AIInsightsService(StandardFastAPIService):
"""AI Insights Service with standardized monitoring setup"""
async def on_startup(self, app):
"""Custom startup logic for AI Insights"""
# Initialize database
await init_db()
logger.info("Database initialized")
await super().on_startup(app)
async def on_shutdown(self, app):
"""Custom shutdown logic for AI Insights"""
await super().on_shutdown(app)
# Close database
await close_db()
logger.info("Database connections closed")
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Lifespan event handler for startup and shutdown."""
# Startup
logger.info("Starting AI Insights Service", service=settings.SERVICE_NAME, version=settings.SERVICE_VERSION)
await init_db()
logger.info("Database initialized")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("ai-insights")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
yield
# Shutdown
logger.info("Shutting down AI Insights Service")
await close_db()
logger.info("Database connections closed")
# Create FastAPI app
app = FastAPI(
title="AI Insights Service",
# Create service instance
service = AIInsightsService(
service_name="ai-insights",
app_name="AI Insights Service",
description="Intelligent insights and recommendations for bakery operations",
version=settings.SERVICE_VERSION,
lifespan=lifespan
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=getattr(settings, 'ALLOWED_ORIGINS', ["*"]),
api_prefix=settings.API_V1_PREFIX,
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app()
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=settings.ALLOWED_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(
# Add service-specific routers
service.add_router(
insights.router,
prefix=settings.API_V1_PREFIX,
tags=["insights"]
)
@app.get("/")
async def root():
"""Root endpoint."""
return {
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION,
"status": "running"
}
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn

View File

@@ -4,90 +4,28 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog
import os
from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "alert-processor"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("alert-processor")
# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("alert-processor", settings.VERSION)
if result:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export enabled for alert-processor")
else:
logger = structlog.get_logger()
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger = structlog.get_logger()
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
# Initialize logger
logger = structlog.get_logger()
# Global consumer instance
consumer: EventConsumer = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
Application lifecycle manager.
class AlertProcessorService(StandardFastAPIService):
"""Alert Processor Service with standardized monitoring setup and RabbitMQ consumer"""
Startup: Initialize Redis and RabbitMQ consumer
Shutdown: Close consumer and Redis connections
"""
global consumer
async def on_startup(self, app):
"""Custom startup logic for Alert Processor"""
global consumer
logger.info("alert_processor_starting", version=settings.VERSION)
# Startup: Initialize Redis and start consumer
try:
# Initialize Redis connection
await initialize_redis(
settings.REDIS_URL,
@@ -96,69 +34,48 @@ async def lifespan(app: FastAPI):
)
logger.info("redis_initialized")
# Start RabbitMQ consumer
consumer = EventConsumer()
await consumer.start()
logger.info("alert_processor_started")
logger.info("rabbitmq_consumer_started")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("alert-processor")
logger.info("System metrics collection started")
await super().on_startup(app)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
except Exception as e:
logger.error("alert_processor_startup_failed", error=str(e))
raise
async def on_shutdown(self, app):
"""Custom shutdown logic for Alert Processor"""
global consumer
yield
await super().on_shutdown(app)
# Shutdown: Stop consumer and close Redis
try:
# Stop RabbitMQ consumer
if consumer:
await consumer.stop()
logger.info("rabbitmq_consumer_stopped")
# Close Redis
await close_redis()
logger.info("alert_processor_shutdown")
except Exception as e:
logger.error("alert_processor_shutdown_failed", error=str(e))
logger.info("redis_closed")
# Create FastAPI app
app = FastAPI(
title="Alert Processor Service",
# Create service instance
service = AlertProcessorService(
service_name="alert-processor",
app_name="Alert Processor Service",
description="Event processing, enrichment, and alert management system",
version=settings.VERSION,
lifespan=lifespan,
debug=settings.DEBUG
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=["*"], # Configure appropriately for production
api_prefix="/api/v1",
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("alert-processor")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
# Add service-specific routers
app.include_router(
alerts.router,
prefix="/api/v1/tenants/{tenant_id}",
@@ -172,34 +89,6 @@ app.include_router(
)
@app.get("/health")
async def health_check():
"""
Health check endpoint.
Returns service status and version.
"""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.VERSION
}
@app.get("/")
async def root():
"""Root endpoint with service info"""
return {
"service": settings.SERVICE_NAME,
"version": settings.VERSION,
"description": "Event processing, enrichment, and alert management system"
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn

View File

@@ -3,192 +3,74 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import structlog
from contextlib import asynccontextmanager
import os
from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.monitoring.logging import setup_logging
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
from shared.service_base import StandardFastAPIService
# OpenTelemetry imports
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.resources import Resource
# Initialize logger
logger = structlog.get_logger()
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "demo-session"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("demo-session")
# Setup logging
setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("demo-session", settings.VERSION)
if result:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export enabled for demo-session")
else:
logger = structlog.get_logger()
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger = structlog.get_logger()
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger = structlog.get_logger()
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
# Initialize database
# Initialize database manager
db_manager = DatabaseManager()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan handler"""
logger.info("Starting Demo Session Service", version=settings.VERSION)
class DemoSessionService(StandardFastAPIService):
"""Demo Session Service with standardized monitoring setup"""
# Initialize database
db_manager.initialize()
async def on_startup(self, app):
"""Custom startup logic for Demo Session"""
# Initialize database
db_manager.initialize()
logger.info("Database initialized")
# Initialize Redis using shared implementation
await initialize_redis(
redis_url=settings.REDIS_URL,
db=0,
max_connections=50
)
# Initialize Redis
await initialize_redis(
redis_url=settings.REDIS_URL,
db=0,
max_connections=50
)
logger.info("Redis initialized")
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("demo-session")
logger.info("System metrics collection started")
await super().on_startup(app)
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
logger.info("Metrics export configured via OpenTelemetry OTLP")
async def on_shutdown(self, app):
"""Custom shutdown logic for Demo Session"""
await super().on_shutdown(app)
logger.info("Demo Session Service started successfully")
yield
# Cleanup on shutdown
await db_manager.close()
await close_redis()
logger.info("Demo Session Service stopped")
# Cleanup
await db_manager.close()
await close_redis()
logger.info("Database and Redis connections closed")
app = FastAPI(
title="Demo Session Service",
# Create service instance
service = DemoSessionService(
service_name="demo-session",
app_name="Demo Session Service",
description="Manages isolated demo sessions for prospect users",
version=settings.VERSION,
lifespan=lifespan
log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=["*"], # Configure appropriately for production
api_prefix="/api/v1",
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
)
# Instrument FastAPI with OpenTelemetry
FastAPIInstrumentor.instrument_app(app)
# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)
# Instrument httpx for outgoing requests
HTTPXClientInstrumentor().instrument()
# Instrument Redis
RedisInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("demo-session")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
"""Global exception handler"""
logger.error(
"Unhandled exception",
path=request.url.path,
method=request.method,
error=str(exc)
)
return JSONResponse(
status_code=500,
content={"detail": "Internal server error"}
)
# Include routers
# Add service-specific routers
app.include_router(demo_sessions.router)
app.include_router(demo_accounts.router)
app.include_router(demo_operations.router)
app.include_router(internal.router)
@app.get("/")
async def root():
"""Root endpoint"""
return {
"service": "demo-session",
"version": settings.VERSION,
"status": "running"
}
@app.get("/health")
async def health():
"""Health check endpoint"""
from shared.redis_utils import get_redis_manager
redis_manager = await get_redis_manager()
redis_ok = await redis_manager.health_check()
return {
"status": "healthy" if redis_ok else "degraded",
"service": "demo-session",
"version": settings.VERSION,
"redis": "connected" if redis_ok else "disconnected"
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__":
import uvicorn
uvicorn.run(

View File

@@ -1,14 +1,34 @@
"""
Shared monitoring package for microservices
Provides unified OpenTelemetry-based observability:
- Traces: Distributed tracing
- Metrics: System and application metrics
- Logs: Structured logging
All signals exported to SigNoz via OTLP.
"""
# Core setup - START HERE
from .logging import setup_logging
from .metrics import setup_metrics_early, get_metrics_collector, MetricsCollector
from .health_checks import (
HealthCheckManager,
FastAPIHealthChecker,
create_health_manager,
setup_fastapi_health_checks
from .telemetry import (
setup_telemetry,
setup_telemetry_simple,
get_telemetry_status,
TelemetryProviders
)
# Configuration
from .otel_config import OTelConfig, OTelEndpoints
# Individual signal setup (used by telemetry.py)
from .tracing import (
setup_tracing,
get_current_trace_id,
get_current_span_id,
add_trace_attributes,
add_trace_event,
record_exception
)
from .logs_exporter import (
setup_otel_logging,
@@ -27,23 +47,51 @@ from .system_metrics import (
setup_all_metrics
)
# Health checks
from .health_checks import (
HealthCheckManager,
FastAPIHealthChecker,
create_health_manager,
setup_fastapi_health_checks
)
__all__ = [
# CORE - Start with these
'setup_logging',
'setup_metrics_early',
'get_metrics_collector',
'MetricsCollector',
'HealthCheckManager',
'FastAPIHealthChecker',
'create_health_manager',
'setup_fastapi_health_checks',
'setup_telemetry',
'setup_telemetry_simple',
'get_telemetry_status',
'TelemetryProviders',
# Configuration
'OTelConfig',
'OTelEndpoints',
# Tracing
'setup_tracing',
'get_current_trace_id',
'get_current_span_id',
'add_trace_attributes',
'add_trace_event',
'record_exception',
# Logs
'setup_otel_logging',
'add_log_context',
'get_current_trace_context',
'StructlogOTELProcessor',
# Metrics
'setup_otel_metrics',
'OTelMetricsCollector',
'create_dual_metrics_collector',
'SystemMetricsCollector',
'ApplicationMetricsCollector',
'setup_all_metrics'
'setup_all_metrics',
# Health checks
'HealthCheckManager',
'FastAPIHealthChecker',
'create_health_manager',
'setup_fastapi_health_checks',
]
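With the re-exports above, a service entry point only needs the core helpers. A minimal wiring sketch follows (the service name is illustrative; the keyword arguments are documented in the setup_telemetry docstring later in this commit):

from fastapi import FastAPI

from shared.monitoring import setup_logging, setup_telemetry

app = FastAPI(title="Example Service")

# Structured logging first, so the telemetry setup itself logs consistently.
setup_logging("example-service", "INFO")

# One call wires traces (gRPC), metrics (gRPC + system collectors) and logs (HTTP).
providers = setup_telemetry(
    app,
    service_name="example-service",
    service_version="1.0.0",
)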

View File

@@ -1,6 +1,6 @@
"""
OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via OpenTelemetry Collector
Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
"""
import os
@@ -10,14 +10,21 @@ from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource
# Try to import HTTP log exporter (logs always use HTTP)
try:
from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
try:
from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
OTLPLogExporter = None
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
HTTP_LOG_EXPORTER_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -31,13 +38,14 @@ def setup_otel_logging(
"""
Setup OpenTelemetry logging to export logs to SigNoz.
This integrates with Python's standard logging to automatically
export all log records to SigNoz via the OTLP protocol.
Uses HTTP protocol (port 4318) for sending logs to SigNoz.
Integrates with Python's standard logging to automatically export
all log records to SigNoz via the OTLP HTTP protocol.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
enable_console: Whether to also log to console (default: True)
Returns:
@@ -47,7 +55,7 @@ def setup_otel_logging(
from shared.monitoring.logs_exporter import setup_otel_logging
# Setup during service initialization
setup_otel_logging("auth-service", "1.0.0")
handler = setup_otel_logging("auth-service", "1.0.0")
# Now all standard logging calls will be exported to SigNoz
import logging
@@ -56,7 +64,7 @@ def setup_otel_logging(
"""
# Check if logging export is enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
if not OTelConfig.is_enabled("logs"):
logger.info(
"OpenTelemetry logs export disabled",
service=service_name,
@@ -64,59 +72,36 @@ def setup_otel_logging(
)
return None
# Get OTLP endpoint from environment or parameter
# For logs, we need to use the HTTP endpoint (port 4318), not the gRPC endpoint (port 4317)
if otel_endpoint is None:
# Try logs-specific endpoint first, then fall back to general OTLP endpoint
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_LOGS_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
# Check if HTTP log exporter is available
if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
logger.warning(
"OpenTelemetry HTTP log exporter not available",
service=service_name,
reason="opentelemetry-exporter-otlp-proto-http package not installed"
)
logger.info(f"Original OTLP endpoint for logs: {otel_endpoint}")
# If we got the tracing endpoint (4317), switch to logs endpoint (4318)
if otel_endpoint.endswith(":4317"):
logger.info("Converting tracing endpoint (4317) to logs endpoint (4318)")
otel_endpoint = otel_endpoint.replace(":4317", ":4318")
logger.info(f"Final OTLP endpoint for logs: {otel_endpoint}")
# Ensure endpoint has proper protocol prefix
if not otel_endpoint.startswith(("http://", "https://")):
# Default to HTTP for insecure connections
otel_endpoint = f"http://{otel_endpoint}"
# Ensure endpoint has /v1/logs path for HTTP
if not otel_endpoint.endswith("/v1/logs"):
otel_endpoint = f"{otel_endpoint}/v1/logs"
return None
try:
# Check if OTLPLogExporter is available
if OTLPLogExporter is None:
logger.warning(
"OpenTelemetry HTTP OTLP exporter not available",
service=service_name,
reason="opentelemetry-exporter-otlp-proto-http package not installed"
)
return None
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Use provided endpoint or get from config
if otel_endpoint:
http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
else:
http_endpoint = endpoints.logs_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure logger provider
logger_provider = LoggerProvider(resource=resource)
set_logger_provider(logger_provider)
# Configure OTLP exporter for logs
# Configure OTLP HTTP exporter for logs
otlp_exporter = OTLPLogExporter(
endpoint=otel_endpoint,
endpoint=http_endpoint,
timeout=10
)
@@ -135,9 +120,10 @@ def setup_otel_logging(
root_logger.addHandler(otel_handler)
logger.info(
"OpenTelemetry logs export configured",
"OpenTelemetry logs export configured successfully",
service=service_name,
otel_endpoint=otel_endpoint,
http_endpoint=http_endpoint,
protocol="http",
console_logging=enable_console
)
@@ -147,8 +133,7 @@ def setup_otel_logging(
logger.error(
"Failed to setup OpenTelemetry logs export",
service=service_name,
error=str(e),
reason="Will continue with standard logging only"
error=str(e)
)
return None
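Because the exporter attaches a LoggingHandler to the root logger, anything routed through standard logging is shipped once OTEL_LOGS_EXPORTER=otlp is set and the HTTP exporter package is installed. A small sketch under those assumptions (service name and log message are illustrative):

import logging
import os

from shared.monitoring.logs_exporter import setup_otel_logging

# The handler is only installed when the environment opts in.
os.environ.setdefault("OTEL_LOGS_EXPORTER", "otlp")

handler = setup_otel_logging("example-service", "1.0.0")

# Standard logging records now reach SigNoz alongside console output.
logging.getLogger(__name__).info("order processed", extra={"tenant_id": "demo"})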

View File

@@ -1,6 +1,6 @@
"""
OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus
Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
"""
import os
@@ -9,8 +9,24 @@ from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.sdk.resources import Resource
# Import both gRPC and HTTP exporters
try:
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
GRPC_AVAILABLE = True
except ImportError:
GRPC_AVAILABLE = False
GrpcMetricExporter = None
try:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
HTTP_AVAILABLE = True
except ImportError:
HTTP_AVAILABLE = False
HttpMetricExporter = None
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -19,20 +35,21 @@ def setup_otel_metrics(
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: Optional[str] = None,
export_interval_millis: int = 60000 # Export every 60 seconds
export_interval_millis: int = 60000, # Export every 60 seconds
protocol: Optional[str] = None # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
"""
Setup OpenTelemetry metrics to export to SigNoz.
This creates a dual-export strategy:
- Prometheus exposition format at /metrics (for Prometheus scraping)
- OTLP push to SigNoz collector (for direct ingestion)
Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
Default protocol is gRPC for better performance.
Args:
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env)
export_interval_millis: How often to push metrics (default 60s)
otel_endpoint: Optional override for OTLP endpoint
export_interval_millis: How often to push metrics in milliseconds (default 60s)
protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"
Returns:
MeterProvider instance if successful, None otherwise
@@ -40,9 +57,12 @@ def setup_otel_metrics(
Example:
from shared.monitoring.metrics_exporter import setup_otel_metrics
# Setup during service initialization
# Setup with gRPC (default)
meter_provider = setup_otel_metrics("auth-service", "1.0.0")
# Or with HTTP
meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")
# Create meters for your metrics
meter = meter_provider.get_meter(__name__)
request_counter = meter.create_counter(
@@ -56,8 +76,7 @@ def setup_otel_metrics(
"""
# Check if metrics export is enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if not enable_otel_metrics:
if not OTelConfig.is_enabled("metrics"):
logger.info(
"OpenTelemetry metrics export disabled",
service=service_name,
@@ -65,32 +84,66 @@ def setup_otel_metrics(
)
return None
# Get OTLP endpoint from environment or parameter
if otel_endpoint is None:
otel_endpoint = os.getenv(
"OTEL_EXPORTER_OTLP_ENDPOINT",
os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
)
# Determine protocol to use
if protocol is None:
protocol = OTelConfig.get_protocol("metrics")
# Ensure endpoint has /v1/metrics path for HTTP
if not otel_endpoint.endswith("/v1/metrics"):
otel_endpoint = f"{otel_endpoint}/v1/metrics"
# Validate protocol is available
if protocol == "grpc" and not GRPC_AVAILABLE:
logger.warning(
"gRPC exporter not available, falling back to HTTP",
service=service_name
)
protocol = "http"
elif protocol == "http" and not HTTP_AVAILABLE:
logger.warning(
"HTTP exporter not available, falling back to gRPC",
service=service_name
)
protocol = "grpc"
if protocol not in ["grpc", "http"]:
logger.error(
"Invalid protocol specified",
service=service_name,
protocol=protocol
)
return None
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
})
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Configure OTLP exporter for metrics
otlp_exporter = OTLPMetricExporter(
endpoint=otel_endpoint,
timeout=10
)
# Determine which endpoint to use
if otel_endpoint:
# User provided override
if protocol == "grpc":
endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
else:
# Use config-determined endpoint
if protocol == "grpc":
endpoint = endpoints.metrics_grpc
else:
endpoint = endpoints.metrics_http
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure OTLP exporter based on protocol
if protocol == "grpc":
otlp_exporter = GrpcMetricExporter(
endpoint=endpoint,
insecure=True,  # Set insecure=False in production with proper TLS
timeout=10
)
else: # http
otlp_exporter = HttpMetricExporter(
endpoint=endpoint,
timeout=10
)
# Create periodic metric reader
metric_reader = PeriodicExportingMetricReader(
@@ -108,9 +161,10 @@ def setup_otel_metrics(
metrics.set_meter_provider(meter_provider)
logger.info(
"OpenTelemetry metrics export configured",
"OpenTelemetry metrics export configured successfully",
service=service_name,
otel_endpoint=otel_endpoint,
endpoint=endpoint,
protocol=protocol,
export_interval_seconds=export_interval_millis / 1000
)
@@ -121,7 +175,7 @@ def setup_otel_metrics(
"Failed to setup OpenTelemetry metrics export",
service=service_name,
error=str(e),
reason="Will continue with Prometheus-only metrics"
protocol=protocol
)
return None
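The protocol argument can also be left unset and driven purely by environment, since the default goes through OTelConfig.get_protocol (see otel_config below), which reads OTEL_EXPORTER_OTLP_METRICS_PROTOCOL. A hedged sketch with an illustrative histogram:

import os

from shared.monitoring.metrics_exporter import setup_otel_metrics

# Force HTTP (port 4318) for this service; the default is gRPC (port 4317).
os.environ["OTEL_EXPORTER_OTLP_METRICS_PROTOCOL"] = "http"

meter_provider = setup_otel_metrics("example-service", "1.0.0")
if meter_provider is not None:
    meter = meter_provider.get_meter(__name__)
    request_histogram = meter.create_histogram(
        "http_request_duration_seconds",
        unit="s",
        description="HTTP request latency",
    )
    request_histogram.record(0.125, {"route": "/api/v1/items"})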

View File

@@ -0,0 +1,286 @@
"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""
import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog
logger = structlog.get_logger()
@dataclass
class OTelEndpoints:
"""
Container for OpenTelemetry endpoints.
SigNoz uses different protocols for different signals:
- Traces: gRPC (port 4317)
- Metrics: gRPC (port 4317) or HTTP (port 4318)
- Logs: HTTP (port 4318)
"""
traces_grpc: str # gRPC endpoint for traces (e.g., "host:4317")
metrics_grpc: str # gRPC endpoint for metrics (e.g., "host:4317")
metrics_http: str # HTTP endpoint for metrics (e.g., "http://host:4318/v1/metrics")
logs_http: str # HTTP endpoint for logs (e.g., "http://host:4318/v1/logs")
class OTelConfig:
"""
Centralized configuration for OpenTelemetry exporters.
This class manages endpoint URLs and ensures proper protocol usage:
- gRPC endpoints: host:port (no protocol prefix)
- HTTP endpoints: http://host:port/path (with protocol and path)
"""
# Default base endpoint (can be overridden by environment variables)
DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
DEFAULT_GRPC_PORT = 4317
DEFAULT_HTTP_PORT = 4318
@classmethod
def get_endpoints(cls) -> OTelEndpoints:
"""
Get OpenTelemetry endpoints from environment variables with proper fallbacks.
Environment variables (in order of precedence):
1. OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - Specific traces endpoint
2. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - Specific metrics endpoint
3. OTEL_EXPORTER_OTLP_LOGS_ENDPOINT - Specific logs endpoint
4. OTEL_EXPORTER_OTLP_ENDPOINT - Base endpoint used as fallback (gRPC format: host:port)
5. OTEL_COLLECTOR_ENDPOINT - Legacy variable (HTTP format)
Returns:
OTelEndpoints with all configured endpoints
"""
# Get base endpoint from environment
base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
if base_endpoint:
# Clean and parse base endpoint
base_grpc = cls._clean_grpc_endpoint(base_endpoint)
base_http_host = cls._extract_host(base_endpoint)
else:
# Use default collector
base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"
# Get signal-specific endpoints (or use base endpoint)
traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")
# Build final endpoints
traces_grpc = cls._clean_grpc_endpoint(traces_endpoint)
metrics_grpc = cls._clean_grpc_endpoint(metrics_endpoint)
# For metrics HTTP, convert gRPC endpoint to HTTP if needed
metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")
# For logs, use HTTP endpoint
if logs_endpoint:
logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
else:
logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")
endpoints = OTelEndpoints(
traces_grpc=traces_grpc,
metrics_grpc=metrics_grpc,
metrics_http=metrics_http,
logs_http=logs_http
)
logger.info(
"OpenTelemetry endpoints configured",
traces_grpc=endpoints.traces_grpc,
metrics_grpc=endpoints.metrics_grpc,
metrics_http=endpoints.metrics_http,
logs_http=endpoints.logs_http
)
return endpoints
@staticmethod
def _clean_grpc_endpoint(endpoint: str) -> str:
"""
Clean endpoint for gRPC usage (remove protocol, paths).
Args:
endpoint: Raw endpoint string
Returns:
Cleaned endpoint in format "host:port"
"""
# Remove protocol prefixes
endpoint = endpoint.replace("http://", "").replace("https://", "")
# Remove paths (gRPC doesn't use paths)
if "/" in endpoint:
endpoint = endpoint.split("/")[0]
# Ensure it has a port
if ":" not in endpoint:
endpoint = f"{endpoint}:4317"
return endpoint
@staticmethod
def _extract_host(endpoint: str) -> str:
"""
Extract host and convert to HTTP endpoint.
Args:
endpoint: Raw endpoint string
Returns:
HTTP endpoint without path (e.g., "http://host:4318")
"""
# Remove protocol if present
clean = endpoint.replace("http://", "").replace("https://", "")
# Remove path if present
if "/" in clean:
clean = clean.split("/")[0]
# Extract host without port
if ":" in clean:
host = clean.split(":")[0]
else:
host = clean
return f"http://{host}:4318"
@staticmethod
def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
"""
Convert gRPC endpoint to HTTP endpoint with path.
Args:
grpc_endpoint: gRPC endpoint (e.g., "host:4317")
path: HTTP path (e.g., "/v1/metrics")
Returns:
HTTP endpoint (e.g., "http://host:4318/v1/metrics")
"""
# Extract host from gRPC endpoint
if ":" in grpc_endpoint:
host = grpc_endpoint.split(":")[0]
else:
host = grpc_endpoint
# Build HTTP endpoint with port 4318
return f"http://{host}:4318{path}"
@staticmethod
def _ensure_http_endpoint(endpoint: str, path: str) -> str:
"""
Ensure endpoint is in HTTP format with proper path.
Args:
endpoint: Raw endpoint string
path: Required path (e.g., "/v1/logs")
Returns:
HTTP endpoint with protocol and path
"""
# Add protocol if missing
if not endpoint.startswith(("http://", "https://")):
endpoint = f"http://{endpoint}"
# Ensure it has the correct port for HTTP (check the host part, not the scheme colon)
host_part = endpoint.split("://", 1)[-1]
if ":4317" in endpoint:
endpoint = endpoint.replace(":4317", ":4318")
elif ":" not in host_part:
# No port at all, add it
endpoint = f"{endpoint}:4318"
elif ":4318" not in endpoint:
# Has a port but not the right one; drop the old port (and any path) and use 4318
base = endpoint.rsplit(":", 1)[0]
endpoint = f"{base}:4318"
# Ensure path is present
if not endpoint.endswith(path):
# Remove any existing path first
if "/" in endpoint.split("://")[1]:
base = endpoint.split("://")[0] + "://" + endpoint.split("://")[1].split("/")[0]
endpoint = base
endpoint = f"{endpoint}{path}"
return endpoint
@classmethod
def get_resource_attributes(
cls,
service_name: str,
service_version: str = "1.0.0"
) -> dict:
"""
Get common resource attributes for all OTEL signals.
Args:
service_name: Name of the service
service_version: Version of the service
Returns:
Dictionary of resource attributes
"""
return {
"service.name": service_name,
"service.version": service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
"k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
}
@classmethod
def is_enabled(cls, signal: str) -> bool:
"""
Check if a specific telemetry signal is enabled.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
True if signal is enabled, False otherwise
"""
signal = signal.lower()
if signal == "traces":
return os.getenv("ENABLE_TRACING", "true").lower() == "true"
elif signal == "metrics":
return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
elif signal == "logs":
return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
else:
return False
@classmethod
def get_protocol(cls, signal: str) -> str:
"""
Get the preferred protocol for a signal.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
Protocol name ("grpc" or "http")
"""
protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")
# Signal-specific overrides
if signal == "traces":
return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
elif signal == "metrics":
return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
elif signal == "logs":
# Logs always use HTTP in our setup
return "http"
return protocol
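For a single base endpoint the resolution works out as below; the host is illustrative and the expected values follow the helpers above:

import os

from shared.monitoring.otel_config import OTelConfig

os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://signoz-otel-collector.bakery-ia:4317"

endpoints = OTelConfig.get_endpoints()
# endpoints.traces_grpc  -> "signoz-otel-collector.bakery-ia:4317"
# endpoints.metrics_grpc -> "signoz-otel-collector.bakery-ia:4317"
# endpoints.metrics_http -> "http://signoz-otel-collector.bakery-ia:4318/v1/metrics"
# endpoints.logs_http    -> "http://signoz-otel-collector.bakery-ia:4318/v1/logs"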

View File

@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup
Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation
All signals are exported to SigNoz via OTLP.
"""
import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass
from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
logger = structlog.get_logger()
@dataclass
class TelemetryProviders:
"""
Container for all OpenTelemetry providers and collectors.
Attributes:
tracer_provider: Provider for distributed tracing
meter_provider: Provider for metrics export
logging_handler: Handler for structured logs
system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
app_metrics: Collector for application-level metrics (HTTP, DB)
"""
tracer_provider: Optional[Any] = None
meter_provider: Optional[Any] = None
logging_handler: Optional[Any] = None
system_metrics: Optional[SystemMetricsCollector] = None
app_metrics: Optional[ApplicationMetricsCollector] = None
def setup_telemetry(
app,
service_name: str,
service_version: str = "1.0.0",
enable_traces: bool = True,
enable_metrics: bool = True,
enable_logs: bool = True,
enable_system_metrics: bool = True,
metrics_protocol: Optional[str] = None, # "grpc" or "http", defaults to grpc
export_interval_millis: int = 60000
) -> TelemetryProviders:
"""
Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.
This is the UNIFIED setup function that configures everything:
- Distributed tracing (gRPC, port 4317)
- Metrics export (gRPC by default, port 4317)
- System metrics collection (CPU, memory, disk, network)
- Application metrics (HTTP requests, DB queries)
- Structured logs export (HTTP, port 4318)
All signals use the centralized OTelConfig for endpoint management.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
enable_traces: Enable distributed tracing (default: True)
enable_metrics: Enable metrics export to OTLP (default: True)
enable_logs: Enable logs export to OTLP (default: True)
enable_system_metrics: Enable system metrics collection (default: True, can be disabled via ENABLE_SYSTEM_METRICS env)
metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
export_interval_millis: How often to export metrics in milliseconds
Returns:
TelemetryProviders containing all initialized providers and collectors
Example:
from shared.monitoring.telemetry import setup_telemetry
app = FastAPI(title="Auth Service")
providers = setup_telemetry(
app,
service_name="auth-service",
service_version="1.0.0"
)
# All telemetry is now configured:
# - Traces automatically captured for HTTP requests
# - System metrics automatically collected
# - Application metrics via providers.app_metrics
# - Logs automatically correlated with traces
"""
logger.info(
"Setting up unified OpenTelemetry telemetry",
service=service_name,
version=service_version,
traces=enable_traces,
metrics=enable_metrics,
logs=enable_logs,
system_metrics=enable_system_metrics
)
providers = TelemetryProviders()
# Setup distributed tracing
if enable_traces and OTelConfig.is_enabled("traces"):
try:
providers.tracer_provider = setup_tracing(
app,
service_name=service_name,
service_version=service_version
)
if providers.tracer_provider:
logger.info("✓ Distributed tracing configured", service=service_name)
else:
logger.warning("✗ Distributed tracing setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))
# Setup OTLP metrics export
if enable_metrics and OTelConfig.is_enabled("metrics"):
try:
providers.meter_provider = setup_otel_metrics(
service_name=service_name,
service_version=service_version,
protocol=metrics_protocol,
export_interval_millis=export_interval_millis
)
if providers.meter_provider:
logger.info("✓ OTLP metrics export configured", service=service_name)
# Setup system and application metrics collectors
if enable_system_metrics:
enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_env:
try:
providers.system_metrics, providers.app_metrics = setup_all_metrics(
service_name=service_name,
service_version=service_version,
meter_provider=providers.meter_provider
)
logger.info(
"✓ System and application metrics collectors initialized",
service=service_name,
system_metrics=["cpu", "memory", "disk", "network"],
app_metrics=["http_requests", "db_queries"]
)
except Exception as e:
logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
else:
logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))
# Setup logs export
if enable_logs and OTelConfig.is_enabled("logs"):
try:
providers.logging_handler = setup_otel_logging(
service_name=service_name,
service_version=service_version
)
if providers.logging_handler:
logger.info("✓ Structured logs export configured", service=service_name)
else:
logger.warning("✗ Logs export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))
# Log endpoint configuration summary
try:
endpoints = OTelConfig.get_endpoints()
summary = {
"service": service_name,
"version": service_version,
"traces": {
"enabled": bool(providers.tracer_provider),
"endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
},
"metrics": {
"enabled": bool(providers.meter_provider),
"endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
"system_metrics": bool(providers.system_metrics),
"app_metrics": bool(providers.app_metrics)
},
"logs": {
"enabled": bool(providers.logging_handler),
"endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
}
}
logger.info("🎉 Telemetry setup complete", **summary)
except Exception as e:
logger.warning("Could not log endpoint summary", error=str(e))
return providers
def setup_telemetry_simple(
app,
service_name: str,
service_version: str = "1.0.0"
) -> TelemetryProviders:
"""
Simplified telemetry setup with all defaults.
Uses:
- gRPC for traces (port 4317)
- gRPC for metrics (port 4317)
- HTTP for logs (port 4318)
All settings are read from environment variables and OTelConfig.
Args:
app: FastAPI application instance
service_name: Name of the service
service_version: Version of the service
Returns:
TelemetryProviders containing all initialized providers
Example:
from shared.monitoring.telemetry import setup_telemetry_simple
app = FastAPI(title="Auth Service")
providers = setup_telemetry_simple(app, "auth-service")
"""
return setup_telemetry(
app=app,
service_name=service_name,
service_version=service_version
)
def get_telemetry_status() -> Dict[str, Any]:
"""
Get current telemetry configuration status.
Returns:
Dictionary with telemetry status information
Example:
from shared.monitoring.telemetry import get_telemetry_status
status = get_telemetry_status()
print(f"Tracing enabled: {status['traces']['enabled']}")
"""
endpoints = OTelConfig.get_endpoints()
return {
"traces": {
"enabled": OTelConfig.is_enabled("traces"),
"protocol": "grpc",
"endpoint": endpoints.traces_grpc
},
"metrics": {
"enabled": OTelConfig.is_enabled("metrics"),
"protocol": OTelConfig.get_protocol("metrics"),
"grpc_endpoint": endpoints.metrics_grpc,
"http_endpoint": endpoints.metrics_http
},
"logs": {
"enabled": OTelConfig.is_enabled("logs"),
"protocol": "http",
"endpoint": endpoints.logs_http
}
}
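One way the status helper could be surfaced is a debug route, so operators can compare the intended configuration with what actually initialized; the route path is illustrative:

from fastapi import FastAPI

from shared.monitoring.telemetry import get_telemetry_status, setup_telemetry_simple

app = FastAPI(title="Example Service")
providers = setup_telemetry_simple(app, "example-service")

@app.get("/debug/telemetry")
async def telemetry_status():
    status = get_telemetry_status()
    # Complement the static config with what actually came up.
    status["initialized"] = {
        "traces": providers.tracer_provider is not None,
        "metrics": providers.meter_provider is not None,
        "logs": providers.logging_handler is not None,
    }
    return status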

View File

@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""
import os
import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
# Optional instrumentations (may not be installed in all services)
try:
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
try:
from opentelemetry.instrumentation.redis import RedisInstrumentor
REDIS_AVAILABLE = True
except ImportError:
REDIS_AVAILABLE = False
try:
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
SQLALCHEMY_AVAILABLE = True
except ImportError:
SQLALCHEMY_AVAILABLE = False
from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -22,8 +43,8 @@ def setup_tracing(
app,
service_name: str,
service_version: str = "1.0.0",
otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
):
otel_endpoint: Optional[str] = None
) -> Optional[TracerProvider]:
"""
Setup OpenTelemetry distributed tracing for a FastAPI service.
@@ -33,35 +54,56 @@ def setup_tracing(
- Redis operations
- PostgreSQL/SQLAlchemy queries
Uses gRPC protocol (port 4317) for sending traces to SigNoz.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
Returns:
TracerProvider instance if successful, None otherwise
Example:
from shared.monitoring.tracing import setup_tracing
app = FastAPI(title="Auth Service")
setup_tracing(app, "auth-service")
tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
"""
# Check if tracing is enabled
if not OTelConfig.is_enabled("traces"):
logger.info(
"Distributed tracing disabled",
service=service_name,
reason="ENABLE_TRACING not set to 'true'"
)
return None
try:
# Create resource with service information
resource = Resource(attributes={
SERVICE_NAME: service_name,
SERVICE_VERSION: service_version,
"deployment.environment": "production"
})
# Get endpoints from centralized config
endpoints = OTelConfig.get_endpoints()
# Use provided endpoint or get from config
if otel_endpoint:
# Clean user-provided endpoint for gRPC
grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
else:
grpc_endpoint = endpoints.traces_grpc
# Get resource attributes
resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
resource = Resource(attributes=resource_attrs)
# Configure tracer provider
tracer_provider = TracerProvider(resource=resource)
trace.set_tracer_provider(tracer_provider)
# Configure OTLP exporter to send to SigNoz
# Configure OTLP gRPC exporter for traces
otlp_exporter = OTLPSpanExporter(
endpoint=otel_endpoint,
insecure=True # Use TLS in production
endpoint=grpc_endpoint,
insecure=True  # Set insecure=False in production with proper TLS
)
# Add span processor with batching for performance
@@ -75,40 +117,46 @@ def setup_tracing(
excluded_urls="health,metrics" # Don't trace health/metrics endpoints
)
# Auto-instrument HTTPX (inter-service communication)
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
# Auto-instrument HTTPX (inter-service communication) if available
if HTTPX_AVAILABLE:
try:
HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("HTTPX instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument HTTPX: {e}")
# Auto-instrument Redis
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument Redis if available
if REDIS_AVAILABLE:
try:
RedisInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("Redis instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument Redis: {e}")
# Auto-instrument PostgreSQL (psycopg2) - skip if not available
# Most services use asyncpg instead of psycopg2
# try:
# Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
# except Exception as e:
# logger.warning(f"Failed to instrument Psycopg2: {e}")
# Auto-instrument SQLAlchemy
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
# Auto-instrument SQLAlchemy if available
if SQLALCHEMY_AVAILABLE:
try:
SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
logger.debug("SQLAlchemy instrumentation enabled")
except Exception as e:
logger.warning(f"Failed to instrument SQLAlchemy: {e}")
logger.info(
"Distributed tracing configured",
"Distributed tracing configured successfully",
service=service_name,
otel_endpoint=otel_endpoint
grpc_endpoint=grpc_endpoint,
protocol="grpc"
)
return tracer_provider
except Exception as e:
logger.error(
"Failed to setup tracing - continuing without it",
service=service_name,
error=str(e)
)
return None
def get_current_trace_id() -> Optional[str]:
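The body of get_current_trace_id is truncated by this view; only its Optional[str] signature is used below. A hedged usage sketch of the module's two helpers (service name and route are illustrative):
from fastapi import FastAPI
import structlog
from shared.monitoring.tracing import setup_tracing, get_current_trace_id
app = FastAPI(title="Example Service")
tracer_provider = setup_tracing(app, "example-service", "1.0.0")  # None when tracing is disabled
logger = structlog.get_logger()
@app.get("/ping")
async def ping():
    # Attach the active trace id (if any) to the structured log for correlation.
    logger.info("ping received", trace_id=get_current_trace_id())
    return {"status": "ok"}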

View File

@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
from shared.monitoring.metrics import setup_metrics_early
from shared.monitoring import (
setup_logging,
setup_telemetry
)
from shared.monitoring.health_checks import setup_fastapi_health_checks
from shared.monitoring.tracing import setup_tracing
from shared.database.base import DatabaseManager
if TYPE_CHECKING:
@@ -77,24 +78,13 @@ class BaseFastAPIService:
# Initialize logging
setup_logging(service_name, log_level)
# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
setup_otel_logging(service_name, version)
self.logger = structlog.get_logger()
self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
except Exception as e:
self.logger = structlog.get_logger()
self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
else:
self.logger = structlog.get_logger()
self.logger = structlog.get_logger()
# Will be set during app creation
self.app: Optional[FastAPI] = None
self.metrics_collector = None
self.health_manager = None
self.alert_service = None
self.telemetry_providers = None # Contains all OTEL providers and metrics collectors
def create_app(self, **fastapi_kwargs) -> FastAPI:
"""
@@ -116,49 +106,25 @@ class BaseFastAPIService:
# Create FastAPI app
self.app = FastAPI(**config)
# Setup metrics BEFORE middleware and lifespan
if self.enable_metrics:
self.metrics_collector = setup_metrics_early(self.app, self.service_name)
# Setup OpenTelemetry metrics export if enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
if enable_otel_metrics:
try:
self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
if self.otel_meter_provider:
self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
# Setup system metrics collection (CPU, memory, disk, network)
enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_metrics:
try:
self.system_metrics, self.app_metrics = setup_all_metrics(
self.service_name,
self.version,
self.otel_meter_provider
)
self.logger.info(f"System metrics collection enabled for {self.service_name}")
except Exception as e:
self.logger.warning(f"Failed to setup system metrics: {e}")
except Exception as e:
self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
# Setup distributed tracing
# Check both constructor flag and environment variable
tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
if tracing_enabled:
try:
otel_endpoint = os.getenv(
"OTEL_COLLECTOR_ENDPOINT",
"http://signoz-otel-collector.bakery-ia:4318"
)
setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
self.logger.info(f"Distributed tracing enabled for {self.service_name}")
except Exception as e:
self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
else:
self.logger.info(f"Distributed tracing disabled for {self.service_name}")
# Setup unified OpenTelemetry telemetry
# This single call configures:
# - Distributed tracing (gRPC, port 4317)
# - OTLP metrics export (gRPC, port 4317)
# - System metrics collection (CPU, memory, disk, network)
# - Application metrics (HTTP requests, DB queries)
# - Structured logs export (HTTP, port 4318)
try:
self.telemetry_providers = setup_telemetry(
app=self.app,
service_name=self.service_name,
service_version=self.version,
enable_traces=self.enable_tracing,
enable_metrics=self.enable_metrics,
enable_logs=True, # Controlled by OTEL_LOGS_EXPORTER env var
enable_system_metrics=True # Controlled by ENABLE_SYSTEM_METRICS env var
)
except Exception as e:
self.logger.warning("Failed to setup telemetry", error=str(e))
# Setup lifespan
self.app.router.lifespan_context = self._create_lifespan()
@@ -361,10 +327,6 @@ class BaseFastAPIService:
method=request.method
)
# Record error metric if available
if self.metrics_collector:
self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
return JSONResponse(
status_code=500,
content={
@@ -409,7 +371,10 @@ class BaseFastAPIService:
def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
"""
Register custom metrics for the service
Register custom OTEL metrics for the service.
Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
are automatically created by setup_telemetry(). Use this for additional custom metrics.
Args:
metrics_config: Dict with metric name as key and config as value
@@ -417,25 +382,36 @@ class BaseFastAPIService:
"user_registrations": {
"type": "counter",
"description": "Total user registrations",
"labels": ["status"]
"unit": "registrations"
}
}
"""
if not self.metrics_collector:
self.logger.warning("Metrics collector not available")
if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
self.logger.warning("OTEL meter provider not available - metrics not registered")
return
from opentelemetry.metrics import get_meter
meter = get_meter(self.service_name)
for metric_name, config in metrics_config.items():
metric_type = config.get("type", "counter")
description = config.get("description", f"{metric_name} metric")
labels = config.get("labels", [])
unit = config.get("unit", "1")
if metric_type == "counter":
self.metrics_collector.register_counter(metric_name, description, labels=labels)
elif metric_type == "histogram":
self.metrics_collector.register_histogram(metric_name, description, labels=labels)
else:
self.logger.warning(f"Unsupported metric type: {metric_type}")
try:
if metric_type == "counter":
meter.create_counter(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom counter: {metric_name}")
elif metric_type == "histogram":
meter.create_histogram(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom histogram: {metric_name}")
elif metric_type == "gauge":
meter.create_up_down_counter(metric_name, description=description, unit=unit)
self.logger.info(f"Registered custom gauge: {metric_name}")
else:
self.logger.warning(f"Unsupported metric type: {metric_type}")
except Exception as e:
self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
"""