Improve monitoring 5

Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -8,13 +8,12 @@ import json
 import structlog
 import resource
 import os
-from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse, Response
-import httpx
 import time
+from fastapi import Request, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.responses import StreamingResponse
+import httpx
 from shared.redis_utils import initialize_redis, close_redis, get_redis_client
-from typing import Dict, Any
+from shared.service_base import StandardFastAPIService
 from app.core.config import settings
 from app.middleware.request_id import RequestIDMiddleware
@@ -26,128 +25,84 @@ from app.middleware.subscription import SubscriptionMiddleware
 from app.middleware.demo_middleware import DemoMiddleware
 from app.middleware.read_only_mode import ReadOnlyModeMiddleware
 from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
-
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "gateway"):
-    """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
-    # Create resource with service name
-    resource = Resource.create({"service.name": service_name})
-    # Configure OTLP exporter (sends to OpenTelemetry Collector)
-    otlp_exporter = OTLPSpanExporter(
-        endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
-        insecure=True  # Use insecure connection for internal cluster communication
-    )
-    # Configure tracer provider
-    provider = TracerProvider(resource=resource)
-    processor = BatchSpanProcessor(otlp_exporter)
-    provider.add_span_processor(processor)
-    # Set global tracer provider
-    trace.set_tracer_provider(provider)
-    return provider
-
-# Initialize tracing
-tracer_provider = setup_tracing("gateway")
-
-# Setup logging
-setup_logging("gateway", settings.LOG_LEVEL)
+# Initialize logger
 logger = structlog.get_logger()

-# Check file descriptor limits and warn if too low
+# Check file descriptor limits
 try:
     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
     if soft_limit < 1024:
-        logger.warning(f"Low file descriptor limit detected: {soft_limit}. Gateway may experience 'too many open files' errors.")
-        logger.warning(f"Recommended: Increase limit with 'ulimit -n 4096' or higher for production.")
-        if soft_limit < 256:
-            logger.error(f"Critical: File descriptor limit ({soft_limit}) is too low for gateway operation!")
+        logger.warning(f"Low file descriptor limit detected: {soft_limit}")
     else:
         logger.info(f"File descriptor limit: {soft_limit} (sufficient)")
 except Exception as e:
     logger.debug(f"Could not check file descriptor limits: {e}")
-# Check and log current working directory and permissions
-try:
-    cwd = os.getcwd()
-    logger.info(f"Current working directory: {cwd}")
-    # Check if we can write to common log locations
-    test_locations = ["/var/log", "./logs", "."]
-    for location in test_locations:
-        try:
-            test_file = os.path.join(location, ".gateway_permission_test")
-            with open(test_file, 'w') as f:
-                f.write("test")
-            os.remove(test_file)
-            logger.info(f"Write permission confirmed for: {location}")
-        except Exception as e:
-            logger.warning(f"Cannot write to {location}: {e}")
-except Exception as e:
-    logger.debug(f"Could not check directory permissions: {e}")
-
-# Create FastAPI app
-app = FastAPI(
-    title="Bakery Forecasting API Gateway",
-    description="Central API Gateway for bakery forecasting microservices",
-    version="1.0.0",
-    docs_url="/docs",
-    redoc_url="/redoc",
-    redirect_slashes=False  # Disable automatic trailing slash redirects
-)
-
-# Instrument FastAPI with OpenTelemetry
-FastAPIInstrumentor.instrument_app(app)
-# Instrument httpx for outgoing requests
-HTTPXClientInstrumentor().instrument()
-# Instrument Redis (will be active once redis client is initialized)
-RedisInstrumentor().instrument()
-
-# Initialize metrics collector
-metrics_collector = MetricsCollector("gateway")
-# Add metrics middleware to track HTTP requests
-add_metrics_middleware(app, metrics_collector)
 # Redis client for SSE streaming
 redis_client = None

-# CORS middleware - Add first
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=settings.CORS_ORIGINS_LIST,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+class GatewayService(StandardFastAPIService):
+    """Gateway Service with standardized monitoring setup"""
+
+    async def on_startup(self, app):
+        """Custom startup logic for Gateway"""
+        global redis_client
+        # Initialize Redis
+        try:
+            await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
+            redis_client = await get_redis_client()
+            logger.info("Connected to Redis for SSE streaming")
+            # Add API rate limiting middleware with Redis client
+            app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
+            logger.info("API rate limiting middleware enabled")
+        except Exception as e:
+            logger.error(f"Failed to connect to Redis: {e}")
+
+        # Register custom metrics for gateway-specific operations
+        if self.telemetry_providers and self.telemetry_providers.app_metrics:
+            logger.info("Gateway-specific metrics tracking enabled")
+
+        await super().on_startup(app)
+
+    async def on_shutdown(self, app):
+        """Custom shutdown logic for Gateway"""
+        await super().on_shutdown(app)
+        # Close Redis
+        await close_redis()
+        logger.info("Redis connection closed")
+
+# Create service instance
+service = GatewayService(
+    service_name="gateway",
+    app_name="Bakery Forecasting API Gateway",
+    description="Central API Gateway for bakery forecasting microservices",
+    version="1.0.0",
+    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
+    cors_origins=settings.CORS_ORIGINS_LIST,
+    enable_metrics=True,
+    enable_health_checks=True,
+    enable_tracing=True,
+    enable_cors=True
+)

-# Custom middleware - Add in REVERSE order (last added = first executed)
+# Create FastAPI app
+app = service.create_app()
+
+# Add gateway-specific middleware (in REVERSE order of execution)
 # Execution order: RequestIDMiddleware -> DemoMiddleware -> AuthMiddleware -> ReadOnlyModeMiddleware -> SubscriptionMiddleware -> APIRateLimitMiddleware -> RateLimitMiddleware -> LoggingMiddleware
-app.add_middleware(LoggingMiddleware)  # Executes 8th (outermost)
-app.add_middleware(RateLimitMiddleware, calls_per_minute=300)  # Executes 7th - Simple rate limit
-# Note: APIRateLimitMiddleware will be added on startup with Redis client
-app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)  # Executes 5th
-app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)  # Executes 4th - Enforce read-only mode
-app.add_middleware(AuthMiddleware)  # Executes 3rd - Checks for demo context
-app.add_middleware(DemoMiddleware)  # Executes 2nd - Sets demo user context
-app.add_middleware(RequestIDMiddleware)  # Executes 1st (innermost) - Generates request ID for tracing
+app.add_middleware(LoggingMiddleware)
+app.add_middleware(RateLimitMiddleware, calls_per_minute=300)
+app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
+app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL)
+app.add_middleware(AuthMiddleware)
+app.add_middleware(DemoMiddleware)
+app.add_middleware(RequestIDMiddleware)
 # Include routers
 app.include_router(auth.router, prefix="/api/v1/auth", tags=["authentication"])
@@ -156,114 +111,18 @@ app.include_router(subscription.router, prefix="/api/v1", tags=["subscriptions"]
 app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"])
 app.include_router(nominatim.router, prefix="/api/v1/nominatim", tags=["location"])
 app.include_router(geocoding.router, prefix="/api/v1/geocoding", tags=["geocoding"])
-# app.include_router(poi_context.router, prefix="/api/v1/poi-context", tags=["poi-context"])  # Removed to implement tenant-based architecture
 app.include_router(pos.router, prefix="/api/v1/pos", tags=["pos"])
 app.include_router(demo.router, prefix="/api/v1", tags=["demo"])
@app.on_event("startup")
async def startup_event():
"""Application startup"""
global redis_client
logger.info("Starting API Gateway")
# Initialize shared Redis connection
try:
await initialize_redis(settings.REDIS_URL, db=0, max_connections=50)
redis_client = await get_redis_client()
logger.info("Connected to Redis for SSE streaming")
# Add API rate limiting middleware with Redis client
app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client)
logger.info("API rate limiting middleware enabled with subscription-based quotas")
except Exception as e:
logger.error(f"Failed to connect to Redis: {e}")
logger.warning("API rate limiting middleware will fail open (allow all requests)")
metrics_collector.register_counter(
"gateway_auth_requests_total",
"Total authentication requests"
)
metrics_collector.register_counter(
"gateway_auth_responses_total",
"Total authentication responses"
)
metrics_collector.register_counter(
"gateway_auth_errors_total",
"Total authentication errors"
)
metrics_collector.register_histogram(
"gateway_request_duration_seconds",
"Request duration in seconds"
)
logger.info("Metrics registered successfully")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed
# Initialize system metrics collection
system_metrics = SystemMetricsCollector("gateway")
logger.info("System metrics collection started")
logger.info("Metrics export configured via OpenTelemetry OTLP")
logger.info("API Gateway started successfully")
@app.on_event("shutdown")
async def shutdown_event():
"""Application shutdown"""
logger.info("Shutting down API Gateway")
# Close shared Redis connection
await close_redis()
# Clean up service discovery
# await service_discovery.cleanup()
logger.info("API Gateway shutdown complete")
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "api-gateway",
"version": "1.0.0",
"timestamp": time.time()
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
 # ================================================================
 # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS
 # ================================================================

 def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
-    """
-    Determine which Redis channels to subscribe to based on filters.
-
-    Args:
-        tenant_id: Tenant identifier
-        channel_filters: List of channel patterns (e.g., ["inventory.alerts", "*.notifications"])
-
-    Returns:
-        List of full channel names to subscribe to
-
-    Examples:
-        >>> _get_subscription_channels("abc", ["inventory.alerts"])
-        ["tenant:abc:inventory.alerts"]
-        >>> _get_subscription_channels("abc", ["*.alerts"])
-        ["tenant:abc:inventory.alerts", "tenant:abc:production.alerts", ...]
-        >>> _get_subscription_channels("abc", [])
-        ["tenant:abc:inventory.alerts", "tenant:abc:inventory.notifications", ...]
-    """
+    """Determine which Redis channels to subscribe to based on filters"""
     all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
     all_classes = ["alerts", "notifications"]
     channels = []

     if not channel_filters:
@@ -271,70 +130,49 @@ def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list:
         for domain in all_domains:
             for event_class in all_classes:
                 channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
-        # Also subscribe to recommendations (tenant-wide)
         channels.append(f"tenant:{tenant_id}:recommendations")
-        # Also subscribe to legacy channel for backward compatibility
-        channels.append(f"alerts:{tenant_id}")
+        channels.append(f"alerts:{tenant_id}")  # Legacy
         return channels

     # Parse filters and expand wildcards
     for filter_pattern in channel_filters:
         if filter_pattern == "*.*":
-            # All channels
             for domain in all_domains:
                 for event_class in all_classes:
                     channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
             channels.append(f"tenant:{tenant_id}:recommendations")
         elif filter_pattern.endswith(".*"):
-            # Domain wildcard (e.g., "inventory.*")
             domain = filter_pattern.split(".")[0]
             for event_class in all_classes:
                 channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
         elif filter_pattern.startswith("*."):
-            # Class wildcard (e.g., "*.alerts")
             event_class = filter_pattern.split(".")[1]
             if event_class == "recommendations":
                 channels.append(f"tenant:{tenant_id}:recommendations")
             else:
                 for domain in all_domains:
                     channels.append(f"tenant:{tenant_id}:{domain}.{event_class}")
         elif filter_pattern == "recommendations":
-            # Recommendations channel
             channels.append(f"tenant:{tenant_id}:recommendations")
         else:
-            # Specific channel (e.g., "inventory.alerts")
            channels.append(f"tenant:{tenant_id}:{filter_pattern}")

-    return list(set(channels))  # Remove duplicates
+    return list(set(channels))
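For reference, a minimal sketch of how the expansion above behaves ("abc" is a made-up tenant id, and ordering is unspecified because of the set()):

    # Hypothetical calls against _get_subscription_channels
    _get_subscription_channels("abc", ["inventory.alerts"])
    #   -> ["tenant:abc:inventory.alerts"]
    _get_subscription_channels("abc", ["*.alerts"])
    #   -> one "tenant:abc:<domain>.alerts" entry per domain
    _get_subscription_channels("abc", [])
    #   -> every domain/class channel, plus "tenant:abc:recommendations"
    #      and the legacy "alerts:abc" channel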
 async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list:
-    """
-    Load initial state from Redis cache based on channel filters.
-
-    Args:
-        redis_client: Redis client
-        tenant_id: Tenant identifier
-        channel_filters: List of channel patterns
-
-    Returns:
-        List of initial events
-    """
+    """Load initial state from Redis cache based on channel filters"""
     initial_events = []

     try:
         if not channel_filters:
-            # Load from legacy cache if no filters (backward compat)
+            # Legacy cache
             legacy_cache_key = f"active_alerts:{tenant_id}"
             cached_data = await redis_client.get(legacy_cache_key)
             if cached_data:
                 return json.loads(cached_data)

-            # Also try loading from new domain-specific caches
+            # New domain-specific caches
             all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
             all_classes = ["alerts", "notifications"]
@@ -343,10 +181,9 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
                     cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
                     cached_data = await redis_client.get(cache_key)
                     if cached_data:
-                        events = json.loads(cached_data)
-                        initial_events.extend(events)
+                        initial_events.extend(json.loads(cached_data))

-            # Load recommendations
+            # Recommendations
             recommendations_cache_key = f"active_events:{tenant_id}:recommendations"
             cached_data = await redis_client.get(recommendations_cache_key)
             if cached_data:
@@ -356,36 +193,29 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
             # Load based on specific filters
             for filter_pattern in channel_filters:
-                # Extract domain and class from filter
                 if "." in filter_pattern:
                     parts = filter_pattern.split(".")
                     domain = parts[0] if parts[0] != "*" else None
                     event_class = parts[1] if len(parts) > 1 and parts[1] != "*" else None

                     if domain and event_class:
-                        # Specific cache (e.g., "inventory.alerts")
                         cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s"
                         cached_data = await redis_client.get(cache_key)
                         if cached_data:
                             initial_events.extend(json.loads(cached_data))
                     elif domain and not event_class:
-                        # Domain wildcard (e.g., "inventory.*")
                         for ec in ["alerts", "notifications"]:
                             cache_key = f"active_events:{tenant_id}:{domain}.{ec}"
                             cached_data = await redis_client.get(cache_key)
                             if cached_data:
                                 initial_events.extend(json.loads(cached_data))
                     elif not domain and event_class:
-                        # Class wildcard (e.g., "*.alerts")
                         all_domains = ["inventory", "production", "supply_chain", "demand", "operations"]
                         for d in all_domains:
                             cache_key = f"active_events:{tenant_id}:{d}.{event_class}s"
                             cached_data = await redis_client.get(cache_key)
                             if cached_data:
                                 initial_events.extend(json.loads(cached_data))
                 elif filter_pattern == "recommendations":
                     cache_key = f"active_events:{tenant_id}:recommendations"
                     cached_data = await redis_client.get(cache_key)
@@ -400,27 +230,14 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis
 def _determine_event_type(event_data: dict) -> str:
-    """
-    Determine SSE event type from event data.
-
-    Args:
-        event_data: Event data dictionary
-
-    Returns:
-        SSE event type: 'alert', 'notification', or 'recommendation'
-    """
-    # New event architecture uses 'event_class'
+    """Determine SSE event type from event data"""
     if 'event_class' in event_data:
-        return event_data['event_class']  # 'alert', 'notification', or 'recommendation'
+        return event_data['event_class']

-    # Legacy format uses 'item_type'
     if 'item_type' in event_data:
         if event_data['item_type'] == 'recommendation':
             return 'recommendation'
         else:
             return 'alert'

-    # Default to 'alert' for backward compatibility
     return 'alert'
@@ -432,42 +249,25 @@ def _determine_event_type(event_data: dict) -> str:
 async def events_stream(
     request: Request,
     tenant_id: str,
-    channels: str = None  # Comma-separated channel filters (e.g., "inventory.alerts,production.notifications")
+    channels: str = None
 ):
     """
     Server-Sent Events stream for real-time notifications with multi-channel support.

-    Authentication is handled by auth middleware via query param token.
-    User context is available in request.state.user (injected by middleware).
-
     Query Parameters:
         tenant_id: Tenant identifier (required)
         channels: Comma-separated channel filters (optional)
-
-    Examples:
-        - "inventory.alerts,production.notifications" - Specific channels
-        - "*.alerts" - All alert channels
-        - "inventory.*" - All inventory events
-        - None - All channels (default, backward compatible)
-
-    New channel pattern: tenant:{tenant_id}:{domain}.{class}
-    Examples:
-        - tenant:abc:inventory.alerts
-        - tenant:abc:production.notifications
-        - tenant:abc:recommendations
-
-    Legacy channel (backward compat): alerts:{tenant_id}
     """
     global redis_client

     if not redis_client:
         raise HTTPException(status_code=503, detail="SSE service unavailable")

-    # Extract user context from request state (set by auth middleware)
+    # Extract user context from request state
     user_context = request.state.user
     user_id = user_context.get('user_id')
     email = user_context.get('email')

-    # Validate tenant_id parameter
     if not tenant_id:
         raise HTTPException(status_code=400, detail="tenant_id query parameter is required")
@@ -479,79 +279,53 @@ async def events_stream(
     logger.info(f"SSE connection request for user {email}, tenant {tenant_id}, channels: {channel_filters or 'all'}")

     async def event_generator():
-        """Generate server-sent events from Redis pub/sub with multi-channel support"""
+        """Generate server-sent events from Redis pub/sub"""
         pubsub = None
         try:
-            # Create pubsub connection with resource monitoring
             pubsub = redis_client.pubsub()
             logger.debug(f"Created Redis pubsub connection for tenant: {tenant_id}")

-            # Monitor connection count
-            try:
-                connection_info = await redis_client.info('clients')
-                connected_clients = connection_info.get('connected_clients', 'unknown')
-                logger.debug(f"Redis connected clients: {connected_clients}")
-            except Exception:
-                # Don't fail if we can't get connection info
-                pass
-
-            # Determine which channels to subscribe to
+            # Determine channels
             subscription_channels = _get_subscription_channels(tenant_id, channel_filters)

-            # Subscribe to all determined channels
+            # Subscribe
             if subscription_channels:
                 await pubsub.subscribe(*subscription_channels)
                 logger.info(f"Subscribed to {len(subscription_channels)} channels for tenant {tenant_id}")
             else:
-                # Fallback to legacy channel if no channels specified
                 legacy_channel = f"alerts:{tenant_id}"
                 await pubsub.subscribe(legacy_channel)
-                logger.info(f"Subscribed to legacy channel: {legacy_channel}")

-            # Send initial connection event
+            # Connection event
             yield f"event: connection\n"
             yield f"data: {json.dumps({'type': 'connected', 'message': 'SSE connection established', 'channels': subscription_channels or ['all'], 'timestamp': time.time()})}\n\n"

-            # Fetch and send initial state from cache (domain-specific or legacy)
+            # Initial state
             initial_events = await _load_initial_state(redis_client, tenant_id, channel_filters)
             if initial_events:
                 logger.info(f"Sending {len(initial_events)} initial events to tenant {tenant_id}")
                 yield f"event: initial_state\n"
                 yield f"data: {json.dumps(initial_events)}\n\n"
-            else:
-                # Send empty initial state for compatibility
-                yield f"event: initial_state\n"
-                yield f"data: {json.dumps([])}\n\n"

             heartbeat_counter = 0
             while True:
-                # Check if client has disconnected
                 if await request.is_disconnected():
                     logger.info(f"SSE client disconnected for tenant: {tenant_id}")
                     break

                 try:
-                    # Get message from Redis with timeout
                     message = await asyncio.wait_for(pubsub.get_message(ignore_subscribe_messages=True), timeout=10.0)

                     if message and message['type'] == 'message':
-                        # Forward the event from Redis
                         event_data = json.loads(message['data'])
-                        # Determine event type for SSE
                         event_type = _determine_event_type(event_data)
-                        # Add channel metadata for frontend routing
                         event_data['_channel'] = message['channel'].decode('utf-8') if isinstance(message['channel'], bytes) else message['channel']

                         yield f"event: {event_type}\n"
                         yield f"data: {json.dumps(event_data)}\n\n"
-                        logger.debug(f"SSE event sent to tenant {tenant_id}: {event_type} - {event_data.get('title')}")

                 except asyncio.TimeoutError:
-                    # Send heartbeat every 10 timeouts (100 seconds)
                     heartbeat_counter += 1
                     if heartbeat_counter >= 10:
                         yield f"event: heartbeat\n"
@@ -563,24 +337,13 @@ async def events_stream(
         except Exception as e:
             logger.error(f"SSE error for tenant {tenant_id}: {e}", exc_info=True)
         finally:
-            try:
-                if pubsub:
-                    try:
-                        # Unsubscribe from all channels
-                        await pubsub.unsubscribe()
-                        logger.debug(f"Unsubscribed from Redis channels for tenant: {tenant_id}")
-                    except Exception as unsubscribe_error:
-                        logger.error(f"Failed to unsubscribe Redis pubsub for tenant {tenant_id}: {unsubscribe_error}")
-                    try:
-                        # Close pubsub connection
-                        await pubsub.close()
-                        logger.debug(f"Closed Redis pubsub connection for tenant: {tenant_id}")
-                    except Exception as close_error:
-                        logger.error(f"Failed to close Redis pubsub for tenant {tenant_id}: {close_error}")
-                logger.info(f"SSE connection closed for tenant: {tenant_id}")
-            except Exception as finally_error:
-                logger.error(f"Error in SSE cleanup for tenant {tenant_id}: {finally_error}")
+            if pubsub:
+                try:
+                    await pubsub.unsubscribe()
+                    await pubsub.close()
+                except Exception as e:
+                    logger.error(f"Error closing pubsub: {e}")
+            logger.info(f"SSE connection closed for tenant: {tenant_id}")

     return StreamingResponse(
         event_generator(),
@@ -593,55 +356,35 @@ async def events_stream(
         }
     )
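For context, a minimal client sketch for this stream, assuming the route is mounted at /api/v1/events and the auth middleware accepts a token query parameter (both are assumptions; the route decorator sits outside these hunks):

    import asyncio
    import httpx

    async def consume_events(base_url: str, tenant_id: str, token: str):
        # Hypothetical endpoint path and auth scheme - adjust to the real route
        params = {"tenant_id": tenant_id, "channels": "*.alerts", "token": token}
        async with httpx.AsyncClient(timeout=None) as client:
            async with client.stream("GET", f"{base_url}/api/v1/events", params=params) as resp:
                async for line in resp.aiter_lines():
                    # Frames arrive as "event: <type>" / "data: <json>" pairs
                    print(line)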
 # ================================================================
 # WEBSOCKET ROUTING FOR TRAINING SERVICE
 # ================================================================

 @app.websocket("/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live")
 async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
-    """
-    Simple WebSocket proxy with token verification only.
-    Validates the token and forwards the connection to the training service.
-    """
-    # Get token from query params
+    """WebSocket proxy with token verification for training service"""
     token = websocket.query_params.get("token")
     if not token:
-        logger.warning("WebSocket proxy rejected - missing token",
-                       job_id=job_id,
-                       tenant_id=tenant_id)
         await websocket.accept()
         await websocket.close(code=1008, reason="Authentication token required")
         return

     # Verify token
     from shared.auth.jwt_handler import JWTHandler
     jwt_handler = JWTHandler(settings.JWT_SECRET_KEY, settings.JWT_ALGORITHM)
     try:
         payload = jwt_handler.verify_token(token)
         if not payload or not payload.get('user_id'):
-            logger.warning("WebSocket proxy rejected - invalid token",
-                           job_id=job_id,
-                           tenant_id=tenant_id)
             await websocket.accept()
             await websocket.close(code=1008, reason="Invalid token")
             return
-        logger.info("WebSocket proxy - token verified",
-                    user_id=payload.get('user_id'),
-                    tenant_id=tenant_id,
-                    job_id=job_id)
     except Exception as e:
-        logger.warning("WebSocket proxy - token verification failed",
-                       job_id=job_id,
-                       error=str(e))
         await websocket.accept()
         await websocket.close(code=1008, reason="Token verification failed")
         return

-    # Accept the connection
     await websocket.accept()

     # Build WebSocket URL to training service
@@ -649,33 +392,24 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
     training_ws_url = training_service_base.replace('http://', 'ws://').replace('https://', 'wss://')
     training_ws_url = f"{training_ws_url}/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live?token={token}"

-    logger.info("Gateway proxying WebSocket to training service",
-                job_id=job_id,
-                training_ws_url=training_ws_url.replace(token, '***'))
-
     training_ws = None
     try:
-        # Connect to training service WebSocket
         import websockets
         from websockets.protocol import State

         training_ws = await websockets.connect(
             training_ws_url,
-            ping_interval=120,  # Send ping every 2 minutes (tolerates long training operations)
-            ping_timeout=60,    # Wait up to 1 minute for pong (graceful timeout)
-            close_timeout=60,   # Increase close timeout for graceful shutdown
+            ping_interval=120,
+            ping_timeout=60,
+            close_timeout=60,
             open_timeout=30
         )
-        logger.info("Gateway connected to training service WebSocket", job_id=job_id)

         async def forward_frontend_to_training():
-            """Forward messages from frontend to training service"""
             try:
                 while training_ws and training_ws.state == State.OPEN:
                     data = await websocket.receive()
                     if data.get("type") == "websocket.receive":
                         if "text" in data:
                             await training_ws.send(data["text"])
@@ -683,30 +417,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
                             await training_ws.send(data["bytes"])
                     elif data.get("type") == "websocket.disconnect":
                         break
-            except Exception as e:
-                logger.debug("Frontend to training forward ended", error=str(e))
+            except Exception:
+                pass

         async def forward_training_to_frontend():
-            """Forward messages from training service to frontend"""
-            message_count = 0
             try:
                 while training_ws and training_ws.state == State.OPEN:
                     message = await training_ws.recv()
                     await websocket.send_text(message)
-                    message_count += 1
-
-                    # Log every 10th message to track connectivity
-                    if message_count % 10 == 0:
-                        logger.debug("WebSocket proxy active",
-                                     job_id=job_id,
-                                     messages_forwarded=message_count)
-            except Exception as e:
-                logger.info("Training to frontend forward ended",
-                            job_id=job_id,
-                            messages_forwarded=message_count,
-                            error=str(e))
+            except Exception:
+                pass

-        # Run both forwarding tasks concurrently
         await asyncio.gather(
             forward_frontend_to_training(),
             forward_training_to_frontend(),
@@ -716,20 +437,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_
     except Exception as e:
         logger.error("WebSocket proxy error", job_id=job_id, error=str(e))
     finally:
-        # Cleanup
         if training_ws and training_ws.state == State.OPEN:
             try:
                 await training_ws.close()
             except:
                 pass
         try:
             if not websocket.client_state.name == 'DISCONNECTED':
                 await websocket.close(code=1000, reason="Proxy closed")
         except:
             pass
-        logger.info("WebSocket proxy connection closed", job_id=job_id)
 if __name__ == "__main__":
     import uvicorn

View File

@@ -48,9 +48,9 @@ signoz:
     signoz_traces_ttl_duration_hrs: "168"
     signoz_metrics_ttl_duration_hrs: "168"
     signoz_logs_ttl_duration_hrs: "168"
-    # OpAMP Server Configuration
-    signoz_opamp_server_enabled: "true"
-    signoz_opamp_server_endpoint: "0.0.0.0:4320"
+    # OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
+    signoz_opamp_server_enabled: "false"
+    # signoz_opamp_server_endpoint: "0.0.0.0:4320"

   persistence:
     enabled: true
@@ -149,9 +149,10 @@ otelCollector:
     repository: signoz/signoz-otel-collector
     tag: v0.129.12  # Latest recommended version

-  # OpAMP Configuration - Enabled for dynamic configuration management
-  # Note: OpAMP allows remote configuration management via SigNoz backend
-  # This replaces the manual kubectl patch approach
+  # OpAMP Configuration - DISABLED for development
+  # OpAMP is designed for production with remote config management
+  # In dev, it causes gRPC instability and collector reloads
+  # We use static configuration instead

   # Init containers for the Otel Collector pod
   initContainers:
@@ -231,6 +232,9 @@ otelCollector:
         secretName: postgres-tls
     - name: postgres-tls-fixed
       emptyDir: {}
+    - name: varlogpods
+      hostPath:
+        path: /var/log/pods

   extraVolumeMounts:
     - name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
     - name: postgres-tls-fixed
      mountPath: /etc/postgres-tls
      readOnly: false
+    - name: varlogpods
+      mountPath: /var/log/pods
+      readOnly: true

-  # Enable OpAMP for dynamic configuration management
+  # Disable OpAMP - use static configuration only
+  # Use 'args' instead of 'extraArgs' to completely override the command
   command:
     name: /signoz-otel-collector
-  extraArgs:
+  args:
     - --config=/conf/otel-collector-config.yaml
-    - --manager-config=/conf/otel-collector-opamp-config.yaml
     - --feature-gates=-pkg.translator.prometheus.NormalizeName

   # OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
           allowed_origins:
             - "*"

+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
+            timestamp:
+              parse_from: attributes.time
+              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+          # Fix timestamp parsing - extract from the parsed time field
+          - type: move
+            from: attributes.time
+            to: attributes.timestamp
+          # Extract Kubernetes metadata from file path
+          - type: regex_parser
+            id: extract_metadata_from_filepath
+            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
+            parse_from: attributes["log.file.path"]
+          # Move metadata to resource attributes
+          - type: move
+            from: attributes.namespace
+            to: resource["k8s.namespace.name"]
+          - type: move
+            from: attributes.pod_name
+            to: resource["k8s.pod.name"]
+          - type: move
+            from: attributes.container_name
+            to: resource["k8s.container.name"]
+          - type: move
+            from: attributes.log
+            to: body
+      # Kubernetes Cluster Receiver - Collects cluster-level metrics
+      # Provides information about nodes, namespaces, pods, and other cluster resources
+      k8s_cluster:
+        collection_interval: 30s
+        node_conditions_to_report:
+          - Ready
+          - MemoryPressure
+          - DiskPressure
+          - PIDPressure
+          - NetworkUnavailable
+        allocatable_types_to_report:
+          - cpu
+          - memory
+          - pods

       # PostgreSQL receivers for database metrics
       # ENABLED: Monitor users configured and credentials stored in secrets
       # Collects metrics directly from PostgreSQL databases with proper TLS
@@ -538,6 +602,43 @@ otelCollector:
           password: ${env:RABBITMQ_PASSWORD}
         collection_interval: 30s

+      # Prometheus Receiver - Scrapes metrics from Kubernetes API
+      # Simplified configuration using only Kubernetes API metrics
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: 'kubernetes-nodes-cadvisor'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - action: labelmap
+                  regex: __meta_kubernetes_node_label_(.+)
+                - target_label: __address__
+                  replacement: kubernetes.default.svc:443
+                - source_labels: [__meta_kubernetes_node_name]
+                  regex: (.+)
+                  target_label: __metrics_path__
+                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            - job_name: 'kubernetes-apiserver'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: endpoints
+              relabel_configs:
+                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+                  action: keep
+                  regex: default;kubernetes;https
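            # For reference, the cadvisor job's relabeling above proxies node
            # scrapes through the API server; a hypothetical resulting target:
            #   https://kubernetes.default.svc:443/api/v1/nodes/worker-1/proxy/metrics/cadvisor
            # ("worker-1" is a placeholder node name)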
     processors:
       # Batch processor for better performance (optimized for high throughput)
       batch:
@@ -562,6 +663,25 @@ otelCollector:
           detectors: [env, system, docker]
           timeout: 5s

+      # Kubernetes attributes processor - CRITICAL for logs
+      # Extracts pod, namespace, container metadata from log attributes
+      k8sattributes:
+        auth_type: "serviceAccount"
+        passthrough: false
+        extract:
+          metadata:
+            - k8s.pod.name
+            - k8s.pod.uid
+            - k8s.deployment.name
+            - k8s.namespace.name
+            - k8s.node.name
+            - k8s.container.name
+          labels:
+            - tag_name: "app"
+            - tag_name: "pod-template-hash"
+          annotations:
+            - tag_name: "description"
       # SigNoz span metrics processor with delta aggregation (recommended)
       # Generates RED metrics (Rate, Error, Duration) from trace spans
       signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
                       postgresql/orchestrator, postgresql/pos, postgresql/procurement,
                       postgresql/production, postgresql/recipes, postgresql/sales,
                       postgresql/suppliers, postgresql/tenant, postgresql/training,
-                      redis, rabbitmq]
+                      redis, rabbitmq, k8s_cluster, prometheus]
           processors: [memory_limiter, batch, resourcedetection]
           exporters: [signozclickhousemetrics]
@@ -653,17 +773,38 @@ otelCollector:
           processors: [batch/meter]
           exporters: [signozclickhousemeter]

-        # Logs pipeline
+        # Logs pipeline - includes both OTLP and Kubernetes pod logs
         logs:
-          receivers: [otlp]
-          processors: [memory_limiter, batch, resourcedetection]
+          receivers: [otlp, filelog]
+          processors: [memory_limiter, batch, resourcedetection, k8sattributes]
           exporters: [clickhouselogsexporter]

 # Additional Configuration
 serviceAccount:
   create: true
   annotations: {}
-  name: ""
+  name: "signoz-otel-collector"
+
+# RBAC Configuration for Kubernetes monitoring
+# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
+rbac:
+  create: true
+  rules:
+    - apiGroups: [""]
+      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["apps"]
+      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["batch"]
+      resources: ["jobs", "cronjobs"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["extensions"]
+      resources: ["deployments", "daemonsets", "replicasets"]
+      verbs: ["get", "list", "watch"]
+    - apiGroups: ["metrics.k8s.io"]
+      resources: ["nodes", "pods"]
+      verbs: ["get", "list", "watch"]
# Security Context
securityContext:

View File

@@ -66,6 +66,11 @@ signoz:
     signoz_traces_ttl_duration_hrs: "720"
     signoz_metrics_ttl_duration_hrs: "720"
     signoz_logs_ttl_duration_hrs: "720"
+    # OpAMP Server Configuration
+    # WARNING: OpAMP can cause gRPC instability and collector reloads
+    # Only enable if you have a stable OpAMP backend server
+    signoz_opamp_server_enabled: "false"
+    # signoz_opamp_server_endpoint: "0.0.0.0:4320"

     # SMTP configuration for email alerts
     signoz_smtp_enabled: "true"
     signoz_smtp_host: "smtp.gmail.com"
@@ -247,17 +252,52 @@ otelCollector:
     tag: v0.129.12  # Updated to latest recommended version
     pullPolicy: IfNotPresent

+  # Init containers for the Otel Collector pod
+  initContainers:
+    fix-postgres-tls:
+      enabled: true
+      image:
+        registry: docker.io
+        repository: busybox
+        tag: 1.35
+        pullPolicy: IfNotPresent
+      command:
+        - sh
+        - -c
+        - |
+          echo "Fixing PostgreSQL TLS file permissions..."
+          cp /etc/postgres-tls-source/* /etc/postgres-tls/
+          chmod 600 /etc/postgres-tls/server-key.pem
+          chmod 644 /etc/postgres-tls/server-cert.pem
+          chmod 644 /etc/postgres-tls/ca-cert.pem
+          echo "PostgreSQL TLS permissions fixed"
+      volumeMounts:
+        - name: postgres-tls-source
+          mountPath: /etc/postgres-tls-source
+          readOnly: true
+        - name: postgres-tls-fixed
+          mountPath: /etc/postgres-tls
+          readOnly: false

   service:
     type: ClusterIP
     ports:
       - name: otlp-grpc
         port: 4317
+        targetPort: 4317
+        protocol: TCP
       - name: otlp-http
         port: 4318
+        targetPort: 4318
+        protocol: TCP
+      - name: prometheus
+        port: 8889
+        targetPort: 8889
+        protocol: TCP
       - name: metrics
         port: 8888
-      - name: healthcheck
-        port: 13133
+        targetPort: 8888
+        protocol: TCP
   resources:
     requests:
@@ -267,6 +307,50 @@ otelCollector:
       cpu: 2000m
       memory: 2Gi
+  # Additional environment variables for receivers
+  additionalEnvs:
+    POSTGRES_MONITOR_USER: "monitoring"
+    POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
+    REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
+    RABBITMQ_USER: "bakery"
+    RABBITMQ_PASSWORD: "forecast123"
+
+  # Mount TLS certificates for secure connections
+  extraVolumes:
+    - name: redis-tls
+      secret:
+        secretName: redis-tls-secret
+    - name: postgres-tls
+      secret:
+        secretName: postgres-tls
+    - name: postgres-tls-fixed
+      emptyDir: {}
+    - name: varlogpods
+      hostPath:
+        path: /var/log/pods
+
+  extraVolumeMounts:
+    - name: redis-tls
+      mountPath: /etc/redis-tls
+      readOnly: true
+    - name: postgres-tls
+      mountPath: /etc/postgres-tls-source
+      readOnly: true
+    - name: postgres-tls-fixed
+      mountPath: /etc/postgres-tls
+      readOnly: false
+    - name: varlogpods
+      mountPath: /var/log/pods
+      readOnly: true
+
+  # Enable OpAMP for dynamic configuration management
+  command:
+    name: /signoz-otel-collector
+  extraArgs:
+    - --config=/conf/otel-collector-config.yaml
+    - --manager-config=/conf/otel-collector-opamp-config.yaml
+    - --feature-gates=-pkg.translator.prometheus.NormalizeName

   # Full OTEL Collector Configuration
   config:
     # Connectors - bridge between pipelines
@@ -297,14 +381,358 @@ otelCollector:
             - "https://monitoring.bakewise.ai"
             - "https://*.bakewise.ai"

+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
+            timestamp:
+              parse_from: attributes.time
+              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
+          # Fix timestamp parsing - extract from the parsed time field
+          - type: move
+            from: attributes.time
+            to: attributes.timestamp
+          # Extract Kubernetes metadata from file path
+          - type: regex_parser
+            id: extract_metadata_from_filepath
+            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
+            parse_from: attributes["log.file.path"]
+          # Move metadata to resource attributes
+          - type: move
+            from: attributes.namespace
+            to: resource["k8s.namespace.name"]
+          - type: move
+            from: attributes.pod_name
+            to: resource["k8s.pod.name"]
+          - type: move
+            from: attributes.container_name
+            to: resource["k8s.container.name"]
+          - type: move
+            from: attributes.log
+            to: body
+
+      # Kubernetes Cluster Receiver - Collects cluster-level metrics
+      # Provides information about nodes, namespaces, pods, and other cluster resources
+      k8s_cluster:
+        collection_interval: 30s
+        node_conditions_to_report:
+          - Ready
+          - MemoryPressure
+          - DiskPressure
+          - PIDPressure
+          - NetworkUnavailable
+        allocatable_types_to_report:
+          - cpu
+          - memory
+          - pods
       # Prometheus receiver for scraping metrics
       prometheus:
         config:
           scrape_configs:
-            - job_name: 'otel-collector'
+            - job_name: 'kubernetes-nodes-cadvisor'
               scrape_interval: 30s
-              static_configs:
-                - targets: ['localhost:8888']
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: node
+              relabel_configs:
+                - action: labelmap
+                  regex: __meta_kubernetes_node_label_(.+)
+                - target_label: __address__
+                  replacement: kubernetes.default.svc:443
+                - source_labels: [__meta_kubernetes_node_name]
+                  regex: (.+)
+                  target_label: __metrics_path__
+                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
+            - job_name: 'kubernetes-apiserver'
+              scrape_interval: 30s
+              scrape_timeout: 10s
+              scheme: https
+              tls_config:
+                insecure_skip_verify: true
+              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+              kubernetes_sd_configs:
+                - role: endpoints
+              relabel_configs:
+                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+                  action: keep
+                  regex: default;kubernetes;https
+      # Redis receiver for cache metrics
+      # ENABLED: Using existing credentials from redis-secrets with TLS
+      redis:
+        endpoint: redis-service.bakery-ia:6379
+        password: ${env:REDIS_PASSWORD}
+        collection_interval: 60s
+        transport: tcp
+        tls:
+          insecure_skip_verify: false
+          cert_file: /etc/redis-tls/redis-cert.pem
+          key_file: /etc/redis-tls/redis-key.pem
+          ca_file: /etc/redis-tls/ca-cert.pem
+        metrics:
+          redis.maxmemory:
+            enabled: true
+          redis.cmd.latency:
+            enabled: true
+
+      # RabbitMQ receiver via management API
+      # ENABLED: Using existing credentials from rabbitmq-secrets
+      rabbitmq:
+        endpoint: http://rabbitmq-service.bakery-ia:15672
+        username: ${env:RABBITMQ_USER}
+        password: ${env:RABBITMQ_PASSWORD}
+        collection_interval: 30s
+      # PostgreSQL receivers for database metrics
+      # Monitor all databases with proper TLS configuration
+      postgresql/auth:
+        endpoint: auth-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - auth_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/inventory:
+        endpoint: inventory-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - inventory_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/orders:
+        endpoint: orders-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orders_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/ai-insights:
+        endpoint: ai-insights-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - ai_insights_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/alert-processor:
+        endpoint: alert-processor-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - alert_processor_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/distribution:
+        endpoint: distribution-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - distribution_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/external:
+        endpoint: external-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - external_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/forecasting:
+        endpoint: forecasting-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - forecasting_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/notification:
+        endpoint: notification-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - notification_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/orchestrator:
+        endpoint: orchestrator-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orchestrator_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/pos:
+        endpoint: pos-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - pos_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/procurement:
+        endpoint: procurement-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - procurement_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/production:
+        endpoint: production-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - production_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/recipes:
+        endpoint: recipes-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - recipes_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/sales:
+        endpoint: sales-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - sales_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/suppliers:
+        endpoint: suppliers-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - suppliers_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/tenant:
+        endpoint: tenant-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - tenant_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
+      postgresql/training:
+        endpoint: training-db-service.bakery-ia:5432
+        username: ${env:POSTGRES_MONITOR_USER}
+        password: ${env:POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - training_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/postgres-tls/server-cert.pem
+          key_file: /etc/postgres-tls/server-key.pem
+          ca_file: /etc/postgres-tls/ca-cert.pem
    processors:
      # High-performance batch processing (official recommendation)
@@ -326,7 +754,7 @@ otelCollector:
      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s
      # Add resource attributes
@@ -339,6 +767,26 @@ otelCollector:
            value: bakery-ia-prod
            action: upsert
      # Kubernetes attributes processor - CRITICAL for logs
      # Extracts pod, namespace, container metadata from log attributes
      k8sattributes:
        auth_type: "serviceAccount"
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
            - k8s.container.name
          labels:
            - tag_name: "app"
            - tag_name: "pod-template-hash"
            - tag_name: "version"
          annotations:
            - tag_name: "description"
      # SigNoz span metrics processor with delta aggregation (recommended)
      # Generates RED metrics (Rate, Error, Duration) from trace spans
      signozspanmetrics/delta:
@@ -354,9 +802,9 @@ otelCollector:
          - name: signoz.collector.id
    exporters:
      # ClickHouse exporter for traces
      clickhousetraces:
        datasource: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
@@ -364,8 +812,9 @@ otelCollector:
          max_interval: 30s
          max_elapsed_time: 300s
      # ClickHouse exporter for metrics
      signozclickhousemetrics:
        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
@@ -375,32 +824,32 @@ otelCollector:
      # ClickHouse exporter for meter data (usage metrics)
      signozclickhousemeter:
        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false
      # ClickHouse exporter for logs
      clickhouselogsexporter:
        dsn: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s
      # Metadata exporter for service metadata
      metadataexporter:
        dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory
      # Debug exporter for debugging (optional)
      debug:
        verbosity: detailed
        sampling_initial: 5
        sampling_thereafter: 200
    service:
      extensions: [health_check, zpages]
@@ -411,9 +860,16 @@ otelCollector:
        processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection, resource]
        exporters: [clickhousetraces, metadataexporter, signozmeter]
      # Metrics pipeline - includes all infrastructure receivers
      metrics:
        receivers: [otlp,
                    postgresql/auth, postgresql/inventory, postgresql/orders,
                    postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
                    postgresql/external, postgresql/forecasting, postgresql/notification,
                    postgresql/orchestrator, postgresql/pos, postgresql/procurement,
                    postgresql/production, postgresql/recipes, postgresql/sales,
                    postgresql/suppliers, postgresql/tenant, postgresql/training,
                    redis, rabbitmq, k8s_cluster, prometheus]
        processors: [memory_limiter, batch, resourcedetection, resource]
        exporters: [signozclickhousemetrics]
@@ -423,10 +879,10 @@ otelCollector:
        processors: [batch/meter]
        exporters: [signozclickhousemeter]
      # Logs pipeline - includes both OTLP and Kubernetes pod logs
      logs:
        receivers: [otlp, filelog]
        processors: [memory_limiter, batch, resourcedetection, resource, k8sattributes]
        exporters: [clickhouselogsexporter]
  # HPA for OTEL Collector
@@ -455,6 +911,27 @@ serviceAccount:
  annotations: {}
  name: "signoz"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster receiver to access Kubernetes API
rbac:
  create: true
  rules:
    - apiGroups: [""]
      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["apps"]
      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["batch"]
      resources: ["jobs", "cronjobs"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["extensions"]
      resources: ["deployments", "daemonsets", "replicasets"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["metrics.k8s.io"]
      resources: ["nodes", "pods"]
      verbs: ["get", "list", "watch"]
# Security Context
securityContext:
  runAsNonRoot: true
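The eighteen postgresql receivers above differ only in the service name and database, which makes the block easy to drift out of sync when edited by hand. A minimal generation sketch, assuming PyYAML is available and the service list is maintained alongside the chart (both are assumptions, not part of this repository):

# Hypothetical helper: emits the repetitive postgresql receiver blocks above.
import yaml  # assumes PyYAML is installed

SERVICES = [
    "auth", "inventory", "orders", "ai-insights", "alert-processor",
    "distribution", "external", "forecasting", "notification", "orchestrator",
    "pos", "procurement", "production", "recipes", "sales", "suppliers",
    "tenant", "training",
]

def postgres_receivers() -> dict:
    receivers = {}
    for svc in SERVICES:
        receivers[f"postgresql/{svc}"] = {
            "endpoint": f"{svc}-db-service.bakery-ia:5432",
            "username": "${env:POSTGRES_MONITOR_USER}",
            "password": "${env:POSTGRES_MONITOR_PASSWORD}",
            # database names use underscores where service names use dashes
            "databases": [f"{svc.replace('-', '_')}_db"],
            "collection_interval": "60s",
            "tls": {
                "insecure": False,
                "cert_file": "/etc/postgres-tls/server-cert.pem",
                "key_file": "/etc/postgres-tls/server-key.pem",
                "ca_file": "/etc/postgres-tls/ca-cert.pem",
            },
        }
    return receivers

print(yaml.safe_dump(postgres_receivers(), sort_keys=False))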

View File

@@ -15,9 +15,13 @@ data:
  LOG_LEVEL: "INFO"
  # Observability Settings - SigNoz enabled
  # Note: Detailed OTEL configuration is in the OBSERVABILITY section below
  ENABLE_TRACING: "true"
  ENABLE_METRICS: "true"
  ENABLE_LOGS: "true"
  ENABLE_OTEL_METRICS: "true"
  ENABLE_SYSTEM_METRICS: "true"
  OTEL_LOGS_EXPORTER: "otlp"
  # Database initialization settings
  # IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -384,15 +388,44 @@ data:
  # ================================================================
  # OBSERVABILITY - SigNoz (Unified Monitoring)
  # ================================================================
  # OpenTelemetry Configuration - Direct to SigNoz OTel Collector
  #
  # ENDPOINT CONFIGURATION:
  # - OTEL_EXPORTER_OTLP_ENDPOINT: Base gRPC endpoint (host:port format, NO http:// prefix)
  #   Used by traces and metrics (gRPC) by default
  #   Format: "host:4317" (gRPC port)
  #
  # PROTOCOL USAGE:
  # - Traces: gRPC (port 4317) - High performance, low latency
  # - Metrics: gRPC (port 4317) - Efficient batch export
  # - Logs: HTTP (port 4318) - Required for OTLP log protocol
  #
  # The monitoring library automatically handles:
  # - Converting gRPC endpoint (4317) to HTTP endpoint (4318) for logs
  # - Adding proper paths (/v1/traces, /v1/metrics, /v1/logs)
  # - Protocol prefixes (http:// for HTTP, none for gRPC)
  #
  # Base OTLP endpoint (gRPC format - used by traces and metrics)
  OTEL_EXPORTER_OTLP_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
  # Protocol configuration (gRPC is recommended for better performance)
  OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
  # Optional: Signal-specific endpoint overrides (if different from base)
  # OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
  # OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
  # OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
  # Optional: Protocol overrides per signal
  # OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: "grpc"
  # OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: "grpc"
  # Note: Logs always use HTTP protocol regardless of this setting
  # Resource attributes (added to all telemetry signals)
  OTEL_SERVICE_NAME: "bakery-ia"
  OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
  # SigNoz service endpoints (for UI and API access)
  SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
  SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
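As a rough illustration of the gRPC-to-HTTP conversion the comments above describe, a self-contained sketch of deriving the OTLP/HTTP logs URL from the base gRPC endpoint (the function name is hypothetical; the real logic lives in the shared monitoring library):

# Hypothetical sketch of the gRPC -> HTTP endpoint conversion for logs.
def logs_http_endpoint(base_grpc_endpoint: str) -> str:
    """Derive the OTLP/HTTP logs URL: 'host:4317' -> 'http://host:4318/v1/logs'."""
    host, _, port = base_grpc_endpoint.rpartition(":")
    http_port = "4318" if port == "4317" else port
    return f"http://{host}:{http_port}/v1/logs"

assert logs_http_endpoint(
    "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
) == "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318/v1/logs"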

View File

@@ -1,104 +1,170 @@
{
  "description": "Alert monitoring and management dashboard",
  "tags": ["alerts", "monitoring", "management"],
  "name": "bakery-ia-alert-management",
  "title": "Bakery IA - Alert Management",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-alerts-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "active-alerts", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "alert-rate", "moved": false, "static": false}
  ],
  "variables": {
    "service": {
      "id": "service-var",
      "name": "service",
      "description": "Filter by service name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'alerts_active' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "active-alerts",
      "title": "Active Alerts",
      "description": "Number of currently active alerts",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "value",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "alerts_active", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [],
              "legend": "Active Alerts",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "alert-rate",
      "title": "Alert Rate",
      "description": "Rate of alerts over time",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "alerts_total", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "alerts/s"
    }
  ]
}
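Every entry in "layout" must reference an existing widget id and vice versa, or the dashboard renders with orphaned panels. A small stdlib-only sketch that catches such mismatches before import (the file name is an assumption):

# Sanity-check a SigNoz dashboard JSON: layout item ids and widget ids must match.
import json

with open("alert-management.json") as f:  # hypothetical file name
    dashboard = json.load(f)

layout_ids = {item["i"] for item in dashboard["layout"]}
widget_ids = {widget["id"] for widget in dashboard["widgets"]}

if layout_ids != widget_ids:
    raise SystemExit(f"mismatch: layout-only={layout_ids - widget_ids}, "
                     f"widget-only={widget_ids - layout_ids}")
print(f"OK: {len(widget_ids)} widgets, all placed in the layout")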

View File

@@ -1,102 +1,351 @@
{
  "description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
  "tags": ["api", "performance", "rest", "graphql"],
  "name": "bakery-ia-api-performance",
  "title": "Bakery IA - API Performance",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-api-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "request-volume", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "avg-response-time", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "p95-latency", "moved": false, "static": false}
  ],
  "variables": {
    "service": {
      "id": "service-var",
      "name": "service",
      "description": "Filter by API service",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'http_server_requests_seconds_count' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "request-volume",
      "title": "Request Volume",
      "description": "API request volume by service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{api.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "req/s"
    },
    {
      "id": "error-rate",
      "title": "Error Rate",
      "description": "API error rate by service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.api}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "5.."}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false},
                {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{api.name}} - {{status_code}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "req/s"
    },
    {
      "id": "avg-response-time",
      "title": "Average Response Time",
      "description": "Average API response time by endpoint",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "http_server_requests_seconds_sum", "dataType": "float64", "type": "Counter", "isColumn": false},
              "timeAggregation": "avg",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.api}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false},
                {"key": "endpoint", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{api.name}} - {{endpoint}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "seconds"
    },
    {
      "id": "p95-latency",
      "title": "P95 Latency",
      "description": "95th percentile latency by endpoint",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "histogram_quantile",
              "aggregateAttribute": {"key": "http_server_requests_seconds_bucket", "dataType": "float64", "type": "Histogram", "isColumn": false},
              "timeAggregation": "avg",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.api}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "api.name", "dataType": "string", "type": "resource", "isColumn": false},
                {"key": "endpoint", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{api.name}} - {{endpoint}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "seconds"
    }
  ]
}
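The P95 panel relies on histogram_quantile over cumulative _bucket series. A minimal sketch of the underlying interpolation, with made-up sample buckets (a simplification: the trailing +Inf bucket that Prometheus-style histograms carry is not modeled here):

# Minimal histogram_quantile-style estimate from cumulative buckets.
# Buckets are (upper_bound_seconds, cumulative_count); data is illustrative.
def histogram_quantile(q: float, buckets: list[tuple[float, float]]) -> float:
    total = buckets[-1][1]
    rank = q * total
    prev_bound, prev_count = 0.0, 0.0
    for bound, count in buckets:
        if count >= rank:
            # Linear interpolation inside the bucket, as Prometheus does.
            fraction = (rank - prev_count) / max(count - prev_count, 1e-9)
            return prev_bound + (bound - prev_bound) * fraction
        prev_bound, prev_count = bound, count
    return buckets[-1][0]

samples = [(0.05, 120.0), (0.1, 340.0), (0.25, 900.0), (0.5, 980.0), (1.0, 1000.0)]
print(f"p95 ~= {histogram_quantile(0.95, samples):.3f}s")  # ~0.406s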

View File

@@ -1,101 +1,333 @@
{
  "description": "Application performance monitoring dashboard using distributed traces and metrics",
  "tags": ["application", "performance", "traces", "apm"],
  "name": "bakery-ia-application-performance",
  "title": "Bakery IA - Application Performance (APM)",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-apm-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "latency-p99", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "request-rate", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "avg-duration", "moved": false, "static": false}
  ],
  "variables": {
    "service_name": {
      "id": "service-var",
      "name": "service_name",
      "description": "Filter by service name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(serviceName) FROM signoz_traces.distributed_signoz_index_v2 ORDER BY serviceName",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "latency-p99",
      "title": "P99 Latency",
      "description": "99th percentile latency for selected service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "p99",
              "aggregateAttribute": {"key": "duration_ns", "dataType": "float64", "type": "", "isColumn": true},
              "timeAggregation": "avg",
              "spaceAggregation": "p99",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "ms"
    },
    {
      "id": "request-rate",
      "title": "Request Rate",
      "description": "Requests per second for the service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "count",
              "aggregateAttribute": {"key": "", "dataType": "", "type": "", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "reqps"
    },
    {
      "id": "error-rate",
      "title": "Error Rate",
      "description": "Error rate percentage for the service",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "count",
              "aggregateAttribute": {"key": "", "dataType": "", "type": "", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "STATUS_CODE_ERROR"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "reqps"
    },
    {
      "id": "avg-duration",
      "title": "Average Duration",
      "description": "Average request duration",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "traces",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "duration_ns", "dataType": "float64", "type": "", "isColumn": true},
              "timeAggregation": "avg",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}, "op": "=", "value": "{{.service_name}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "ms"
    }
  ]
}
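The error-rate widget counts spans whose status_code is STATUS_CODE_ERROR. For reference, a sketch of how a service span ends up in that bucket with the OpenTelemetry Python SDK (the span name, attribute, and failure are illustrative, not taken from this codebase):

# Spans marked with StatusCode.ERROR are what the APM error-rate panel counts.
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode

tracer = trace.get_tracer("bakery-ia.example")  # illustrative instrumentation name

def process_order(order_id: str) -> None:
    with tracer.start_as_current_span("process_order") as span:
        span.set_attribute("order.id", order_id)
        try:
            raise ValueError("out of flour")  # simulated failure
        except ValueError as exc:
            span.record_exception(exc)
            span.set_status(Status(StatusCode.ERROR, str(exc)))
            raise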

View File

@@ -1,101 +1,425 @@
{
  "description": "Comprehensive database performance monitoring for PostgreSQL, Redis, and RabbitMQ",
  "tags": ["database", "postgresql", "redis", "rabbitmq", "performance"],
  "name": "bakery-ia-database-performance",
  "title": "Bakery IA - Database Performance",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-db-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "pg-connections", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "pg-db-size", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "redis-connected-clients", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "redis-memory", "moved": false, "static": false},
    {"x": 0, "y": 6, "w": 6, "h": 3, "i": "rabbitmq-messages", "moved": false, "static": false},
    {"x": 6, "y": 6, "w": 6, "h": 3, "i": "rabbitmq-consumers", "moved": false, "static": false}
  ],
  "variables": {
    "database": {
      "id": "database-var",
      "name": "database",
      "description": "Filter by PostgreSQL database name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['postgresql.database.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "pg-connections",
      "title": "PostgreSQL - Active Connections",
      "description": "Number of active PostgreSQL connections",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "postgresql.backends", "dataType": "float64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.database}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{postgresql.database.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "pg-db-size",
      "title": "PostgreSQL - Database Size",
      "description": "Size of PostgreSQL databases in bytes",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "postgresql.db_size", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.database}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "postgresql.database.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{postgresql.database.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "bytes"
    },
    {
      "id": "redis-connected-clients",
      "title": "Redis - Connected Clients",
      "description": "Number of clients connected to Redis",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "redis.clients.connected", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "host.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{host.name}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "redis-memory",
      "title": "Redis - Memory Usage",
      "description": "Redis memory usage in bytes",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "redis.memory.used", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "host.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{host.name}}",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "bytes"
    },
    {
      "id": "rabbitmq-messages",
      "title": "RabbitMQ - Current Messages",
      "description": "Number of messages currently in RabbitMQ queues",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "rabbitmq.message.current", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "queue", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "Queue: {{queue}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "rabbitmq-consumers",
      "title": "RabbitMQ - Consumer Count",
      "description": "Number of consumers per queue",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "rabbitmq.consumer.count", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "queue", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "Queue: {{queue}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    }
  ]
}
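The database variable above is populated by a ClickHouse query against the SigNoz metrics tables. A rough sketch of running the same query by hand, assuming the clickhouse-driver package and the connection details implied by the collector DSNs earlier in this commit (both are assumptions):

# Assumption: clickhouse-driver is installed and the SigNoz ClickHouse
# instance is reachable with the credentials used by the collector exporters.
from clickhouse_driver import Client

client = Client(host="signoz-clickhouse", port=9000, user="admin",
                password="...")  # credential deliberately elided here
rows = client.execute(
    "SELECT DISTINCT(resource_attrs['postgresql.database.name']) AS value "
    "FROM signoz_metrics.distributed_time_series_v4_1day "
    "WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value"
)
print([r[0] for r in rows])  # databases currently reporting postgresql.db_size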

View File

@@ -1,105 +1,348 @@
{
  "description": "Comprehensive error tracking and analysis dashboard",
  "tags": ["errors", "exceptions", "tracking"],
  "name": "bakery-ia-error-tracking",
  "title": "Bakery IA - Error Tracking",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-errors-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "total-errors", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "error-rate", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "http-5xx", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "http-4xx", "moved": false, "static": false}
  ],
  "variables": {
    "service": {
      "id": "service-var",
      "name": "service",
      "description": "Filter by service name",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'error_total' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": null
    }
  },
  "widgets": [
    {
      "id": "total-errors",
      "title": "Total Errors",
      "description": "Total number of errors across all services",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "value",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "error_total", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "sum",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [],
              "legend": "Total Errors",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "error-rate",
      "title": "Error Rate",
      "description": "Error rate over time",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "error_total", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "rate",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true}
              ],
              "legend": "{{serviceName}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "errors/s"
    },
    {
      "id": "http-5xx",
      "title": "HTTP 5xx Errors",
      "description": "Server errors (5xx status codes)",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "sum",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "5.."}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true},
                {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{serviceName}} - {{status_code}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "number"
    },
    {
      "id": "http-4xx",
      "title": "HTTP 4xx Errors",
      "description": "Client errors (4xx status codes)",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "http_server_requests_seconds_count", "dataType": "int64", "type": "Counter", "isColumn": false},
              "timeAggregation": "sum",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "service.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.service}}"},
                  {"key": {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}, "op": "=~", "value": "4.."}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "serviceName", "dataType": "string", "type": "tag", "isColumn": true},
                {"key": "status_code", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{serviceName}} - {{status_code}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "number"
    }
  ]
}
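Both error panels aggregate a monotonic counter; with timeAggregation "rate" the charted value is the per-second increase between adjacent samples. A tiny sketch of that computation, including the usual counter-reset guard (the sample values are made up):

# Per-second rate from two samples of a monotonic counter such as error_total.
def counter_rate(v0: float, t0: float, v1: float, t1: float) -> float:
    delta = v1 - v0
    if delta < 0:  # counter reset (e.g. pod restart): treat v1 as the increase
        delta = v1
    return delta / (t1 - t0)

print(counter_rate(v0=1200, t0=0, v1=1260, t1=60))  # 1.0 errors/s
print(counter_rate(v0=1200, t0=0, v1=30, t1=60))    # after a reset: 0.5 errors/s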

View File

@@ -1,105 +1,423 @@
{
  "description": "Comprehensive infrastructure monitoring dashboard for Bakery IA Kubernetes cluster",
  "tags": ["infrastructure", "kubernetes", "k8s", "system"],
  "name": "bakery-ia-infrastructure-monitoring",
  "title": "Bakery IA - Infrastructure Monitoring",
  "uploadedGrafana": false,
  "uuid": "bakery-ia-infra-01",
  "version": "v4",
  "collapsableRowsMigrated": true,
  "layout": [
    {"x": 0, "y": 0, "w": 6, "h": 3, "i": "pod-count", "moved": false, "static": false},
    {"x": 6, "y": 0, "w": 6, "h": 3, "i": "pod-phase", "moved": false, "static": false},
    {"x": 0, "y": 3, "w": 6, "h": 3, "i": "container-restarts", "moved": false, "static": false},
    {"x": 6, "y": 3, "w": 6, "h": 3, "i": "node-condition", "moved": false, "static": false},
    {"x": 0, "y": 6, "w": 12, "h": 3, "i": "deployment-status", "moved": false, "static": false}
  ],
  "variables": {
    "namespace": {
      "id": "namespace-var",
      "name": "namespace",
      "description": "Filter by Kubernetes namespace",
      "type": "QUERY",
      "queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'k8s.pod.phase' AND value != '' ORDER BY value",
      "customValue": "",
      "textboxValue": "",
      "showALLOption": true,
      "multiSelect": false,
      "order": 1,
      "modificationUUID": "",
      "sort": "ASC",
      "selectedValue": "bakery-ia"
    }
  },
  "widgets": [
    {
      "id": "pod-count",
      "title": "Total Pods",
      "description": "Total number of pods in the namespace",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "value",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "count",
              "aggregateAttribute": {"key": "k8s.pod.phase", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [],
              "legend": "Total Pods",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "pod-phase",
      "title": "Pod Phase Distribution",
      "description": "Pods by phase (Running, Pending, Failed, etc.)",
      "isStacked": true,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "k8s.pod.phase", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "phase", "dataType": "string", "type": "tag", "isColumn": false}
              ],
              "legend": "{{phase}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "container-restarts",
      "title": "Container Restarts",
      "description": "Container restart count over time",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "k8s.container.restarts", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "increase",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.pod.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.pod.name}}",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "node-condition",
      "title": "Node Conditions",
      "description": "Node condition status (Ready, MemoryPressure, DiskPressure, etc.)",
      "isStacked": true,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "sum",
              "aggregateAttribute": {"key": "k8s.node.condition_ready", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "sum",
              "functions": [],
              "filters": {"items": [], "op": "AND"},
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.node.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.node.name}} Ready",
              "reduceTo": "sum"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    },
    {
      "id": "deployment-status",
      "title": "Deployment Status (Desired vs Available)",
      "description": "Deployment replicas: desired vs available",
      "isStacked": false,
      "nullZeroValues": "zero",
      "opacity": "1",
      "panelTypes": "graph",
      "query": {
        "builder": {
          "queryData": [
            {
              "dataSource": "metrics",
              "queryName": "A",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "k8s.deployment.desired", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "A",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.deployment.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.deployment.name}} (desired)",
              "reduceTo": "avg"
            },
            {
              "dataSource": "metrics",
              "queryName": "B",
              "aggregateOperator": "avg",
              "aggregateAttribute": {"key": "k8s.deployment.available", "dataType": "int64", "type": "Gauge", "isColumn": false},
              "timeAggregation": "latest",
              "spaceAggregation": "avg",
              "functions": [],
              "filters": {
                "items": [
                  {"key": {"key": "k8s.namespace.name", "dataType": "string", "type": "resource", "isColumn": false}, "op": "=", "value": "{{.namespace}}"}
                ],
                "op": "AND"
              },
              "expression": "B",
              "disabled": false,
              "having": [],
              "stepInterval": 60,
              "limit": null,
              "orderBy": [],
              "groupBy": [
                {"key": "k8s.deployment.name", "dataType": "string", "type": "resource", "isColumn": false}
              ],
              "legend": "{{k8s.deployment.name}} (available)",
              "reduceTo": "avg"
            }
          ],
          "queryFormulas": []
        },
        "queryType": "builder"
      },
      "fillSpans": false,
      "yAxisUnit": "none"
    }
  ]
}
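The k8s.pod.phase gauge emitted by the k8s_cluster receiver encodes the pod phase as an integer. A small decoding sketch; the 1-5 mapping follows the receiver's documented encoding and is worth re-checking against the deployed collector version:

# k8s_cluster receiver encodes k8s.pod.phase as an integer gauge.
POD_PHASE = {1: "Pending", 2: "Running", 3: "Succeeded", 4: "Failed", 5: "Unknown"}

def decode_phase(value: int) -> str:
    return POD_PHASE.get(value, f"unrecognized ({value})")

print(decode_phase(2))  # Running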

View File

@@ -1,99 +1,333 @@
{ {
"dashboard": {
"title": "Bakery IA - Log Analysis",
"description": "Comprehensive log analysis and search dashboard", "description": "Comprehensive log analysis and search dashboard",
"tags": ["logs", "analysis", "search"], "tags": ["logs", "analysis", "search"],
"panels": [ "name": "bakery-ia-log-analysis",
"title": "Bakery IA - Log Analysis",
"uploadedGrafana": false,
"uuid": "bakery-ia-logs-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{ {
"title": "Log Volume", "x": 0,
"type": "timeseries", "y": 0,
"query": { "w": 6,
"metric": "log_lines_total", "h": 3,
"aggregate": "sum", "i": "log-volume",
"groupBy": ["service"], "moved": false,
"filters": [ "static": false
},
{ {
"key": "service", "x": 6,
"operator": "=", "y": 0,
"value": "${service}" "w": 6,
"h": 3,
"i": "error-logs",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-level",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-service",
"moved": false,
"static": false
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'log_lines_total' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "log-volume",
"title": "Log Volume",
"description": "Total log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": { "query": {
"metric": "log_lines_total", "builder": {
"aggregate": "sum", "queryData": [
"groupBy": ["service"],
"filters": [
{ {
"key": "service", "dataSource": "metrics",
"operator": "=", "queryName": "A",
"value": "${service}" "aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "error-logs",
"title": "Error Logs",
"description": "Error log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "level", "key": "level",
"operator": "=", "dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=",
"value": "error" "value": "error"
} }
] ],
"op": "AND"
}, },
"unit": "logs/s" "expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}} (errors)",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "logs-by-level",
"title": "Logs by Level",
"description": "Distribution of logs by severity level",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
} }
] ],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "level",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{level}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "logs-by-service",
"title": "Logs by Service",
"description": "Distribution of logs by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
]
}

View File

@@ -1,92 +1,295 @@
{
"description": "Comprehensive system health monitoring dashboard",
"tags": ["system", "health", "monitoring"],
"name": "bakery-ia-system-health",
"title": "Bakery IA - System Health",
"uploadedGrafana": false,
"uuid": "bakery-ia-health-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "system-availability",
"moved": false,
"static": false
},
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "health-score",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "cpu-usage",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "memory-usage",
"moved": false,
"static": false
}
],
"variables": {
"namespace": {
"id": "namespace-var",
"name": "namespace",
"description": "Filter by Kubernetes namespace",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'system_availability' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": "bakery-ia"
}
},
"widgets": [
{
"id": "system-availability",
"title": "System Availability",
"description": "Overall system availability percentage",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_availability",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "System Availability",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "health-score",
"title": "Service Health Score",
"description": "Overall service health score",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "service_health_score",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Health Score",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "cpu-usage",
"title": "CPU Usage",
"description": "System CPU usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_cpu_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "CPU Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "memory-usage",
"title": "Memory Usage",
"description": "System memory usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_memory_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Memory Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
}
]
}

View File

@@ -1,96 +1,323 @@
{
"description": "User activity and behavior monitoring dashboard",
"tags": ["user", "activity", "behavior"],
"name": "bakery-ia-user-activity",
"title": "Bakery IA - User Activity",
"uploadedGrafana": false,
"uuid": "bakery-ia-user-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "active-users",
"moved": false,
"static": false
},
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "user-sessions",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "api-calls-per-user",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "session-duration",
"moved": false,
"static": false
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'active_users' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "active-users",
"title": "Active Users",
"description": "Number of active users by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "active_users",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{service.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "user-sessions",
"title": "User Sessions",
"description": "Total user sessions by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "user_sessions_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "api-calls-per-user",
"title": "API Calls per User",
"description": "Average API calls per user by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "api_calls_per_user",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "session-duration",
"title": "Session Duration",
"description": "Average session duration by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "session_duration_seconds",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
}
]
}
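The three dashboards above share one convention: every builder-query filter references a dashboard variable through a {{.name}} placeholder, and each placeholder needs a matching entry in the top-level "variables" map. A small check script can catch drift between the two. This is not part of the commit; it is an illustrative sketch, and the file names are hypothetical.

import json
import re
import sys

# Hypothetical file names; the repository layout for these dashboards is not shown here.
DASHBOARDS = [
    "log-analysis.json",
    "system-health.json",
    "user-activity.json",
]

# Matches template placeholders such as {{.namespace}} or {{.service}}
placeholder = re.compile(r"\{\{\.(\w+)\}\}")

for path in DASHBOARDS:
    with open(path) as f:
        dash = json.load(f)
    declared = set(dash.get("variables", {}).keys())
    used = set(placeholder.findall(json.dumps(dash.get("widgets", []))))
    missing = used - declared
    if missing:
        sys.exit(f"{path}: placeholders without a variable definition: {sorted(missing)}")
    print(f"{path}: OK ({sorted(used)})")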

View File

@@ -1,160 +1,61 @@
"""Main FastAPI application for AI Insights Service.""" """Main FastAPI application for AI Insights Service."""
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import structlog import structlog
import os
from app.core.config import settings from app.core.config import settings
from app.core.database import init_db, close_db from app.core.database import init_db, close_db
from app.api import insights from app.api import insights
from shared.monitoring.logging import setup_logging from shared.service_base import StandardFastAPIService
from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
from shared.monitoring.system_metrics import SystemMetricsCollector
# OpenTelemetry imports # Initialize logger
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.sdk.resources import Resource
# Configure OpenTelemetry tracing
def setup_tracing(service_name: str = "ai-insights"):
"""Initialize OpenTelemetry tracing with OTLP exporter for Jaeger"""
resource = Resource.create({"service.name": service_name})
otlp_exporter = OTLPSpanExporter(
endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"),
insecure=True
)
provider = TracerProvider(resource=resource)
processor = BatchSpanProcessor(otlp_exporter)
provider.add_span_processor(processor)
trace.set_tracer_provider(provider)
return provider
# Initialize tracing
tracer_provider = setup_tracing("ai-insights")
# Setup logging
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger() logger = structlog.get_logger()
# Setup OpenTelemetry logging export if enabled
logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}")
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
try:
logger.info("Attempting to setup OpenTelemetry logging")
from shared.monitoring.logs_exporter import setup_otel_logging
result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION)
if result:
logger.info("OpenTelemetry logs export enabled for ai-insights")
else:
logger.warning("OpenTelemetry logs export setup returned None")
except Exception as e:
logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
class AIInsightsService(StandardFastAPIService):
"""AI Insights Service with standardized monitoring setup"""
@asynccontextmanager async def on_startup(self, app):
async def lifespan(app: FastAPI): """Custom startup logic for AI Insights"""
"""Lifespan event handler for startup and shutdown.""" # Initialize database
# Startup
logger.info("Starting AI Insights Service", service=settings.SERVICE_NAME, version=settings.SERVICE_VERSION)
await init_db() await init_db()
logger.info("Database initialized") logger.info("Database initialized")
# Initialize system metrics collection await super().on_startup(app)
system_metrics = SystemMetricsCollector("ai-insights")
logger.info("System metrics collection started")
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed async def on_shutdown(self, app):
logger.info("Metrics export configured via OpenTelemetry OTLP") """Custom shutdown logic for AI Insights"""
await super().on_shutdown(app)
yield # Close database
# Shutdown
logger.info("Shutting down AI Insights Service")
await close_db() await close_db()
logger.info("Database connections closed") logger.info("Database connections closed")
# Create FastAPI app # Create service instance
app = FastAPI( service = AIInsightsService(
title="AI Insights Service", service_name="ai-insights",
app_name="AI Insights Service",
description="Intelligent insights and recommendations for bakery operations", description="Intelligent insights and recommendations for bakery operations",
version=settings.SERVICE_VERSION, version=settings.SERVICE_VERSION,
lifespan=lifespan log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
cors_origins=getattr(settings, 'ALLOWED_ORIGINS', ["*"]),
api_prefix=settings.API_V1_PREFIX,
enable_metrics=True,
enable_health_checks=True,
enable_tracing=True,
enable_cors=True
) )
# Instrument FastAPI with OpenTelemetry # Create FastAPI app
FastAPIInstrumentor.instrument_app(app) app = service.create_app()
# Instrument httpx for outgoing requests # Add service-specific routers
HTTPXClientInstrumentor().instrument() service.add_router(
# Instrument Redis
RedisInstrumentor().instrument()
# Instrument SQLAlchemy
SQLAlchemyInstrumentor().instrument()
# Initialize metrics collector
metrics_collector = MetricsCollector("ai-insights")
# Add metrics middleware to track HTTP requests
add_metrics_middleware(app, metrics_collector)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=settings.ALLOWED_ORIGINS,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(
insights.router, insights.router,
prefix=settings.API_V1_PREFIX,
tags=["insights"] tags=["insights"]
) )
@app.get("/")
async def root():
"""Root endpoint."""
return {
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION,
"status": "running"
}
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {
"status": "healthy",
"service": settings.SERVICE_NAME,
"version": settings.SERVICE_VERSION
}
# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz
# The /metrics endpoint is not needed as metrics are pushed automatically
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn
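shared.service_base.StandardFastAPIService itself is not shown in this commit. From the call sites above, a minimal sketch of the contract these services rely on could look as follows; the constructor keywords and the hook ordering are inferred from usage and should be treated as assumptions, not the actual implementation.

from contextlib import asynccontextmanager

from fastapi import FastAPI


class StandardFastAPIService:
    """Sketch of the assumed base class: it owns logging/tracing/metrics setup
    and exposes on_startup/on_shutdown hooks plus create_app()/add_router()."""

    def __init__(self, service_name, app_name, description, version, **options):
        self.service_name = service_name
        self.app_name = app_name
        self.description = description
        self.version = version
        # e.g. log_level, cors_origins, api_prefix, enable_tracing, ...
        self.options = options
        self.app = None

    async def on_startup(self, app):
        # The real class presumably starts telemetry, system metrics, health checks.
        pass

    async def on_shutdown(self, app):
        pass

    def create_app(self, **fastapi_kwargs) -> FastAPI:
        @asynccontextmanager
        async def lifespan(app: FastAPI):
            await self.on_startup(app)
            yield
            await self.on_shutdown(app)

        self.app = FastAPI(
            title=self.app_name,
            description=self.description,
            version=self.version,
            lifespan=lifespan,
            **fastapi_kwargs,
        )
        return self.app

    def add_router(self, router, **kwargs):
        # Assumed to apply the configured API prefix for the service.
        self.app.include_router(router, prefix=self.options.get("api_prefix", ""), **kwargs)

Subclasses override the hooks and call super(), which is why startup order matters: service-specific resources (database, Redis, consumers) come up before the shared monitoring hook, and are torn down after it.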

View File

@@ -4,90 +4,28 @@ Alert Processor Service v2.0
Main FastAPI application with RabbitMQ consumer lifecycle management.
"""
import structlog

from app.core.config import settings
from app.consumer.event_consumer import EventConsumer
from app.api import alerts, sse
from shared.redis_utils import initialize_redis, close_redis
from shared.service_base import StandardFastAPIService

# Initialize logger
logger = structlog.get_logger()

# Global consumer instance
consumer: EventConsumer = None


class AlertProcessorService(StandardFastAPIService):
    """Alert Processor Service with standardized monitoring setup and RabbitMQ consumer"""

    async def on_startup(self, app):
        """Custom startup logic for Alert Processor"""
        global consumer

        # Initialize Redis connection
        await initialize_redis(
            settings.REDIS_URL,
@@ -96,69 +34,48 @@ async def lifespan(app: FastAPI):
        )
        logger.info("redis_initialized")

        # Start RabbitMQ consumer
        consumer = EventConsumer()
        await consumer.start()
        logger.info("rabbitmq_consumer_started")

        await super().on_startup(app)

    async def on_shutdown(self, app):
        """Custom shutdown logic for Alert Processor"""
        global consumer

        await super().on_shutdown(app)

        # Stop RabbitMQ consumer
        if consumer:
            await consumer.stop()
            logger.info("rabbitmq_consumer_stopped")

        # Close Redis
        await close_redis()
        logger.info("redis_closed")


# Create service instance
service = AlertProcessorService(
    service_name="alert-processor",
    app_name="Alert Processor Service",
    description="Event processing, enrichment, and alert management system",
    version=settings.VERSION,
    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
    cors_origins=["*"],  # Configure appropriately for production
    api_prefix="/api/v1",
    enable_metrics=True,
    enable_health_checks=True,
    enable_tracing=True,
    enable_cors=True
)

# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)

# Add service-specific routers
app.include_router(
    alerts.router,
    prefix="/api/v1/tenants/{tenant_id}",
@@ -172,34 +89,6 @@ app.include_router(
)

if __name__ == "__main__":
    import uvicorn

View File

@@ -3,192 +3,74 @@ Demo Session Service - Main Application
Manages isolated demo sessions with ephemeral data
"""
import structlog

from app.core import settings, DatabaseManager
from app.api import demo_sessions, demo_accounts, demo_operations, internal
from shared.redis_utils import initialize_redis, close_redis
from shared.service_base import StandardFastAPIService

# Initialize logger
logger = structlog.get_logger()

# Initialize database manager
db_manager = DatabaseManager()


class DemoSessionService(StandardFastAPIService):
    """Demo Session Service with standardized monitoring setup"""

    async def on_startup(self, app):
        """Custom startup logic for Demo Session"""
        # Initialize database
        db_manager.initialize()

        # Initialize Redis
        await initialize_redis(
            redis_url=settings.REDIS_URL,
            db=0,
            max_connections=50
        )

        await super().on_startup(app)

    async def on_shutdown(self, app):
        """Custom shutdown logic for Demo Session"""
        await super().on_shutdown(app)

        # Cleanup
        await db_manager.close()
        await close_redis()


# Create service instance
service = DemoSessionService(
    service_name="demo-session",
    app_name="Demo Session Service",
    description="Manages isolated demo sessions for prospect users",
    version=settings.VERSION,
    log_level=getattr(settings, 'LOG_LEVEL', 'INFO'),
    cors_origins=["*"],  # Configure appropriately for production
    api_prefix="/api/v1",
    enable_metrics=True,
    enable_health_checks=True,
    enable_tracing=True,
    enable_cors=True
)

# Create FastAPI app
app = service.create_app(debug=settings.DEBUG)

# Add service-specific routers
app.include_router(demo_sessions.router)
app.include_router(demo_accounts.router)
app.include_router(demo_operations.router)
app.include_router(internal.router)

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(

View File

@@ -1,14 +1,34 @@
""" """
Shared monitoring package for microservices Shared monitoring package for microservices
Provides unified OpenTelemetry-based observability:
- Traces: Distributed tracing
- Metrics: System and application metrics
- Logs: Structured logging
All signals exported to SigNoz via OTLP.
""" """
# Core setup - START HERE
from .logging import setup_logging from .logging import setup_logging
from .metrics import setup_metrics_early, get_metrics_collector, MetricsCollector from .telemetry import (
from .health_checks import ( setup_telemetry,
HealthCheckManager, setup_telemetry_simple,
FastAPIHealthChecker, get_telemetry_status,
create_health_manager, TelemetryProviders
setup_fastapi_health_checks )
# Configuration
from .otel_config import OTelConfig, OTelEndpoints
# Individual signal setup (used by telemetry.py)
from .tracing import (
setup_tracing,
get_current_trace_id,
get_current_span_id,
add_trace_attributes,
add_trace_event,
record_exception
) )
from .logs_exporter import ( from .logs_exporter import (
setup_otel_logging, setup_otel_logging,
@@ -27,23 +47,51 @@ from .system_metrics import (
    setup_all_metrics
)

# Health checks
from .health_checks import (
    HealthCheckManager,
    FastAPIHealthChecker,
    create_health_manager,
    setup_fastapi_health_checks
)

__all__ = [
    # CORE - Start with these
    'setup_logging',
    'setup_telemetry',
    'setup_telemetry_simple',
    'get_telemetry_status',
    'TelemetryProviders',

    # Configuration
    'OTelConfig',
    'OTelEndpoints',

    # Tracing
    'setup_tracing',
    'get_current_trace_id',
    'get_current_span_id',
    'add_trace_attributes',
    'add_trace_event',
    'record_exception',

    # Logs
    'setup_otel_logging',
    'add_log_context',
    'get_current_trace_context',
    'StructlogOTELProcessor',

    # Metrics
    'setup_otel_metrics',
    'OTelMetricsCollector',
    'create_dual_metrics_collector',
    'SystemMetricsCollector',
    'ApplicationMetricsCollector',
    'setup_all_metrics',

    # Health checks
    'HealthCheckManager',
    'FastAPIHealthChecker',
    'create_health_manager',
    'setup_fastapi_health_checks',
]
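A typical consumer of the reorganized package would now go through setup_telemetry instead of wiring tracing, metrics, and logs individually. The exact signature lives in telemetry.py, which is not part of this excerpt, so the call below is a hedged sketch under that assumption.

from shared.monitoring import setup_logging, setup_telemetry

# Assumed call shape; telemetry.py defines the real parameters.
setup_logging("auth-service", "INFO")
providers = setup_telemetry(service_name="auth-service", service_version="1.0.0")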

View File

@@ -1,6 +1,6 @@
""" """
OpenTelemetry Logs Integration for SigNoz OpenTelemetry Logs Integration for SigNoz
Exports structured logs to SigNoz via OpenTelemetry Collector Exports structured logs to SigNoz via OpenTelemetry Collector using HTTP protocol
""" """
import os import os
@@ -10,14 +10,21 @@ from typing import Optional
from opentelemetry._logs import set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.resources import Resource

# Try to import HTTP log exporter (logs always use HTTP)
try:
    from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
    HTTP_LOG_EXPORTER_AVAILABLE = True
except ImportError:
    try:
        from opentelemetry.exporter.otlp.proto.http.log_exporter import OTLPLogExporter
        HTTP_LOG_EXPORTER_AVAILABLE = True
    except ImportError:
        OTLPLogExporter = None
        HTTP_LOG_EXPORTER_AVAILABLE = False

from .otel_config import OTelConfig

logger = structlog.get_logger()
@@ -31,13 +38,14 @@ def setup_otel_logging(
""" """
Setup OpenTelemetry logging to export logs to SigNoz. Setup OpenTelemetry logging to export logs to SigNoz.
This integrates with Python's standard logging to automatically Uses HTTP protocol (port 4318) for sending logs to SigNoz.
export all log records to SigNoz via the OTLP protocol. Integrates with Python's standard logging to automatically export
all log records to SigNoz via the OTLP HTTP protocol.
Args: Args:
service_name: Name of the service (e.g., "auth-service") service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service service_version: Version of the service
otel_endpoint: OpenTelemetry collector endpoint (default from env) otel_endpoint: Optional override for OTLP endpoint (HTTP format with path)
enable_console: Whether to also log to console (default: True) enable_console: Whether to also log to console (default: True)
Returns: Returns:
@@ -47,7 +55,7 @@ def setup_otel_logging(
        from shared.monitoring.logs_exporter import setup_otel_logging

        # Setup during service initialization
        handler = setup_otel_logging("auth-service", "1.0.0")

        # Now all standard logging calls will be exported to SigNoz
        import logging
@@ -56,7 +64,7 @@ def setup_otel_logging(
""" """
# Check if logging export is enabled # Check if logging export is enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp": if not OTelConfig.is_enabled("logs"):
logger.info( logger.info(
"OpenTelemetry logs export disabled", "OpenTelemetry logs export disabled",
service=service_name, service=service_name,
@@ -64,59 +72,36 @@ def setup_otel_logging(
        )
        return None

    # Check if HTTP log exporter is available
    if not HTTP_LOG_EXPORTER_AVAILABLE or OTLPLogExporter is None:
        logger.warning(
            "OpenTelemetry HTTP log exporter not available",
            service=service_name,
            reason="opentelemetry-exporter-otlp-proto-http package not installed"
        )
        return None

    try:
        # Get endpoints from centralized config
        endpoints = OTelConfig.get_endpoints()

        # Use provided endpoint or get from config
        if otel_endpoint:
            http_endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/logs")
        else:
            http_endpoint = endpoints.logs_http

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure logger provider
        logger_provider = LoggerProvider(resource=resource)
        set_logger_provider(logger_provider)

        # Configure OTLP HTTP exporter for logs
        otlp_exporter = OTLPLogExporter(
            endpoint=http_endpoint,
            timeout=10
        )
@@ -135,9 +120,10 @@ def setup_otel_logging(
        root_logger.addHandler(otel_handler)

        logger.info(
            "OpenTelemetry logs export configured successfully",
            service=service_name,
            http_endpoint=http_endpoint,
            protocol="http",
            console_logging=enable_console
        )
@@ -147,8 +133,7 @@ def setup_otel_logging(
        logger.error(
            "Failed to setup OpenTelemetry logs export",
            service=service_name,
            error=str(e)
        )
        return None
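Putting the pieces together, a service enables log export roughly as follows. Whether OTelConfig.is_enabled("logs") still keys off OTEL_LOGS_EXPORTER=otlp is an assumption carried over from the previous version of this module.

import logging

from shared.monitoring.logs_exporter import setup_otel_logging

# Gated by OTelConfig.is_enabled("logs"); assumed to mean OTEL_LOGS_EXPORTER=otlp.
handler = setup_otel_logging("auth-service", "1.0.0")
if handler is not None:
    logging.getLogger(__name__).info("this record is exported to SigNoz over HTTP")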

View File

@@ -1,6 +1,6 @@
""" """
OpenTelemetry Metrics Integration for SigNoz OpenTelemetry Metrics Integration for SigNoz
Exports metrics to SigNoz via OpenTelemetry Collector in addition to Prometheus Exports metrics to SigNoz via OpenTelemetry Collector using gRPC protocol
""" """
import os import os
@@ -9,8 +9,24 @@ from typing import Optional
from opentelemetry import metrics
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import Resource

# Import both gRPC and HTTP exporters
try:
    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GrpcMetricExporter
    GRPC_AVAILABLE = True
except ImportError:
    GRPC_AVAILABLE = False
    GrpcMetricExporter = None

try:
    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HttpMetricExporter
    HTTP_AVAILABLE = True
except ImportError:
    HTTP_AVAILABLE = False
    HttpMetricExporter = None

from .otel_config import OTelConfig

logger = structlog.get_logger()
@@ -19,20 +35,21 @@ def setup_otel_metrics(
    service_name: str,
    service_version: str = "1.0.0",
    otel_endpoint: Optional[str] = None,
    export_interval_millis: int = 60000,  # Export every 60 seconds
    protocol: Optional[str] = None  # "grpc" or "http", defaults to grpc
) -> Optional[MeterProvider]:
    """
    Setup OpenTelemetry metrics to export to SigNoz.

    Supports both gRPC (recommended, port 4317) and HTTP (port 4318) protocols.
    Default protocol is gRPC for better performance.

    Args:
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
        otel_endpoint: Optional override for OTLP endpoint
        export_interval_millis: How often to push metrics in milliseconds (default 60s)
        protocol: Protocol to use ("grpc" or "http"). Defaults to "grpc"

    Returns:
        MeterProvider instance if successful, None otherwise
@@ -40,9 +57,12 @@ def setup_otel_metrics(
    Example:
        from shared.monitoring.metrics_exporter import setup_otel_metrics

        # Setup with gRPC (default)
        meter_provider = setup_otel_metrics("auth-service", "1.0.0")

        # Or with HTTP
        meter_provider = setup_otel_metrics("auth-service", "1.0.0", protocol="http")

        # Create meters for your metrics
        meter = meter_provider.get_meter(__name__)
        request_counter = meter.create_counter(
@@ -56,8 +76,7 @@ def setup_otel_metrics(
""" """
# Check if metrics export is enabled # Check if metrics export is enabled
enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true" if not OTelConfig.is_enabled("metrics"):
if not enable_otel_metrics:
logger.info( logger.info(
"OpenTelemetry metrics export disabled", "OpenTelemetry metrics export disabled",
service=service_name, service=service_name,
@@ -65,30 +84,64 @@ def setup_otel_metrics(
        )
        return None

    # Determine protocol to use
    if protocol is None:
        protocol = OTelConfig.get_protocol("metrics")

    # Validate protocol is available
    if protocol == "grpc" and not GRPC_AVAILABLE:
        logger.warning(
            "gRPC exporter not available, falling back to HTTP",
            service=service_name
        )
        protocol = "http"
    elif protocol == "http" and not HTTP_AVAILABLE:
        logger.warning(
            "HTTP exporter not available, falling back to gRPC",
            service=service_name
        )
        protocol = "grpc"

    if protocol not in ["grpc", "http"]:
        logger.error(
            "Invalid protocol specified",
            service=service_name,
            protocol=protocol
        )
        return None

    try:
        # Get endpoints from centralized config
        endpoints = OTelConfig.get_endpoints()

        # Determine which endpoint to use
        if otel_endpoint:
            # User provided override
            if protocol == "grpc":
                endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
            else:
                endpoint = OTelConfig._ensure_http_endpoint(otel_endpoint, "/v1/metrics")
        else:
            # Use config-determined endpoint
            if protocol == "grpc":
                endpoint = endpoints.metrics_grpc
            else:
                endpoint = endpoints.metrics_http

        # Get resource attributes
        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
        resource = Resource(attributes=resource_attrs)

        # Configure OTLP exporter based on protocol
        if protocol == "grpc":
            otlp_exporter = GrpcMetricExporter(
                endpoint=endpoint,
                insecure=True,  # Set insecure=False in production with proper TLS
                timeout=10
            )
        else:  # http
            otlp_exporter = HttpMetricExporter(
                endpoint=endpoint,
                timeout=10
            )
@@ -108,9 +161,10 @@ def setup_otel_metrics(
        metrics.set_meter_provider(meter_provider)

        logger.info(
            "OpenTelemetry metrics export configured successfully",
            service=service_name,
            endpoint=endpoint,
            protocol=protocol,
            export_interval_seconds=export_interval_millis / 1000
        )
@@ -121,7 +175,7 @@ def setup_otel_metrics(
"Failed to setup OpenTelemetry metrics export", "Failed to setup OpenTelemetry metrics export",
service=service_name, service=service_name,
error=str(e), error=str(e),
reason="Will continue with Prometheus-only metrics" protocol=protocol
) )
return None return None
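Once the provider is installed, metrics are recorded through the standard OpenTelemetry metrics API; the instrument names below are illustrative, not taken from this codebase.

from shared.monitoring.metrics_exporter import setup_otel_metrics

meter_provider = setup_otel_metrics("gateway", "1.0.0")  # gRPC by default
if meter_provider is not None:
    meter = meter_provider.get_meter("gateway.http")
    request_counter = meter.create_counter(
        "http_requests_total",
        unit="1",
        description="Total HTTP requests handled",
    )
    # Attributes become labels on the exported series in SigNoz
    request_counter.add(1, attributes={"route": "/health", "status": "200"})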

View File

@@ -0,0 +1,286 @@
"""
Centralized OpenTelemetry Configuration
Manages OTEL endpoints and settings for traces, metrics, and logs
"""
import os
from typing import Optional, Tuple
from dataclasses import dataclass
import structlog
logger = structlog.get_logger()
@dataclass
class OTelEndpoints:
"""
Container for OpenTelemetry endpoints.
SigNoz uses different protocols for different signals:
- Traces: gRPC (port 4317)
- Metrics: gRPC (port 4317) or HTTP (port 4318)
- Logs: HTTP (port 4318)
"""
traces_grpc: str # gRPC endpoint for traces (e.g., "host:4317")
metrics_grpc: str # gRPC endpoint for metrics (e.g., "host:4317")
metrics_http: str # HTTP endpoint for metrics (e.g., "http://host:4318/v1/metrics")
logs_http: str # HTTP endpoint for logs (e.g., "http://host:4318/v1/logs")
class OTelConfig:
"""
Centralized configuration for OpenTelemetry exporters.
This class manages endpoint URLs and ensures proper protocol usage:
- gRPC endpoints: host:port (no protocol prefix)
- HTTP endpoints: http://host:port/path (with protocol and path)
"""
# Default base endpoint (can be overridden by environment variables)
DEFAULT_OTEL_COLLECTOR_HOST = "signoz-otel-collector.bakery-ia.svc.cluster.local"
DEFAULT_GRPC_PORT = 4317
DEFAULT_HTTP_PORT = 4318
@classmethod
def get_endpoints(cls) -> OTelEndpoints:
"""
Get OpenTelemetry endpoints from environment variables with proper fallbacks.
        Environment variables (signal-specific values override the base endpoint):
        1. OTEL_EXPORTER_OTLP_TRACES_ENDPOINT - Traces-specific endpoint
        2. OTEL_EXPORTER_OTLP_METRICS_ENDPOINT - Metrics-specific endpoint
        3. OTEL_EXPORTER_OTLP_LOGS_ENDPOINT - Logs-specific endpoint
        4. OTEL_EXPORTER_OTLP_ENDPOINT - Base endpoint (gRPC format: host:port)
        Note: the legacy OTEL_COLLECTOR_ENDPOINT variable (HTTP format) is superseded by the above.
Returns:
OTelEndpoints with all configured endpoints
"""
# Get base endpoint from environment
base_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
if base_endpoint:
# Clean and parse base endpoint
base_grpc = cls._clean_grpc_endpoint(base_endpoint)
base_http_host = cls._extract_host(base_endpoint)
else:
# Use default collector
base_grpc = f"{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_GRPC_PORT}"
base_http_host = f"http://{cls.DEFAULT_OTEL_COLLECTOR_HOST}:{cls.DEFAULT_HTTP_PORT}"
# Get signal-specific endpoints (or use base endpoint)
traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", base_grpc)
metrics_endpoint = os.getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT", base_grpc)
logs_endpoint = os.getenv("OTEL_EXPORTER_OTLP_LOGS_ENDPOINT")
# Build final endpoints
traces_grpc = cls._clean_grpc_endpoint(traces_endpoint)
metrics_grpc = cls._clean_grpc_endpoint(metrics_endpoint)
# For metrics HTTP, convert gRPC endpoint to HTTP if needed
metrics_http = cls._grpc_to_http_endpoint(metrics_grpc, "/v1/metrics")
# For logs, use HTTP endpoint
if logs_endpoint:
logs_http = cls._ensure_http_endpoint(logs_endpoint, "/v1/logs")
else:
logs_http = cls._grpc_to_http_endpoint(base_grpc, "/v1/logs")
endpoints = OTelEndpoints(
traces_grpc=traces_grpc,
metrics_grpc=metrics_grpc,
metrics_http=metrics_http,
logs_http=logs_http
)
logger.info(
"OpenTelemetry endpoints configured",
traces_grpc=endpoints.traces_grpc,
metrics_grpc=endpoints.metrics_grpc,
metrics_http=endpoints.metrics_http,
logs_http=endpoints.logs_http
)
return endpoints
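    # Example resolution (hypothetical environment): with only
    # OTEL_EXPORTER_OTLP_ENDPOINT=otelcol:4317 set, get_endpoints() yields:
    #   traces_grpc  = "otelcol:4317"
    #   metrics_grpc = "otelcol:4317"
    #   metrics_http = "http://otelcol:4318/v1/metrics"
    #   logs_http    = "http://otelcol:4318/v1/logs"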
@staticmethod
def _clean_grpc_endpoint(endpoint: str) -> str:
"""
Clean endpoint for gRPC usage (remove protocol, paths).
Args:
endpoint: Raw endpoint string
Returns:
Cleaned endpoint in format "host:port"
"""
# Remove protocol prefixes
endpoint = endpoint.replace("http://", "").replace("https://", "")
# Remove paths (gRPC doesn't use paths)
if "/" in endpoint:
endpoint = endpoint.split("/")[0]
# Ensure it has a port
if ":" not in endpoint:
endpoint = f"{endpoint}:4317"
return endpoint
@staticmethod
def _extract_host(endpoint: str) -> str:
"""
Extract host and convert to HTTP endpoint.
Args:
endpoint: Raw endpoint string
Returns:
HTTP endpoint without path (e.g., "http://host:4318")
"""
# Remove protocol if present
clean = endpoint.replace("http://", "").replace("https://", "")
# Remove path if present
if "/" in clean:
clean = clean.split("/")[0]
# Extract host without port
if ":" in clean:
host = clean.split(":")[0]
else:
host = clean
return f"http://{host}:4318"
@staticmethod
def _grpc_to_http_endpoint(grpc_endpoint: str, path: str) -> str:
"""
Convert gRPC endpoint to HTTP endpoint with path.
Args:
grpc_endpoint: gRPC endpoint (e.g., "host:4317")
path: HTTP path (e.g., "/v1/metrics")
Returns:
HTTP endpoint (e.g., "http://host:4318/v1/metrics")
"""
# Extract host from gRPC endpoint
if ":" in grpc_endpoint:
host = grpc_endpoint.split(":")[0]
else:
host = grpc_endpoint
# Build HTTP endpoint with port 4318
return f"http://{host}:4318{path}"
@staticmethod
def _ensure_http_endpoint(endpoint: str, path: str) -> str:
"""
Ensure endpoint is in HTTP format with proper path.
Args:
endpoint: Raw endpoint string
path: Required path (e.g., "/v1/logs")
Returns:
HTTP endpoint with protocol and path
"""
# Add protocol if missing
if not endpoint.startswith(("http://", "https://")):
endpoint = f"http://{endpoint}"
# Ensure it has the correct port for HTTP
if ":4317" in endpoint:
endpoint = endpoint.replace(":4317", ":4318")
elif ":4318" not in endpoint and ":" in endpoint:
# Has a port but not the right one, replace it
parts = endpoint.split(":")
if len(parts) >= 2:
# Remove existing port and path
base = ":".join(parts[:-1])
endpoint = f"{base}:4318"
elif ":" not in endpoint.replace("http://", "").replace("https://", ""):
# No port at all, add it
endpoint = f"{endpoint}:4318"
# Ensure path is present
if not endpoint.endswith(path):
# Remove any existing path first
if "/" in endpoint.split("://")[1]:
base = endpoint.split("://")[0] + "://" + endpoint.split("://")[1].split("/")[0]
endpoint = base
endpoint = f"{endpoint}{path}"
return endpoint
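    # Worked examples for the three normalizers above (inputs are hypothetical):
    #   _clean_grpc_endpoint("http://otelcol:4317/v1/traces")  -> "otelcol:4317"
    #   _clean_grpc_endpoint("otelcol")                        -> "otelcol:4317"
    #   _grpc_to_http_endpoint("otelcol:4317", "/v1/metrics")  -> "http://otelcol:4318/v1/metrics"
    #   _ensure_http_endpoint("otelcol:4317", "/v1/logs")      -> "http://otelcol:4318/v1/logs"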
@classmethod
def get_resource_attributes(
cls,
service_name: str,
service_version: str = "1.0.0"
) -> dict:
"""
Get common resource attributes for all OTEL signals.
Args:
service_name: Name of the service
service_version: Version of the service
Returns:
Dictionary of resource attributes
"""
return {
"service.name": service_name,
"service.version": service_version,
"deployment.environment": os.getenv("ENVIRONMENT", "development"),
"k8s.namespace.name": os.getenv("K8S_NAMESPACE", "bakery-ia"),
"k8s.pod.name": os.getenv("HOSTNAME", "unknown"),
"k8s.cluster.name": os.getenv("K8S_CLUSTER_NAME", "bakery-ia-cluster"),
}
@classmethod
def is_enabled(cls, signal: str) -> bool:
"""
Check if a specific telemetry signal is enabled.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
True if signal is enabled, False otherwise
"""
signal = signal.lower()
if signal == "traces":
return os.getenv("ENABLE_TRACING", "true").lower() == "true"
elif signal == "metrics":
return os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
elif signal == "logs":
return os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp"
else:
return False
@classmethod
def get_protocol(cls, signal: str) -> str:
"""
Get the preferred protocol for a signal.
Args:
signal: One of "traces", "metrics", "logs"
Returns:
Protocol name ("grpc" or "http")
"""
protocol = os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc")
# Signal-specific overrides
if signal == "traces":
return os.getenv("OTEL_EXPORTER_OTLP_TRACES_PROTOCOL", protocol)
elif signal == "metrics":
return os.getenv("OTEL_EXPORTER_OTLP_METRICS_PROTOCOL", protocol)
elif signal == "logs":
# Logs always use HTTP in our setup
return "http"
return protocol
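# Illustrative environment, matching is_enabled()/get_protocol() above:
#   ENABLE_TRACING=true                       -> is_enabled("traces") is True
#   ENABLE_OTEL_METRICS=true                  -> is_enabled("metrics") is True
#   OTEL_LOGS_EXPORTER=otlp                   -> is_enabled("logs") is True
#   OTEL_EXPORTER_OTLP_PROTOCOL=grpc          -> default protocol for traces/metrics
#   OTEL_EXPORTER_OTLP_METRICS_PROTOCOL=http  -> get_protocol("metrics") == "http"
# Logs always report "http" regardless of protocol overrides.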

View File

@@ -0,0 +1,271 @@
"""
Unified OpenTelemetry Telemetry Setup
Provides a single entry point to configure all telemetry signals:
- Traces: Distributed tracing across services
- Metrics: OTLP metrics export + system metrics collection
- Logs: Structured logs with trace correlation
All signals are exported to SigNoz via OTLP.
"""
import os
import structlog
from typing import Optional, Dict, Any, Tuple
from dataclasses import dataclass
from .otel_config import OTelConfig
from .tracing import setup_tracing
from .metrics_exporter import setup_otel_metrics
from .logs_exporter import setup_otel_logging
from .system_metrics import setup_all_metrics, SystemMetricsCollector, ApplicationMetricsCollector
logger = structlog.get_logger()
@dataclass
class TelemetryProviders:
"""
Container for all OpenTelemetry providers and collectors.
Attributes:
tracer_provider: Provider for distributed tracing
meter_provider: Provider for metrics export
logging_handler: Handler for structured logs
system_metrics: Collector for system-level metrics (CPU, memory, disk, network)
app_metrics: Collector for application-level metrics (HTTP, DB)
"""
tracer_provider: Optional[Any] = None
meter_provider: Optional[Any] = None
logging_handler: Optional[Any] = None
system_metrics: Optional[SystemMetricsCollector] = None
app_metrics: Optional[ApplicationMetricsCollector] = None
def setup_telemetry(
app,
service_name: str,
service_version: str = "1.0.0",
enable_traces: bool = True,
enable_metrics: bool = True,
enable_logs: bool = True,
enable_system_metrics: bool = True,
metrics_protocol: Optional[str] = None, # "grpc" or "http", defaults to grpc
export_interval_millis: int = 60000
) -> TelemetryProviders:
"""
Setup all OpenTelemetry telemetry signals (traces, metrics, logs) for a service.
This is the UNIFIED setup function that configures everything:
- Distributed tracing (gRPC, port 4317)
- Metrics export (gRPC by default, port 4317)
- System metrics collection (CPU, memory, disk, network)
- Application metrics (HTTP requests, DB queries)
- Structured logs export (HTTP, port 4318)
All signals use the centralized OTelConfig for endpoint management.
Args:
app: FastAPI application instance
service_name: Name of the service (e.g., "auth-service")
service_version: Version of the service
enable_traces: Enable distributed tracing (default: True)
enable_metrics: Enable metrics export to OTLP (default: True)
enable_logs: Enable logs export to OTLP (default: True)
enable_system_metrics: Enable system metrics collection (default: True, can be disabled via ENABLE_SYSTEM_METRICS env)
metrics_protocol: Protocol for metrics ("grpc" or "http", default: "grpc")
export_interval_millis: How often to export metrics in milliseconds
Returns:
TelemetryProviders containing all initialized providers and collectors
Example:
from shared.monitoring.telemetry import setup_telemetry
app = FastAPI(title="Auth Service")
providers = setup_telemetry(
app,
service_name="auth-service",
service_version="1.0.0"
)
# All telemetry is now configured:
# - Traces automatically captured for HTTP requests
# - System metrics automatically collected
# - Application metrics via providers.app_metrics
# - Logs automatically correlated with traces
"""
logger.info(
"Setting up unified OpenTelemetry telemetry",
service=service_name,
version=service_version,
traces=enable_traces,
metrics=enable_metrics,
logs=enable_logs,
system_metrics=enable_system_metrics
)
providers = TelemetryProviders()
# Setup distributed tracing
if enable_traces and OTelConfig.is_enabled("traces"):
try:
providers.tracer_provider = setup_tracing(
app,
service_name=service_name,
service_version=service_version
)
if providers.tracer_provider:
logger.info("✓ Distributed tracing configured", service=service_name)
else:
logger.warning("✗ Distributed tracing setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup distributed tracing", service=service_name, error=str(e))
# Setup OTLP metrics export
if enable_metrics and OTelConfig.is_enabled("metrics"):
try:
providers.meter_provider = setup_otel_metrics(
service_name=service_name,
service_version=service_version,
protocol=metrics_protocol,
export_interval_millis=export_interval_millis
)
if providers.meter_provider:
logger.info("✓ OTLP metrics export configured", service=service_name)
# Setup system and application metrics collectors
if enable_system_metrics:
enable_system_env = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
if enable_system_env:
try:
providers.system_metrics, providers.app_metrics = setup_all_metrics(
service_name=service_name,
service_version=service_version,
meter_provider=providers.meter_provider
)
logger.info(
"✓ System and application metrics collectors initialized",
service=service_name,
system_metrics=["cpu", "memory", "disk", "network"],
app_metrics=["http_requests", "db_queries"]
)
except Exception as e:
logger.warning("✗ Failed to setup metrics collectors", service=service_name, error=str(e))
else:
logger.warning("✗ OTLP metrics export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup OTLP metrics export", service=service_name, error=str(e))
# Setup logs export
if enable_logs and OTelConfig.is_enabled("logs"):
try:
providers.logging_handler = setup_otel_logging(
service_name=service_name,
service_version=service_version
)
if providers.logging_handler:
logger.info("✓ Structured logs export configured", service=service_name)
else:
logger.warning("✗ Logs export setup returned None", service=service_name)
except Exception as e:
logger.error("✗ Failed to setup logs export", service=service_name, error=str(e))
# Log endpoint configuration summary
try:
endpoints = OTelConfig.get_endpoints()
summary = {
"service": service_name,
"version": service_version,
"traces": {
"enabled": bool(providers.tracer_provider),
"endpoint": endpoints.traces_grpc if providers.tracer_provider else "disabled"
},
"metrics": {
"enabled": bool(providers.meter_provider),
"endpoint": (endpoints.metrics_grpc if metrics_protocol != "http" else endpoints.metrics_http) if providers.meter_provider else "disabled",
"system_metrics": bool(providers.system_metrics),
"app_metrics": bool(providers.app_metrics)
},
"logs": {
"enabled": bool(providers.logging_handler),
"endpoint": endpoints.logs_http if providers.logging_handler else "disabled"
}
}
logger.info("🎉 Telemetry setup complete", **summary)
except Exception as e:
logger.warning("Could not log endpoint summary", error=str(e))
return providers
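# Example: opt out of individual signals while keeping the rest (values illustrative):
#   providers = setup_telemetry(
#       app,
#       service_name="pos-service",
#       enable_logs=False,            # skip OTLP log export entirely
#       metrics_protocol="http",      # export metrics over HTTP/4318
#       export_interval_millis=30000
#   )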
def setup_telemetry_simple(
app,
service_name: str,
service_version: str = "1.0.0"
) -> TelemetryProviders:
"""
Simplified telemetry setup with all defaults.
Uses:
- gRPC for traces (port 4317)
- gRPC for metrics (port 4317)
- HTTP for logs (port 4318)
All settings are read from environment variables and OTelConfig.
Args:
app: FastAPI application instance
service_name: Name of the service
service_version: Version of the service
Returns:
TelemetryProviders containing all initialized providers
Example:
from shared.monitoring.telemetry import setup_telemetry_simple
app = FastAPI(title="Auth Service")
providers = setup_telemetry_simple(app, "auth-service")
"""
return setup_telemetry(
app=app,
service_name=service_name,
service_version=service_version
)
def get_telemetry_status() -> Dict[str, Any]:
"""
Get current telemetry configuration status.
Returns:
Dictionary with telemetry status information
Example:
from shared.monitoring.telemetry import get_telemetry_status
status = get_telemetry_status()
print(f"Tracing enabled: {status['traces']['enabled']}")
"""
endpoints = OTelConfig.get_endpoints()
return {
"traces": {
"enabled": OTelConfig.is_enabled("traces"),
"protocol": "grpc",
"endpoint": endpoints.traces_grpc
},
"metrics": {
"enabled": OTelConfig.is_enabled("metrics"),
"protocol": OTelConfig.get_protocol("metrics"),
"grpc_endpoint": endpoints.metrics_grpc,
"http_endpoint": endpoints.metrics_http
},
"logs": {
"enabled": OTelConfig.is_enabled("logs"),
"protocol": "http",
"endpoint": endpoints.logs_http
}
}
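# Minimal wiring sketch (the diagnostics route is an assumption, not part of this module):
#
#   from fastapi import FastAPI
#   from shared.monitoring.telemetry import setup_telemetry_simple, get_telemetry_status
#
#   app = FastAPI(title="Auth Service")
#   providers = setup_telemetry_simple(app, "auth-service")
#
#   @app.get("/debug/telemetry")  # hypothetical endpoint for operators
#   def telemetry_status():
#       return get_telemetry_status()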

View File

@@ -3,17 +3,38 @@ OpenTelemetry distributed tracing integration
Provides end-to-end request tracking across all services
"""
+import os
import structlog
from typing import Optional
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
+from opentelemetry.sdk.resources import Resource
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+# Core instrumentations (should always be available)
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+# Optional instrumentations (may not be installed in all services)
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+try:
+    from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+    HTTPX_AVAILABLE = True
+except ImportError:
+    HTTPX_AVAILABLE = False
+try:
+    from opentelemetry.instrumentation.redis import RedisInstrumentor
+    REDIS_AVAILABLE = True
+except ImportError:
+    REDIS_AVAILABLE = False
+try:
+    from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
+    SQLALCHEMY_AVAILABLE = True
+except ImportError:
+    SQLALCHEMY_AVAILABLE = False
+from .otel_config import OTelConfig
logger = structlog.get_logger()
@@ -22,8 +43,8 @@ def setup_tracing(
    app,
    service_name: str,
    service_version: str = "1.0.0",
-    otel_endpoint: str = "http://signoz-otel-collector.bakery-ia:4318"
-):
+    otel_endpoint: Optional[str] = None
+) -> Optional[TracerProvider]:
    """
    Setup OpenTelemetry distributed tracing for a FastAPI service.
@@ -33,35 +54,56 @@ def setup_tracing(
    - Redis operations
    - PostgreSQL/SQLAlchemy queries
+    Uses gRPC protocol (port 4317) for sending traces to SigNoz.
    Args:
        app: FastAPI application instance
        service_name: Name of the service (e.g., "auth-service")
        service_version: Version of the service
-        otel_endpoint: OpenTelemetry collector endpoint (SigNoz)
+        otel_endpoint: Optional override for OTLP endpoint (gRPC format: host:port)
+    Returns:
+        TracerProvider instance if successful, None otherwise
    Example:
        from shared.monitoring.tracing import setup_tracing
        app = FastAPI(title="Auth Service")
-        setup_tracing(app, "auth-service")
+        tracer_provider = setup_tracing(app, "auth-service", "1.0.0")
    """
+    # Check if tracing is enabled
+    if not OTelConfig.is_enabled("traces"):
+        logger.info(
+            "Distributed tracing disabled",
+            service=service_name,
+            reason="ENABLE_TRACING not set to 'true'"
+        )
+        return None
    try:
-        # Create resource with service information
-        resource = Resource(attributes={
-            SERVICE_NAME: service_name,
-            SERVICE_VERSION: service_version,
-            "deployment.environment": "production"
-        })
+        # Get endpoints from centralized config
+        endpoints = OTelConfig.get_endpoints()
+        # Use provided endpoint or get from config
+        if otel_endpoint:
+            # Clean user-provided endpoint for gRPC
+            grpc_endpoint = OTelConfig._clean_grpc_endpoint(otel_endpoint)
+        else:
+            grpc_endpoint = endpoints.traces_grpc
+        # Get resource attributes
+        resource_attrs = OTelConfig.get_resource_attributes(service_name, service_version)
+        resource = Resource(attributes=resource_attrs)
        # Configure tracer provider
        tracer_provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(tracer_provider)
-        # Configure OTLP exporter to send to SigNoz
+        # Configure OTLP gRPC exporter for traces
        otlp_exporter = OTLPSpanExporter(
-            endpoint=otel_endpoint,
-            insecure=True  # Use TLS in production
+            endpoint=grpc_endpoint,
+            insecure=True  # set insecure=False in production with proper TLS
        )
        # Add span processor with batching for performance
@@ -75,40 +117,46 @@ def setup_tracing(
            excluded_urls="health,metrics"  # Don't trace health/metrics endpoints
        )
-        # Auto-instrument HTTPX (inter-service communication)
-        HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
+        # Auto-instrument HTTPX (inter-service communication) if available
+        if HTTPX_AVAILABLE:
+            try:
+                HTTPXClientInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("HTTPX instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument HTTPX: {e}")
-        # Auto-instrument Redis
-        try:
-            RedisInstrumentor().instrument(tracer_provider=tracer_provider)
-        except Exception as e:
-            logger.warning(f"Failed to instrument Redis: {e}")
+        # Auto-instrument Redis if available
+        if REDIS_AVAILABLE:
+            try:
+                RedisInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("Redis instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument Redis: {e}")
-        # Auto-instrument PostgreSQL (psycopg2) - skip if not available
-        # Most services use asyncpg instead of psycopg2
-        # try:
-        #     Psycopg2Instrumentor().instrument(tracer_provider=tracer_provider)
-        # except Exception as e:
-        #     logger.warning(f"Failed to instrument Psycopg2: {e}")
-        # Auto-instrument SQLAlchemy
-        try:
-            SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
-        except Exception as e:
-            logger.warning(f"Failed to instrument SQLAlchemy: {e}")
+        # Auto-instrument SQLAlchemy if available
+        if SQLALCHEMY_AVAILABLE:
+            try:
+                SQLAlchemyInstrumentor().instrument(tracer_provider=tracer_provider)
+                logger.debug("SQLAlchemy instrumentation enabled")
+            except Exception as e:
+                logger.warning(f"Failed to instrument SQLAlchemy: {e}")
        logger.info(
-            "Distributed tracing configured",
+            "Distributed tracing configured successfully",
            service=service_name,
-            otel_endpoint=otel_endpoint
+            grpc_endpoint=grpc_endpoint,
+            protocol="grpc"
        )
+        return tracer_provider
    except Exception as e:
        logger.error(
            "Failed to setup tracing - continuing without it",
            service=service_name,
            error=str(e)
        )
+        return None
def get_current_trace_id() -> Optional[str]:
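For services that need an explicit collector override, a hedged usage sketch of the new signature (the endpoint value and service name are illustrative):

    from fastapi import FastAPI
    from shared.monitoring.tracing import setup_tracing

    app = FastAPI(title="Orders Service")
    # Accepts host:port or a full URL; OTelConfig._clean_grpc_endpoint() normalizes it
    tracer_provider = setup_tracing(app, "orders-service", "1.0.0", otel_endpoint="otelcol:4317")
    if tracer_provider is None:
        pass  # tracing disabled via ENABLE_TRACING, or setup failed; the service keeps running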

View File

@@ -20,10 +20,11 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.routing import APIRouter
-from shared.monitoring import setup_logging, setup_otel_logging, setup_otel_metrics, setup_all_metrics
-from shared.monitoring.metrics import setup_metrics_early
+from shared.monitoring import (
+    setup_logging,
+    setup_telemetry
+)
from shared.monitoring.health_checks import setup_fastapi_health_checks
-from shared.monitoring.tracing import setup_tracing
from shared.database.base import DatabaseManager
if TYPE_CHECKING:
@@ -77,24 +78,13 @@ class BaseFastAPIService:
        # Initialize logging
        setup_logging(service_name, log_level)
-        # Setup OpenTelemetry logging export if enabled
-        if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
-            try:
-                setup_otel_logging(service_name, version)
-                self.logger = structlog.get_logger()
-                self.logger.info(f"OpenTelemetry logs export enabled for {service_name}")
-            except Exception as e:
-                self.logger = structlog.get_logger()
-                self.logger.warning(f"Failed to setup OpenTelemetry logs export: {e}")
-        else:
-            self.logger = structlog.get_logger()
+        self.logger = structlog.get_logger()
        # Will be set during app creation
        self.app: Optional[FastAPI] = None
-        self.metrics_collector = None
        self.health_manager = None
        self.alert_service = None
+        self.telemetry_providers = None  # Contains all OTEL providers and metrics collectors
    def create_app(self, **fastapi_kwargs) -> FastAPI:
        """
@@ -116,49 +106,25 @@ class BaseFastAPIService:
        # Create FastAPI app
        self.app = FastAPI(**config)
-        # Setup metrics BEFORE middleware and lifespan
-        if self.enable_metrics:
-            self.metrics_collector = setup_metrics_early(self.app, self.service_name)
-        # Setup OpenTelemetry metrics export if enabled
-        enable_otel_metrics = os.getenv("ENABLE_OTEL_METRICS", "true").lower() == "true"
-        if enable_otel_metrics:
-            try:
-                self.otel_meter_provider = setup_otel_metrics(self.service_name, self.version)
-                if self.otel_meter_provider:
-                    self.logger.info(f"OpenTelemetry metrics export enabled for {self.service_name}")
-                    # Setup system metrics collection (CPU, memory, disk, network)
-                    enable_system_metrics = os.getenv("ENABLE_SYSTEM_METRICS", "true").lower() == "true"
-                    if enable_system_metrics:
-                        try:
-                            self.system_metrics, self.app_metrics = setup_all_metrics(
-                                self.service_name,
-                                self.version,
-                                self.otel_meter_provider
-                            )
-                            self.logger.info(f"System metrics collection enabled for {self.service_name}")
-                        except Exception as e:
-                            self.logger.warning(f"Failed to setup system metrics: {e}")
-            except Exception as e:
-                self.logger.warning(f"Failed to setup OpenTelemetry metrics export: {e}")
-        # Setup distributed tracing
-        # Check both constructor flag and environment variable
-        tracing_enabled = self.enable_tracing and os.getenv("ENABLE_TRACING", "true").lower() == "true"
-        if tracing_enabled:
-            try:
-                otel_endpoint = os.getenv(
-                    "OTEL_COLLECTOR_ENDPOINT",
-                    "http://signoz-otel-collector.bakery-ia:4318"
-                )
-                setup_tracing(self.app, self.service_name, self.version, otel_endpoint)
-                self.logger.info(f"Distributed tracing enabled for {self.service_name}")
-            except Exception as e:
-                self.logger.warning(f"Failed to setup tracing, continuing without it: {e}")
-        else:
-            self.logger.info(f"Distributed tracing disabled for {self.service_name}")
+        # Setup unified OpenTelemetry telemetry
+        # This single call configures:
+        # - Distributed tracing (gRPC, port 4317)
+        # - OTLP metrics export (gRPC, port 4317)
+        # - System metrics collection (CPU, memory, disk, network)
+        # - Application metrics (HTTP requests, DB queries)
+        # - Structured logs export (HTTP, port 4318)
+        try:
+            self.telemetry_providers = setup_telemetry(
+                app=self.app,
+                service_name=self.service_name,
+                service_version=self.version,
+                enable_traces=self.enable_tracing,
+                enable_metrics=self.enable_metrics,
+                enable_logs=True,  # Controlled by OTEL_LOGS_EXPORTER env var
+                enable_system_metrics=True  # Controlled by ENABLE_SYSTEM_METRICS env var
+            )
+        except Exception as e:
+            self.logger.warning("Failed to setup telemetry", error=str(e))
        # Setup lifespan
        self.app.router.lifespan_context = self._create_lifespan()
@@ -361,10 +327,6 @@ class BaseFastAPIService:
                method=request.method
            )
-            # Record error metric if available
-            if self.metrics_collector:
-                self.metrics_collector.increment_counter("errors_total", labels={"type": "unhandled"})
            return JSONResponse(
                status_code=500,
                content={
@@ -409,7 +371,10 @@ class BaseFastAPIService:
    def register_custom_metrics(self, metrics_config: Dict[str, Dict[str, Any]]):
        """
-        Register custom metrics for the service
+        Register custom OTEL metrics for the service.
+        Note: System metrics (CPU, memory, disk, network) and application metrics (HTTP, DB)
+        are automatically created by setup_telemetry(). Use this for additional custom metrics.
        Args:
            metrics_config: Dict with metric name as key and config as value
@@ -417,25 +382,36 @@ class BaseFastAPIService:
                "user_registrations": {
                    "type": "counter",
                    "description": "Total user registrations",
-                    "labels": ["status"]
+                    "unit": "registrations"
                }
            }
        """
-        if not self.metrics_collector:
-            self.logger.warning("Metrics collector not available")
+        if not self.telemetry_providers or not self.telemetry_providers.meter_provider:
+            self.logger.warning("OTEL meter provider not available - metrics not registered")
            return
+        from opentelemetry.metrics import get_meter
+        meter = get_meter(self.service_name)
        for metric_name, config in metrics_config.items():
            metric_type = config.get("type", "counter")
            description = config.get("description", f"{metric_name} metric")
-            labels = config.get("labels", [])
+            unit = config.get("unit", "1")
-            if metric_type == "counter":
-                self.metrics_collector.register_counter(metric_name, description, labels=labels)
-            elif metric_type == "histogram":
-                self.metrics_collector.register_histogram(metric_name, description, labels=labels)
-            else:
-                self.logger.warning(f"Unsupported metric type: {metric_type}")
+            try:
+                if metric_type == "counter":
+                    meter.create_counter(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom counter: {metric_name}")
+                elif metric_type == "histogram":
+                    meter.create_histogram(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom histogram: {metric_name}")
+                elif metric_type == "gauge":
+                    meter.create_up_down_counter(metric_name, description=description, unit=unit)
+                    self.logger.info(f"Registered custom gauge: {metric_name}")
+                else:
+                    self.logger.warning(f"Unsupported metric type: {metric_type}")
+            except Exception as e:
+                self.logger.error(f"Failed to register metric {metric_name}", error=str(e))
    def run_development_server(self, host: str = "0.0.0.0", port: int = 8000, reload: Optional[bool] = None):
        """