From c05538cafb2cef6fcae2f8d96bf32b704440c5df Mon Sep 17 00:00:00 2001
From: Urtzi Alfaro
Date: Fri, 9 Jan 2026 23:14:12 +0100
Subject: [PATCH] Improve monitoring 5

---
 gateway/app/main.py                           | 482 ++++------------
 infrastructure/helm/signoz-values-dev.yaml    | 169 +++++-
 infrastructure/helm/signoz-values-prod.yaml   | 519 ++++++++++++++++-
 infrastructure/kubernetes/base/configmap.yaml |  41 +-
 .../signoz/dashboards/alert-management.json   | 266 +++++----
 .../signoz/dashboards/api-performance.json    | 445 +++++++++++----
 .../dashboards/application-performance.json   | 428 ++++++++++----
 .../dashboards/database-performance.json      | 520 ++++++++++++++----
 .../signoz/dashboards/error-tracking.json     | 445 +++++++++++----
 .../dashboards/infrastructure-monitoring.json | 490 ++++++++++++++---
 .../signoz/dashboards/log-analysis.json       | 424 ++++++++++----
 .../signoz/dashboards/system-health.json      | 379 ++++++++++---
 .../signoz/dashboards/user-activity.json      | 411 ++++++++++----
 services/ai_insights/app/main.py              | 171 ++----
 services/alert_processor/app/main.py          | 179 ++----
 services/demo_session/app/main.py             | 198 ++-----
 shared/monitoring/__init__.py                 |  76 ++-
 shared/monitoring/logs_exporter.py            |  93 ++--
 shared/monitoring/metrics_exporter.py         | 128 +++--
 shared/monitoring/otel_config.py              | 286 ++++++++++
 shared/monitoring/telemetry.py                | 271 +++++++++
 shared/monitoring/tracing.py                  | 124 +++--
 shared/service_base.py                        | 124 ++---
 23 files changed, 4737 insertions(+), 1932 deletions(-)
 create mode 100644 shared/monitoring/otel_config.py
 create mode 100644 shared/monitoring/telemetry.py

diff --git a/gateway/app/main.py b/gateway/app/main.py
index 156fb637..1333accd 100644
--- a/gateway/app/main.py
+++ b/gateway/app/main.py
@@ -8,13 +8,12 @@
 import json
 import structlog
 import resource
 import os
-from fastapi import FastAPI, Request, HTTPException, Depends, WebSocket, WebSocketDisconnect
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse, StreamingResponse, Response
-import httpx
 import time
+from fastapi import Request, HTTPException, WebSocket, WebSocketDisconnect
+from fastapi.responses import StreamingResponse
+import httpx
 from shared.redis_utils import initialize_redis, close_redis, get_redis_client
-from typing import Dict, Any
+from shared.service_base import StandardFastAPIService

 from app.core.config import settings
 from app.middleware.request_id import RequestIDMiddleware
@@ -26,128 +25,84 @@ from app.middleware.subscription import SubscriptionMiddleware
 from app.middleware.demo_middleware import DemoMiddleware
 from app.middleware.read_only_mode import ReadOnlyModeMiddleware
 from app.routes import auth, tenant, notification, nominatim, subscription, demo, pos, geocoding, poi_context
-from shared.monitoring.logging import setup_logging
-from shared.monitoring.metrics import MetricsCollector, add_metrics_middleware
-from shared.monitoring.system_metrics import SystemMetricsCollector
-# OpenTelemetry imports
-from opentelemetry import trace
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-
-# Configure OpenTelemetry tracing
-def setup_tracing(service_name: str = "gateway"):
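# [Editor's note] A hedged sketch of the pattern that replaces this hand-rolled
# bootstrap: the shared StandardFastAPIService base class introduced by this
# patch (shared/service_base.py). The keyword names mirror the GatewayService
# instantiation shown further down; the exact signature lives in the shared
# module, so treat this as an assumption rather than the definitive API:
#
#     from shared.service_base import StandardFastAPIService
#
#     service = StandardFastAPIService(
#         service_name="example",      # becomes the OTel service.name resource
#         app_name="Example Service",
#         description="Demonstration only",
#         version="0.0.1",
#         enable_tracing=True,         # OTLP exporters are wired internally
#         enable_metrics=True,
#         enable_health_checks=True,
#     )
#     app = service.create_app()       # returns an instrumented FastAPI app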
- """Initialize OpenTelemetry tracing with OTLP exporter for Jaeger""" - # Create resource with service name - resource = Resource.create({"service.name": service_name}) - - # Configure OTLP exporter (sends to OpenTelemetry Collector) - otlp_exporter = OTLPSpanExporter( - endpoint=os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4317"), - insecure=True # Use insecure connection for internal cluster communication - ) - - # Configure tracer provider - provider = TracerProvider(resource=resource) - processor = BatchSpanProcessor(otlp_exporter) - provider.add_span_processor(processor) - - # Set global tracer provider - trace.set_tracer_provider(provider) - - return provider - -# Initialize tracing -tracer_provider = setup_tracing("gateway") - -# Setup logging -setup_logging("gateway", settings.LOG_LEVEL) +# Initialize logger logger = structlog.get_logger() -# Check file descriptor limits and warn if too low +# Check file descriptor limits try: soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) if soft_limit < 1024: - logger.warning(f"Low file descriptor limit detected: {soft_limit}. Gateway may experience 'too many open files' errors.") - logger.warning(f"Recommended: Increase limit with 'ulimit -n 4096' or higher for production.") - if soft_limit < 256: - logger.error(f"Critical: File descriptor limit ({soft_limit}) is too low for gateway operation!") + logger.warning(f"Low file descriptor limit detected: {soft_limit}") else: logger.info(f"File descriptor limit: {soft_limit} (sufficient)") except Exception as e: logger.debug(f"Could not check file descriptor limits: {e}") -# Check and log current working directory and permissions -try: - cwd = os.getcwd() - logger.info(f"Current working directory: {cwd}") - - # Check if we can write to common log locations - test_locations = ["/var/log", "./logs", "."] - for location in test_locations: - try: - test_file = os.path.join(location, ".gateway_permission_test") - with open(test_file, 'w') as f: - f.write("test") - os.remove(test_file) - logger.info(f"Write permission confirmed for: {location}") - except Exception as e: - logger.warning(f"Cannot write to {location}: {e}") -except Exception as e: - logger.debug(f"Could not check directory permissions: {e}") - -# Create FastAPI app -app = FastAPI( - title="Bakery Forecasting API Gateway", - description="Central API Gateway for bakery forecasting microservices", - version="1.0.0", - docs_url="/docs", - redoc_url="/redoc", - redirect_slashes=False # Disable automatic trailing slash redirects -) - -# Instrument FastAPI with OpenTelemetry -FastAPIInstrumentor.instrument_app(app) - -# Instrument httpx for outgoing requests -HTTPXClientInstrumentor().instrument() - -# Instrument Redis (will be active once redis client is initialized) -RedisInstrumentor().instrument() - -# Initialize metrics collector -metrics_collector = MetricsCollector("gateway") - -# Add metrics middleware to track HTTP requests -add_metrics_middleware(app, metrics_collector) - # Redis client for SSE streaming redis_client = None -# CORS middleware - Add first -app.add_middleware( - CORSMiddleware, - allow_origins=settings.CORS_ORIGINS_LIST, - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], + +class GatewayService(StandardFastAPIService): + """Gateway Service with standardized monitoring setup""" + + async def on_startup(self, app): + """Custom startup logic for Gateway""" + global redis_client + + # Initialize Redis + try: + await 
initialize_redis(settings.REDIS_URL, db=0, max_connections=50) + redis_client = await get_redis_client() + logger.info("Connected to Redis for SSE streaming") + + # Add API rate limiting middleware with Redis client + app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client) + logger.info("API rate limiting middleware enabled") + except Exception as e: + logger.error(f"Failed to connect to Redis: {e}") + + # Register custom metrics for gateway-specific operations + if self.telemetry_providers and self.telemetry_providers.app_metrics: + logger.info("Gateway-specific metrics tracking enabled") + + await super().on_startup(app) + + async def on_shutdown(self, app): + """Custom shutdown logic for Gateway""" + await super().on_shutdown(app) + + # Close Redis + await close_redis() + logger.info("Redis connection closed") + + +# Create service instance +service = GatewayService( + service_name="gateway", + app_name="Bakery Forecasting API Gateway", + description="Central API Gateway for bakery forecasting microservices", + version="1.0.0", + log_level=getattr(settings, 'LOG_LEVEL', 'INFO'), + cors_origins=settings.CORS_ORIGINS_LIST, + enable_metrics=True, + enable_health_checks=True, + enable_tracing=True, + enable_cors=True ) -# Custom middleware - Add in REVERSE order (last added = first executed) +# Create FastAPI app +app = service.create_app() + +# Add gateway-specific middleware (in REVERSE order of execution) # Execution order: RequestIDMiddleware -> DemoMiddleware -> AuthMiddleware -> ReadOnlyModeMiddleware -> SubscriptionMiddleware -> APIRateLimitMiddleware -> RateLimitMiddleware -> LoggingMiddleware -app.add_middleware(LoggingMiddleware) # Executes 8th (outermost) -app.add_middleware(RateLimitMiddleware, calls_per_minute=300) # Executes 7th - Simple rate limit -# Note: APIRateLimitMiddleware will be added on startup with Redis client -app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL) # Executes 5th -app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL) # Executes 4th - Enforce read-only mode -app.add_middleware(AuthMiddleware) # Executes 3rd - Checks for demo context -app.add_middleware(DemoMiddleware) # Executes 2nd - Sets demo user context -app.add_middleware(RequestIDMiddleware) # Executes 1st (innermost) - Generates request ID for tracing +app.add_middleware(LoggingMiddleware) +app.add_middleware(RateLimitMiddleware, calls_per_minute=300) +app.add_middleware(SubscriptionMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL) +app.add_middleware(ReadOnlyModeMiddleware, tenant_service_url=settings.TENANT_SERVICE_URL) +app.add_middleware(AuthMiddleware) +app.add_middleware(DemoMiddleware) +app.add_middleware(RequestIDMiddleware) # Include routers app.include_router(auth.router, prefix="/api/v1/auth", tags=["authentication"]) @@ -156,114 +111,18 @@ app.include_router(subscription.router, prefix="/api/v1", tags=["subscriptions"] app.include_router(notification.router, prefix="/api/v1/notifications", tags=["notifications"]) app.include_router(nominatim.router, prefix="/api/v1/nominatim", tags=["location"]) app.include_router(geocoding.router, prefix="/api/v1/geocoding", tags=["geocoding"]) -# app.include_router(poi_context.router, prefix="/api/v1/poi-context", tags=["poi-context"]) # Removed to implement tenant-based architecture app.include_router(pos.router, prefix="/api/v1/pos", tags=["pos"]) app.include_router(demo.router, prefix="/api/v1", tags=["demo"]) -@app.on_event("startup") -async def 
startup_event(): - """Application startup""" - global redis_client - - logger.info("Starting API Gateway") - - # Initialize shared Redis connection - try: - await initialize_redis(settings.REDIS_URL, db=0, max_connections=50) - redis_client = await get_redis_client() - logger.info("Connected to Redis for SSE streaming") - - # Add API rate limiting middleware with Redis client - app.add_middleware(APIRateLimitMiddleware, redis_client=redis_client) - logger.info("API rate limiting middleware enabled with subscription-based quotas") - except Exception as e: - logger.error(f"Failed to connect to Redis: {e}") - logger.warning("API rate limiting middleware will fail open (allow all requests)") - - metrics_collector.register_counter( - "gateway_auth_requests_total", - "Total authentication requests" - ) - metrics_collector.register_counter( - "gateway_auth_responses_total", - "Total authentication responses" - ) - metrics_collector.register_counter( - "gateway_auth_errors_total", - "Total authentication errors" - ) - - metrics_collector.register_histogram( - "gateway_request_duration_seconds", - "Request duration in seconds" - ) - - logger.info("Metrics registered successfully") - - # Note: Metrics are exported via OpenTelemetry OTLP to SigNoz - no metrics server needed - # Initialize system metrics collection - system_metrics = SystemMetricsCollector("gateway") - logger.info("System metrics collection started") - - logger.info("Metrics export configured via OpenTelemetry OTLP") - - logger.info("API Gateway started successfully") - -@app.on_event("shutdown") -async def shutdown_event(): - """Application shutdown""" - logger.info("Shutting down API Gateway") - - # Close shared Redis connection - await close_redis() - - # Clean up service discovery - # await service_discovery.cleanup() - - logger.info("API Gateway shutdown complete") - -@app.get("/health") -async def health_check(): - """Health check endpoint""" - return { - "status": "healthy", - "service": "api-gateway", - "version": "1.0.0", - "timestamp": time.time() - } - -# Note: Metrics are exported via OpenTelemetry OTLP to SigNoz -# The /metrics endpoint is not needed as metrics are pushed automatically - # ================================================================ # SERVER-SENT EVENTS (SSE) HELPER FUNCTIONS # ================================================================ def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list: - """ - Determine which Redis channels to subscribe to based on filters. - - Args: - tenant_id: Tenant identifier - channel_filters: List of channel patterns (e.g., ["inventory.alerts", "*.notifications"]) - - Returns: - List of full channel names to subscribe to - - Examples: - >>> _get_subscription_channels("abc", ["inventory.alerts"]) - ["tenant:abc:inventory.alerts"] - - >>> _get_subscription_channels("abc", ["*.alerts"]) - ["tenant:abc:inventory.alerts", "tenant:abc:production.alerts", ...] - - >>> _get_subscription_channels("abc", []) - ["tenant:abc:inventory.alerts", "tenant:abc:inventory.notifications", ...] 
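# [Editor's sketch] The wildcard rules documented above, condensed into a
# self-contained illustration. This is NOT the helper itself (the real function
# below also handles "recommendations" and the legacy alerts:{tenant_id}
# channel, and returns a list); the domain/class values are copied from its body:

def _expand_filters_demo(tenant_id: str, filters: list) -> set:
    domains = ["inventory", "production", "supply_chain", "demand", "operations"]
    classes = ["alerts", "notifications"]
    out = set()
    for pattern in (filters or ["*.*"]):          # no filters -> everything
        domain, _, cls = pattern.partition(".")
        for d in (domains if domain == "*" else [domain]):
            for c in (classes if cls in ("*", "") else [cls]):
                out.add(f"tenant:{tenant_id}:{d}.{c}")
    return out

# Matches the documented examples:
assert _expand_filters_demo("abc", ["inventory.alerts"]) == {"tenant:abc:inventory.alerts"}
assert len(_expand_filters_demo("abc", ["*.alerts"])) == 5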
- """ + """Determine which Redis channels to subscribe to based on filters""" all_domains = ["inventory", "production", "supply_chain", "demand", "operations"] all_classes = ["alerts", "notifications"] - channels = [] if not channel_filters: @@ -271,70 +130,49 @@ def _get_subscription_channels(tenant_id: str, channel_filters: list) -> list: for domain in all_domains: for event_class in all_classes: channels.append(f"tenant:{tenant_id}:{domain}.{event_class}") - # Also subscribe to recommendations (tenant-wide) channels.append(f"tenant:{tenant_id}:recommendations") - # Also subscribe to legacy channel for backward compatibility - channels.append(f"alerts:{tenant_id}") + channels.append(f"alerts:{tenant_id}") # Legacy return channels # Parse filters and expand wildcards for filter_pattern in channel_filters: if filter_pattern == "*.*": - # All channels for domain in all_domains: for event_class in all_classes: channels.append(f"tenant:{tenant_id}:{domain}.{event_class}") channels.append(f"tenant:{tenant_id}:recommendations") - elif filter_pattern.endswith(".*"): - # Domain wildcard (e.g., "inventory.*") domain = filter_pattern.split(".")[0] for event_class in all_classes: channels.append(f"tenant:{tenant_id}:{domain}.{event_class}") - elif filter_pattern.startswith("*."): - # Class wildcard (e.g., "*.alerts") event_class = filter_pattern.split(".")[1] if event_class == "recommendations": channels.append(f"tenant:{tenant_id}:recommendations") else: for domain in all_domains: channels.append(f"tenant:{tenant_id}:{domain}.{event_class}") - elif filter_pattern == "recommendations": - # Recommendations channel channels.append(f"tenant:{tenant_id}:recommendations") - else: - # Specific channel (e.g., "inventory.alerts") channels.append(f"tenant:{tenant_id}:{filter_pattern}") - return list(set(channels)) # Remove duplicates + return list(set(channels)) async def _load_initial_state(redis_client, tenant_id: str, channel_filters: list) -> list: - """ - Load initial state from Redis cache based on channel filters. - - Args: - redis_client: Redis client - tenant_id: Tenant identifier - channel_filters: List of channel patterns - - Returns: - List of initial events - """ + """Load initial state from Redis cache based on channel filters""" initial_events = [] try: if not channel_filters: - # Load from legacy cache if no filters (backward compat) + # Legacy cache legacy_cache_key = f"active_alerts:{tenant_id}" cached_data = await redis_client.get(legacy_cache_key) if cached_data: return json.loads(cached_data) - # Also try loading from new domain-specific caches + # New domain-specific caches all_domains = ["inventory", "production", "supply_chain", "demand", "operations"] all_classes = ["alerts", "notifications"] @@ -343,10 +181,9 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s" cached_data = await redis_client.get(cache_key) if cached_data: - events = json.loads(cached_data) - initial_events.extend(events) + initial_events.extend(json.loads(cached_data)) - # Load recommendations + # Recommendations recommendations_cache_key = f"active_events:{tenant_id}:recommendations" cached_data = await redis_client.get(recommendations_cache_key) if cached_data: @@ -356,36 +193,29 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis # Load based on specific filters for filter_pattern in channel_filters: - # Extract domain and class from filter if "." 
in filter_pattern: parts = filter_pattern.split(".") domain = parts[0] if parts[0] != "*" else None event_class = parts[1] if len(parts) > 1 and parts[1] != "*" else None if domain and event_class: - # Specific cache (e.g., "inventory.alerts") cache_key = f"active_events:{tenant_id}:{domain}.{event_class}s" cached_data = await redis_client.get(cache_key) if cached_data: initial_events.extend(json.loads(cached_data)) - elif domain and not event_class: - # Domain wildcard (e.g., "inventory.*") for ec in ["alerts", "notifications"]: cache_key = f"active_events:{tenant_id}:{domain}.{ec}" cached_data = await redis_client.get(cache_key) if cached_data: initial_events.extend(json.loads(cached_data)) - elif not domain and event_class: - # Class wildcard (e.g., "*.alerts") all_domains = ["inventory", "production", "supply_chain", "demand", "operations"] for d in all_domains: cache_key = f"active_events:{tenant_id}:{d}.{event_class}s" cached_data = await redis_client.get(cache_key) if cached_data: initial_events.extend(json.loads(cached_data)) - elif filter_pattern == "recommendations": cache_key = f"active_events:{tenant_id}:recommendations" cached_data = await redis_client.get(cache_key) @@ -400,27 +230,14 @@ async def _load_initial_state(redis_client, tenant_id: str, channel_filters: lis def _determine_event_type(event_data: dict) -> str: - """ - Determine SSE event type from event data. - - Args: - event_data: Event data dictionary - - Returns: - SSE event type: 'alert', 'notification', or 'recommendation' - """ - # New event architecture uses 'event_class' + """Determine SSE event type from event data""" if 'event_class' in event_data: - return event_data['event_class'] # 'alert', 'notification', or 'recommendation' - - # Legacy format uses 'item_type' + return event_data['event_class'] if 'item_type' in event_data: if event_data['item_type'] == 'recommendation': return 'recommendation' else: return 'alert' - - # Default to 'alert' for backward compatibility return 'alert' @@ -432,42 +249,25 @@ def _determine_event_type(event_data: dict) -> str: async def events_stream( request: Request, tenant_id: str, - channels: str = None # Comma-separated channel filters (e.g., "inventory.alerts,production.notifications") + channels: str = None ): """ Server-Sent Events stream for real-time notifications with multi-channel support. - Authentication is handled by auth middleware via query param token. - User context is available in request.state.user (injected by middleware). 
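# [Editor's sketch] The publishing side that feeds this stream, under the
# channel conventions shown above. The client type and publish call are
# standard redis-py (redis.asyncio); the 'event_class' field is what
# _determine_event_type dispatches on. The domain/class in the channel name
# is illustrative, not a value fixed by this patch:

import json
import redis.asyncio as aioredis

async def publish_event(r: aioredis.Redis, tenant_id: str, payload: dict) -> None:
    payload.setdefault("event_class", "alert")        # drives the SSE event type
    channel = f"tenant:{tenant_id}:inventory.alerts"  # {domain}.{class} pattern
    await r.publish(channel, json.dumps(payload))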
-
     Query Parameters:
         tenant_id: Tenant identifier (required)
         channels: Comma-separated channel filters (optional)
-
-    Examples:
-        - "inventory.alerts,production.notifications" - Specific channels
-        - "*.alerts" - All alert channels
-        - "inventory.*" - All inventory events
-        - None - All channels (default, backward compatible)
-
-    New channel pattern: tenant:{tenant_id}:{domain}.{class}
-    Examples:
-        - tenant:abc:inventory.alerts
-        - tenant:abc:production.notifications
-        - tenant:abc:recommendations
-
-    Legacy channel (backward compat): alerts:{tenant_id}
     """
     global redis_client

     if not redis_client:
         raise HTTPException(status_code=503, detail="SSE service unavailable")

-    # Extract user context from request state (set by auth middleware)
+    # Extract user context from request state
     user_context = request.state.user
     user_id = user_context.get('user_id')
     email = user_context.get('email')

-    # Validate tenant_id parameter
     if not tenant_id:
         raise HTTPException(status_code=400, detail="tenant_id query parameter is required")

@@ -479,79 +279,53 @@ async def events_stream(
     logger.info(f"SSE connection request for user {email}, tenant {tenant_id}, channels: {channel_filters or 'all'}")

     async def event_generator():
-        """Generate server-sent events from Redis pub/sub with multi-channel support"""
+        """Generate server-sent events from Redis pub/sub"""
         pubsub = None
         try:
-            # Create pubsub connection with resource monitoring
             pubsub = redis_client.pubsub()
             logger.debug(f"Created Redis pubsub connection for tenant: {tenant_id}")
-
-            # Monitor connection count
-            try:
-                connection_info = await redis_client.info('clients')
-                connected_clients = connection_info.get('connected_clients', 'unknown')
-                logger.debug(f"Redis connected clients: {connected_clients}")
-            except Exception:
-                # Don't fail if we can't get connection info
-                pass

-            # Determine which channels to subscribe to
+            # Determine channels
             subscription_channels = _get_subscription_channels(tenant_id, channel_filters)

-            # Subscribe to all determined channels
+            # Subscribe
             if subscription_channels:
                 await pubsub.subscribe(*subscription_channels)
                 logger.info(f"Subscribed to {len(subscription_channels)} channels for tenant {tenant_id}")
             else:
-                # Fallback to legacy channel if no channels specified
                 legacy_channel = f"alerts:{tenant_id}"
                 await pubsub.subscribe(legacy_channel)
-                logger.info(f"Subscribed to legacy channel: {legacy_channel}")

-            # Send initial connection event
+            # Connection event
             yield f"event: connection\n"
             yield f"data: {json.dumps({'type': 'connected', 'message': 'SSE connection established', 'channels': subscription_channels or ['all'], 'timestamp': time.time()})}\n\n"

-            # Fetch and send initial state from cache (domain-specific or legacy)
+            # Initial state
             initial_events = await _load_initial_state(redis_client, tenant_id, channel_filters)
             if initial_events:
                 logger.info(f"Sending {len(initial_events)} initial events to tenant {tenant_id}")
-                yield f"event: initial_state\n"
-                yield f"data: {json.dumps(initial_events)}\n\n"
-            else:
-                # Send empty initial state for compatibility
-                yield f"event: initial_state\n"
-                yield f"data: {json.dumps([])}\n\n"
+            yield f"event: initial_state\n"
+            yield f"data: {json.dumps(initial_events)}\n\n"

             heartbeat_counter = 0
             while True:
-                # Check if client has disconnected
                 if await request.is_disconnected():
                     logger.info(f"SSE client disconnected for tenant: {tenant_id}")
                     break

                 try:
-                    # Get message from Redis with timeout
                     message = await asyncio.wait_for(pubsub.get_message(ignore_subscribe_messages=True), timeout=10.0)
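                    # [Editor's note] get_message() returns either None or a dict
                    # shaped like {'type': 'message', 'channel': bytes|str,
                    # 'data': bytes} (standard redis-py pub/sub); the 10-second
                    # wait_for timeout doubles as the heartbeat clock -- 10
                    # consecutive timeouts is roughly 100 s, handled in the
                    # TimeoutError branch below.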
                    if message and message['type'] == 'message':
-                        # Forward the event from Redis
                         event_data = json.loads(message['data'])
-
-                        # Determine event type for SSE
                         event_type = _determine_event_type(event_data)
-
-                        # Add channel metadata for frontend routing
                         event_data['_channel'] = message['channel'].decode('utf-8') if isinstance(message['channel'], bytes) else message['channel']

                         yield f"event: {event_type}\n"
                         yield f"data: {json.dumps(event_data)}\n\n"

-                        logger.debug(f"SSE event sent to tenant {tenant_id}: {event_type} - {event_data.get('title')}")
-
                 except asyncio.TimeoutError:
-                    # Send heartbeat every 10 timeouts (100 seconds)
                     heartbeat_counter += 1
                     if heartbeat_counter >= 10:
                         yield f"event: heartbeat\n"
@@ -563,24 +337,13 @@
         except Exception as e:
             logger.error(f"SSE error for tenant {tenant_id}: {e}", exc_info=True)
         finally:
-            try:
-                if pubsub:
-                    try:
-                        # Unsubscribe from all channels
-                        await pubsub.unsubscribe()
-                        logger.debug(f"Unsubscribed from Redis channels for tenant: {tenant_id}")
-                    except Exception as unsubscribe_error:
-                        logger.error(f"Failed to unsubscribe Redis pubsub for tenant {tenant_id}: {unsubscribe_error}")
-
-                    try:
-                        # Close pubsub connection
-                        await pubsub.close()
-                        logger.debug(f"Closed Redis pubsub connection for tenant: {tenant_id}")
-                    except Exception as close_error:
-                        logger.error(f"Failed to close Redis pubsub for tenant {tenant_id}: {close_error}")
-                logger.info(f"SSE connection closed for tenant: {tenant_id}")
-            except Exception as finally_error:
-                logger.error(f"Error in SSE cleanup for tenant {tenant_id}: {finally_error}")
+            if pubsub:
+                try:
+                    await pubsub.unsubscribe()
+                    await pubsub.close()
+                except Exception as e:
+                    logger.error(f"Error closing pubsub: {e}")
+            logger.info(f"SSE connection closed for tenant: {tenant_id}")

     return StreamingResponse(
         event_generator(),
@@ -593,55 +356,35 @@
         }
     )

+
 # ================================================================
 # WEBSOCKET ROUTING FOR TRAINING SERVICE
 # ================================================================

 @app.websocket("/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live")
 async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_id: str):
-    """
-    Simple WebSocket proxy with token verification only.
-    Validates the token and forwards the connection to the training service.
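# [Editor's sketch] Client-side view of the proxy endpoint defined here -- a
# minimal consumer built on the 'websockets' package. Host and port are
# placeholders; the path and the token query parameter come from the route
# and verification logic below:

import asyncio
import websockets

async def follow_training(tenant_id: str, job_id: str, token: str) -> None:
    url = (f"ws://localhost:8000/api/v1/tenants/{tenant_id}"
           f"/training/jobs/{job_id}/live?token={token}")
    async with websockets.connect(url) as ws:
        async for frame in ws:   # progress frames relayed by the gateway
            print(frame)

# Example (hypothetical values): asyncio.run(follow_training("tenant-abc", "job-123", "<jwt>"))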
- """ - # Get token from query params + """WebSocket proxy with token verification for training service""" token = websocket.query_params.get("token") if not token: - logger.warning("WebSocket proxy rejected - missing token", - job_id=job_id, - tenant_id=tenant_id) await websocket.accept() await websocket.close(code=1008, reason="Authentication token required") return # Verify token from shared.auth.jwt_handler import JWTHandler - jwt_handler = JWTHandler(settings.JWT_SECRET_KEY, settings.JWT_ALGORITHM) try: payload = jwt_handler.verify_token(token) if not payload or not payload.get('user_id'): - logger.warning("WebSocket proxy rejected - invalid token", - job_id=job_id, - tenant_id=tenant_id) await websocket.accept() await websocket.close(code=1008, reason="Invalid token") return - - logger.info("WebSocket proxy - token verified", - user_id=payload.get('user_id'), - tenant_id=tenant_id, - job_id=job_id) - except Exception as e: - logger.warning("WebSocket proxy - token verification failed", - job_id=job_id, - error=str(e)) await websocket.accept() await websocket.close(code=1008, reason="Token verification failed") return - # Accept the connection await websocket.accept() # Build WebSocket URL to training service @@ -649,33 +392,24 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_ training_ws_url = training_service_base.replace('http://', 'ws://').replace('https://', 'wss://') training_ws_url = f"{training_ws_url}/api/v1/tenants/{tenant_id}/training/jobs/{job_id}/live?token={token}" - logger.info("Gateway proxying WebSocket to training service", - job_id=job_id, - training_ws_url=training_ws_url.replace(token, '***')) - training_ws = None try: - # Connect to training service WebSocket import websockets from websockets.protocol import State training_ws = await websockets.connect( training_ws_url, - ping_interval=120, # Send ping every 2 minutes (tolerates long training operations) - ping_timeout=60, # Wait up to 1 minute for pong (graceful timeout) - close_timeout=60, # Increase close timeout for graceful shutdown + ping_interval=120, + ping_timeout=60, + close_timeout=60, open_timeout=30 ) - logger.info("Gateway connected to training service WebSocket", job_id=job_id) - async def forward_frontend_to_training(): - """Forward messages from frontend to training service""" try: while training_ws and training_ws.state == State.OPEN: data = await websocket.receive() - if data.get("type") == "websocket.receive": if "text" in data: await training_ws.send(data["text"]) @@ -683,30 +417,17 @@ async def websocket_training_progress(websocket: WebSocket, tenant_id: str, job_ await training_ws.send(data["bytes"]) elif data.get("type") == "websocket.disconnect": break - except Exception as e: - logger.debug("Frontend to training forward ended", error=str(e)) + except Exception: + pass async def forward_training_to_frontend(): - """Forward messages from training service to frontend""" - message_count = 0 try: while training_ws and training_ws.state == State.OPEN: message = await training_ws.recv() await websocket.send_text(message) - message_count += 1 + except Exception: + pass - # Log every 10th message to track connectivity - if message_count % 10 == 0: - logger.debug("WebSocket proxy active", - job_id=job_id, - messages_forwarded=message_count) - except Exception as e: - logger.info("Training to frontend forward ended", - job_id=job_id, - messages_forwarded=message_count, - error=str(e)) - - # Run both forwarding tasks concurrently await asyncio.gather( 
             forward_frontend_to_training(),
             forward_training_to_frontend(),
@@ -716,20 +437,17 @@
     except Exception as e:
         logger.error("WebSocket proxy error", job_id=job_id, error=str(e))
     finally:
-        # Cleanup
         if training_ws and training_ws.state == State.OPEN:
             try:
                 await training_ws.close()
             except:
                 pass
-
         try:
             if not websocket.client_state.name == 'DISCONNECTED':
                 await websocket.close(code=1000, reason="Proxy closed")
         except:
             pass
-
         logger.info("WebSocket proxy connection closed", job_id=job_id)

 if __name__ == "__main__":
     import uvicorn
diff --git a/infrastructure/helm/signoz-values-dev.yaml b/infrastructure/helm/signoz-values-dev.yaml
index e3334d4f..4554e5e7 100644
--- a/infrastructure/helm/signoz-values-dev.yaml
+++ b/infrastructure/helm/signoz-values-dev.yaml
@@ -48,9 +48,9 @@ signoz:
       signoz_traces_ttl_duration_hrs: "168"
       signoz_metrics_ttl_duration_hrs: "168"
       signoz_logs_ttl_duration_hrs: "168"
-      # OpAMP Server Configuration
-      signoz_opamp_server_enabled: "true"
-      signoz_opamp_server_endpoint: "0.0.0.0:4320"
+      # OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
+      signoz_opamp_server_enabled: "false"
+      # signoz_opamp_server_endpoint: "0.0.0.0:4320"

   persistence:
     enabled: true
@@ -149,9 +149,10 @@ otelCollector:
     repository: signoz/signoz-otel-collector
     tag: v0.129.12  # Latest recommended version

-  # OpAMP Configuration - Enabled for dynamic configuration management
-  # Note: OpAMP allows remote configuration management via SigNoz backend
-  # This replaces the manual kubectl patch approach
+  # OpAMP Configuration - DISABLED for development
+  # OpAMP is designed for production with remote config management
+  # In dev, it causes gRPC instability and collector reloads
+  # We use static configuration instead

   # Init containers for the Otel Collector pod
   initContainers:
@@ -231,6 +232,9 @@ otelCollector:
       secretName: postgres-tls
   - name: postgres-tls-fixed
     emptyDir: {}
+  - name: varlogpods
+    hostPath:
+      path: /var/log/pods

   extraVolumeMounts:
   - name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
   - name: postgres-tls-fixed
     mountPath: /etc/postgres-tls
     readOnly: false
+  - name: varlogpods
+    mountPath: /var/log/pods
+    readOnly: true

-  # Enable OpAMP for dynamic configuration management
+  # Disable OpAMP - use static configuration only
+  # Use 'args' instead of 'extraArgs' to completely override the command
   command:
     name: /signoz-otel-collector
-    extraArgs:
+    args:
       - --config=/conf/otel-collector-config.yaml
-      - --manager-config=/conf/otel-collector-opamp-config.yaml
       - --feature-gates=-pkg.translator.prometheus.NormalizeName

   # OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
             allowed_origins:
               - "*"

+      # Filelog receiver for Kubernetes pod logs
+      # Collects container stdout/stderr from /var/log/pods
+      filelog:
+        include:
+          - /var/log/pods/*/*/*.log
+        exclude:
+          # Exclude SigNoz's own logs to avoid recursive collection
+          - /var/log/pods/bakery-ia_signoz-*/*/*.log
+        include_file_path: true
+        include_file_name: false
+        operators:
+          # Parse CRI-O / containerd log format
+          - type: regex_parser
+            regex: '^(?P
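# [Editor's note] The regex value above is truncated in this hunk and is left
# as found. For reference, a commonly used containerd/CRI parse chain for the
# filelog receiver looks like the sketch below; the regex and timestamp layout
# are illustrative defaults, not values recovered from this patch:
#
#       operators:
#         - type: regex_parser
#           regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
#           timestamp:
#             parse_from: attributes.time
#             layout: '%Y-%m-%dT%H:%M:%S.%LZ'
#         - type: move
#           from: attributes.log
#           to: body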