Improve monitoring 5

Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions


@@ -48,9 +48,9 @@ signoz:
signoz_traces_ttl_duration_hrs: "168"
signoz_metrics_ttl_duration_hrs: "168"
signoz_logs_ttl_duration_hrs: "168"
# OpAMP Server Configuration
signoz_opamp_server_enabled: "true"
signoz_opamp_server_endpoint: "0.0.0.0:4320"
# OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
persistence:
enabled: true
@@ -149,9 +149,10 @@ otelCollector:
repository: signoz/signoz-otel-collector
tag: v0.129.12 # Latest recommended version
# OpAMP Configuration - Enabled for dynamic configuration management
# Note: OpAMP allows remote configuration management via SigNoz backend
# This replaces the manual kubectl patch approach
# OpAMP Configuration - DISABLED for development
# OpAMP is designed for production with remote config management
# In dev, it causes gRPC instability and collector reloads
# We use static configuration instead
# Init containers for the Otel Collector pod
initContainers:
@@ -231,6 +232,9 @@ otelCollector:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
# Enable OpAMP for dynamic configuration management
# Disable OpAMP - use static configuration only
# Use 'args' instead of 'extraArgs' to completely override the command
command:
name: /signoz-otel-collector
extraArgs:
args:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
allowed_origins:
- "*"
# Filelog receiver for Kubernetes pod logs
# Collects container stdout/stderr from /var/log/pods
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
# Exclude SigNoz's own logs to avoid recursive collection
- /var/log/pods/bakery-ia_signoz-*/*/*.log
include_file_path: true
include_file_name: false
operators:
# Parse CRI-O / containerd log format
- type: regex_parser
regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# Fix timestamp parsing - extract from the parsed time field
- type: move
from: attributes.time
to: attributes.timestamp
# Extract Kubernetes metadata from file path
- type: regex_parser
id: extract_metadata_from_filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
parse_from: attributes["log.file.path"]
# Move metadata to resource attributes
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.log
to: body
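The two regex_parser operators above do the heavy lifting: the first splits the containerd/CRI-O line format, the second recovers pod metadata from the log file path. A minimal sketch of what they extract, using Python's re module with the same patterns; the sample line and path are illustrative, not taken from the cluster:

```python
import re

# Same patterns as the filelog operators above.
CRI_LINE = re.compile(
    r'^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
)
POD_PATH = re.compile(
    r'^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)'
    r'\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
)

# Hypothetical examples of what containerd writes under /var/log/pods.
line = '2026-01-09T22:14:12.123456789Z stdout F {"level":"info","msg":"order created"}'
path = '/var/log/pods/bakery-ia_orders-api-7d9f_0b1c2d3e/orders-api/0.log'

print(CRI_LINE.match(line).groupdict())
# {'time': '2026-01-09T22:14:12.123456789Z', 'stream': 'stdout',
#  'logtag': 'F', 'log': '{"level":"info","msg":"order created"}'}
print(POD_PATH.match(path).groupdict())
# {'namespace': 'bakery-ia', 'pod_name': 'orders-api-7d9f', 'uid': '0b1c2d3e',
#  'container_name': 'orders-api', 'restart_count': '0'}
```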
# Kubernetes Cluster Receiver - Collects cluster-level metrics
# Provides information about nodes, namespaces, pods, and other cluster resources
k8s_cluster:
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- pods
# PostgreSQL receivers for database metrics
# ENABLED: Monitor users configured and credentials stored in secrets
# Collects metrics directly from PostgreSQL databases with proper TLS
@@ -538,6 +602,43 @@ otelCollector:
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# Prometheus Receiver - Scrapes metrics from Kubernetes API
# Simplified configuration using only Kubernetes API metrics
prometheus:
config:
scrape_configs:
- job_name: 'kubernetes-nodes-cadvisor'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
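The relabeling above routes every node scrape through the API server proxy, so the collector never talks to kubelets directly: the effective scrape URL becomes https://kubernetes.default.svc:443/api/v1/nodes/<node>/proxy/metrics/cadvisor, authenticated with the pod's ServiceAccount token. A rough equivalent of a single scrape in Python (requests assumed available in a debug pod; certificate verification disabled to mirror insecure_skip_verify):

```python
import requests

TOKEN_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/token"
NODE = "my-node"  # hypothetical node name; list real ones via GET /api/v1/nodes

with open(TOKEN_PATH) as f:
    token = f.read().strip()

resp = requests.get(
    f"https://kubernetes.default.svc:443/api/v1/nodes/{NODE}/proxy/metrics/cadvisor",
    headers={"Authorization": f"Bearer {token}"},
    verify=False,  # mirrors insecure_skip_verify: true above
    timeout=10,    # matches scrape_timeout: 10s
)
resp.raise_for_status()
# Prometheus text exposition format; show the first few cadvisor series.
# Requires the nodes/proxy RBAC rule granted further down in this file.
print("\n".join(resp.text.splitlines()[:10]))
```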
processors:
# Batch processor for better performance (optimized for high throughput)
batch:
@@ -562,6 +663,25 @@ otelCollector:
detectors: [env, system, docker]
timeout: 5s
# Kubernetes attributes processor - CRITICAL for logs
# Extracts pod, namespace, container metadata from log attributes
k8sattributes:
auth_type: "serviceAccount"
passthrough: false
extract:
metadata:
- k8s.pod.name
- k8s.pod.uid
- k8s.deployment.name
- k8s.namespace.name
- k8s.node.name
- k8s.container.name
labels:
- tag_name: "app"
- tag_name: "pod-template-hash"
annotations:
- tag_name: "description"
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq]
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection]
exporters: [signozclickhousemetrics]
@@ -653,17 +773,38 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, k8sattributes]
exporters: [clickhouselogsexporter]
# Additional Configuration
serviceAccount:
create: true
annotations: {}
name: ""
name: "signoz-otel-collector"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
rbac:
create: true
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
resources: ["deployments", "daemonsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
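A quick way to confirm the resulting ClusterRole actually grants what the receivers need is a SelfSubjectAccessReview issued from a pod running under the same ServiceAccount. A sketch with the official kubernetes Python client, assuming it is installed in a debug or ephemeral container:

```python
from kubernetes import client, config

config.load_incluster_config()  # uses the mounted ServiceAccount token
authz = client.AuthorizationV1Api()

# (api_group, resource) pairs taken from the rules above.
checks = [("", "pods"), ("", "nodes"), ("apps", "deployments"), ("batch", "jobs")]

for group, resource in checks:
    review = client.V1SelfSubjectAccessReview(
        spec=client.V1SelfSubjectAccessReviewSpec(
            resource_attributes=client.V1ResourceAttributes(
                group=group, resource=resource, verb="list"
            )
        )
    )
    result = authz.create_self_subject_access_review(review)
    print(f"list {group or 'core'}/{resource}: allowed={result.status.allowed}")
```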
# Security Context
securityContext:


@@ -66,6 +66,11 @@ signoz:
signoz_traces_ttl_duration_hrs: "720"
signoz_metrics_ttl_duration_hrs: "720"
signoz_logs_ttl_duration_hrs: "720"
# OpAMP Server Configuration
# WARNING: OpAMP can cause gRPC instability and collector reloads
# Only enable if you have a stable OpAMP backend server
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
# SMTP configuration for email alerts
signoz_smtp_enabled: "true"
signoz_smtp_host: "smtp.gmail.com"
@@ -247,17 +252,52 @@ otelCollector:
tag: v0.129.12 # Updated to latest recommended version
pullPolicy: IfNotPresent
# Init containers for the Otel Collector pod
initContainers:
fix-postgres-tls:
enabled: true
image:
registry: docker.io
repository: busybox
tag: 1.35
pullPolicy: IfNotPresent
command:
- sh
- -c
- |
echo "Fixing PostgreSQL TLS file permissions..."
cp /etc/postgres-tls-source/* /etc/postgres-tls/
chmod 600 /etc/postgres-tls/server-key.pem
chmod 644 /etc/postgres-tls/server-cert.pem
chmod 644 /etc/postgres-tls/ca-cert.pem
echo "PostgreSQL TLS permissions fixed"
volumeMounts:
- name: postgres-tls-source
mountPath: /etc/postgres-tls-source
readOnly: true
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
service:
type: ClusterIP
ports:
- name: otlp-grpc
port: 4317
targetPort: 4317
protocol: TCP
- name: otlp-http
port: 4318
targetPort: 4318
protocol: TCP
- name: prometheus
port: 8889
targetPort: 8889
protocol: TCP
- name: metrics
port: 8888
- name: healthcheck
port: 13133
targetPort: 8888
protocol: TCP
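With otlp-http exposed on 4318, any pod can push telemetry to the collector without an SDK. A minimal smoke test that posts one log record in OTLP/JSON; the service DNS name is an assumption based on the chart and namespace used elsewhere in these values, so adjust it to the actual collector Service:

```python
import json
import time

import requests

# Assumed in-cluster service name; replace with the real collector Service.
ENDPOINT = "http://signoz-otel-collector.bakery-ia:4318/v1/logs"

payload = {
    "resourceLogs": [{
        "resource": {"attributes": [
            {"key": "service.name", "value": {"stringValue": "smoke-test"}}
        ]},
        "scopeLogs": [{
            "logRecords": [{
                "timeUnixNano": str(time.time_ns()),
                "severityText": "INFO",
                "body": {"stringValue": "hello from the OTLP/HTTP smoke test"},
            }]
        }]
    }]
}

resp = requests.post(ENDPOINT, data=json.dumps(payload),
                     headers={"Content-Type": "application/json"}, timeout=5)
print(resp.status_code, resp.text)  # 200 on acceptance by the otlp receiver
```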
resources:
requests:
@@ -267,6 +307,50 @@ otelCollector:
cpu: 2000m
memory: 2Gi
# Additional environment variables for receivers
additionalEnvs:
POSTGRES_MONITOR_USER: "monitoring"
POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
RABBITMQ_USER: "bakery"
RABBITMQ_PASSWORD: "forecast123"
# Mount TLS certificates for secure connections
extraVolumes:
- name: redis-tls
secret:
secretName: redis-tls-secret
- name: postgres-tls
secret:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
mountPath: /etc/redis-tls
readOnly: true
- name: postgres-tls
mountPath: /etc/postgres-tls-source
readOnly: true
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
# Enable OpAMP for dynamic configuration management
command:
name: /signoz-otel-collector
extraArgs:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# Full OTEL Collector Configuration
config:
# Connectors - bridge between pipelines
@@ -297,14 +381,358 @@ otelCollector:
- "https://monitoring.bakewise.ai"
- "https://*.bakewise.ai"
# Filelog receiver for Kubernetes pod logs
# Collects container stdout/stderr from /var/log/pods
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
# Exclude SigNoz's own logs to avoid recursive collection
- /var/log/pods/bakery-ia_signoz-*/*/*.log
include_file_path: true
include_file_name: false
operators:
# Parse CRI-O / containerd log format
- type: regex_parser
regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# Fix timestamp parsing - extract from the parsed time field
- type: move
from: attributes.time
to: attributes.timestamp
# Extract Kubernetes metadata from file path
- type: regex_parser
id: extract_metadata_from_filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
parse_from: attributes["log.file.path"]
# Move metadata to resource attributes
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
- type: move
from: attributes.log
to: body
# Kubernetes Cluster Receiver - Collects cluster-level metrics
# Provides information about nodes, namespaces, pods, and other cluster resources
k8s_cluster:
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- pods
# Prometheus receiver for scraping metrics
prometheus:
config:
scrape_configs:
- job_name: 'otel-collector'
- job_name: 'kubernetes-nodes-cadvisor'
scrape_interval: 30s
static_configs:
- targets: ['localhost:8888']
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Redis receiver for cache metrics
# ENABLED: Using existing credentials from redis-secrets with TLS
redis:
endpoint: redis-service.bakery-ia:6379
password: ${env:REDIS_PASSWORD}
collection_interval: 60s
transport: tcp
tls:
insecure_skip_verify: false
cert_file: /etc/redis-tls/redis-cert.pem
key_file: /etc/redis-tls/redis-key.pem
ca_file: /etc/redis-tls/ca-cert.pem
metrics:
redis.maxmemory:
enabled: true
redis.cmd.latency:
enabled: true
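Because the receiver authenticates with mutual TLS against the redis-tls mount, a direct connectivity check with the same certificate paths helps rule out certificate or permission issues before suspecting the collector. A sketch with redis-py, assuming it is available in a pod that mounts the same secret and password:

```python
import os

import redis  # redis-py

r = redis.Redis(
    host="redis-service.bakery-ia",
    port=6379,
    password=os.environ["REDIS_PASSWORD"],
    ssl=True,
    ssl_certfile="/etc/redis-tls/redis-cert.pem",
    ssl_keyfile="/etc/redis-tls/redis-key.pem",
    ssl_ca_certs="/etc/redis-tls/ca-cert.pem",
)
print(r.ping())                       # True if TLS and auth both succeed
print(r.info("memory")["maxmemory"])  # same figure the redis.maxmemory metric reports
```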
# RabbitMQ receiver via management API
# ENABLED: Using existing credentials from rabbitmq-secrets
rabbitmq:
endpoint: http://rabbitmq-service.bakery-ia:15672
username: ${env:RABBITMQ_USER}
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# PostgreSQL receivers for database metrics
# Monitor all databases with proper TLS configuration
postgresql/auth:
endpoint: auth-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- auth_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/inventory:
endpoint: inventory-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- inventory_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/orders:
endpoint: orders-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- orders_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/ai-insights:
endpoint: ai-insights-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- ai_insights_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/alert-processor:
endpoint: alert-processor-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- alert_processor_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/distribution:
endpoint: distribution-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- distribution_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/external:
endpoint: external-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- external_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/forecasting:
endpoint: forecasting-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- forecasting_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/notification:
endpoint: notification-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- notification_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/orchestrator:
endpoint: orchestrator-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- orchestrator_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/pos:
endpoint: pos-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- pos_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/procurement:
endpoint: procurement-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- procurement_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/production:
endpoint: production-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- production_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/recipes:
endpoint: recipes-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- recipes_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/sales:
endpoint: sales-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- sales_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/suppliers:
endpoint: suppliers-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- suppliers_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/tenant:
endpoint: tenant-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- tenant_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/training:
endpoint: training-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- training_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
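All eighteen postgresql/* receivers share the same monitoring user and the certificates rewritten by the fix-postgres-tls init container, so if one of them reports scrape errors the fastest check is a direct connection with identical parameters. A sketch with psycopg2, assuming it is installed where the env vars and cert mounts from this values file are present:

```python
import os

import psycopg2

conn = psycopg2.connect(
    host="auth-db-service.bakery-ia",  # any of the *-db-service endpoints above
    port=5432,
    dbname="auth_db",
    user=os.environ["POSTGRES_MONITOR_USER"],
    password=os.environ["POSTGRES_MONITOR_PASSWORD"],
    sslmode="verify-ca",
    sslrootcert="/etc/postgres-tls/ca-cert.pem",
    sslcert="/etc/postgres-tls/server-cert.pem",
    sslkey="/etc/postgres-tls/server-key.pem",
)
with conn.cursor() as cur:
    # pg_stat_database is one of the views the postgresql receiver reads.
    cur.execute(
        "SELECT numbackends, xact_commit FROM pg_stat_database WHERE datname = %s",
        ("auth_db",),
    )
    print(cur.fetchone())
conn.close()
```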
processors:
# High-performance batch processing (official recommendation)
@@ -326,7 +754,7 @@ otelCollector:
# Resource detection for K8s
resourcedetection:
detectors: [env, system, docker, kubernetes]
detectors: [env, system, docker]
timeout: 5s
# Add resource attributes
@@ -339,6 +767,26 @@ otelCollector:
value: bakery-ia-prod
action: upsert
# Kubernetes attributes processor - CRITICAL for logs
# Extracts pod, namespace, container metadata from log attributes
k8sattributes:
auth_type: "serviceAccount"
passthrough: false
extract:
metadata:
- k8s.pod.name
- k8s.pod.uid
- k8s.deployment.name
- k8s.namespace.name
- k8s.node.name
- k8s.container.name
labels:
- tag_name: "app"
- tag_name: "pod-template-hash"
- tag_name: "version"
annotations:
- tag_name: "description"
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -354,9 +802,9 @@ otelCollector:
- name: signoz.collector.id
exporters:
# Export to SigNoz ClickHouse
# ClickHouse exporter for traces
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
datasource: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_traces
timeout: 10s
retry_on_failure:
enabled: true
@@ -364,8 +812,9 @@ otelCollector:
max_interval: 30s
max_elapsed_time: 300s
# ClickHouse exporter for metrics
signozclickhousemetrics:
endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
timeout: 10s
retry_on_failure:
enabled: true
@@ -375,32 +824,32 @@ otelCollector:
# ClickHouse exporter for meter data (usage metrics)
signozclickhousemeter:
dsn: "tcp://clickhouse:9000/?database=signoz_meter"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_meter"
timeout: 45s
sending_queue:
enabled: false
# ClickHouse exporter for logs
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/?database=signoz_logs
dsn: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_logs
timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# Metadata exporter for service metadata
metadataexporter:
dsn: "tcp://clickhouse:9000/?database=signoz_metadata"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metadata"
timeout: 10s
cache:
provider: in_memory
# Debug exporter for debugging (replaces deprecated logging exporter)
# Debug exporter for debugging (optional)
debug:
verbosity: detailed
sampling_initial: 2
sampling_thereafter: 500
sampling_initial: 5
sampling_thereafter: 200
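The exporter DSNs now carry explicit credentials and point at the signoz-clickhouse service, so if exports fail after this change it is worth checking the DSN independently to separate auth problems from exporter problems. A sketch using clickhouse-driver (an assumed tooling choice, not part of the chart), with the password read from an env var rather than pasted inline:

```python
import os

from clickhouse_driver import Client

client = Client(
    host="signoz-clickhouse",
    port=9000,
    user="admin",
    password=os.environ["CLICKHOUSE_PASSWORD"],  # same value as in the DSNs above
    database="signoz_logs",
)
# Confirms auth works and that the database clickhouselogsexporter writes to exists.
print(client.execute("SELECT currentDatabase(), version()"))
print(client.execute("SHOW TABLES"))
```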
service:
extensions: [health_check, zpages]
@@ -411,9 +860,16 @@ otelCollector:
processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection, resource]
exporters: [clickhousetraces, metadataexporter, signozmeter]
# Metrics pipeline
# Metrics pipeline - includes all infrastructure receivers
metrics:
receivers: [otlp, prometheus]
receivers: [otlp,
postgresql/auth, postgresql/inventory, postgresql/orders,
postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
postgresql/external, postgresql/forecasting, postgresql/notification,
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [signozclickhousemetrics]
@@ -423,10 +879,10 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, resource, k8sattributes]
exporters: [clickhouselogsexporter]
# HPA for OTEL Collector
@@ -455,6 +911,27 @@ serviceAccount:
annotations: {}
name: "signoz"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster receiver to access Kubernetes API
rbac:
create: true
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
resources: ["deployments", "daemonsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
# Security Context
securityContext:
runAsNonRoot: true