# bakery-ia/infrastructure/monitoring/signoz/signoz-values-prod.yaml

# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml

global:
  # Docker Hub pull secret, applied to all sub-charts (ZooKeeper, ClickHouse, ...).
  imagePullSecrets: [dockerhub-creds]
  clusterName: bakery-ia-prod
  domain: monitoring.bakewise.ai
  # On MicroK8s use "microk8s-hostpath" or a custom storage class.
  # NOTE(review): the per-component persistence blocks in this file request
  # "standard" instead - confirm which class actually exists in the cluster.
  storageClass: microk8s-hostpath

# Root-level Docker Hub pull secret, used when pulling the SigNoz component images.
imagePullSecrets: [dockerhub-creds]

# SigNoz main component: a single workload combining frontend and query service.
# BREAKING CHANGE: v0.89.0+ ships this unified component instead of the separate
# frontend/queryService components.
signoz:
  replicaCount: 2

  image:
    repository: 'signoz/signoz'
    # Latest stable version
    tag: 'v0.106.0'
    pullPolicy: 'IfNotPresent'

  service:
    type: 'ClusterIP'
    # HTTP/API port
    port: 8080
    # Internal gRPC port
    internalPort: 8085

  # The chart's built-in ingress is DISABLED; the unified bakery-ingress-prod is used.
  # Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
  ingress:
    enabled: false

  resources:
    limits:
      cpu: '2000m'
      memory: '4Gi'
    requests:
      cpu: '500m'
      memory: '1Gi'

  # Pod anti-affinity for HA: prefer placing SigNoz replicas on different nodes
  # (soft rule, weight 100, keyed on the node hostname).
  # The selector targets the unified "signoz" component. The former
  # "query-service" component label belongs to the pre-v0.89.0 split deployment,
  # so a selector on it would match no pods and the rule would have no effect.
  # NOTE(review): confirm with `kubectl get pods --show-labels` that the pods
  # carry app.kubernetes.io/component=signoz.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/component: signoz
            topologyKey: kubernetes.io/hostname

  # Environment variables (new format - replaces configVars).
  # All values are strings, including booleans and numbers.
  env:
    signoz_telemetrystore_provider: 'clickhouse'
    signoz_alertmanager_provider: 'signoz'
    dot_metrics_enabled: 'true'
    signoz_emailing_enabled: 'true'

    # Retention for prod: 720 hours = 30 days for traces, metrics and logs.
    signoz_traces_ttl_duration_hrs: '720'
    signoz_metrics_ttl_duration_hrs: '720'
    signoz_logs_ttl_duration_hrs: '720'

    # OpAMP server - currently switched off.
    # WARNING: OpAMP can cause gRPC instability and collector reloads.
    # Only enable it when a stable OpAMP backend server is available.
    signoz_opamp_server_enabled: 'false'
    # signoz_opamp_server_endpoint: "0.0.0.0:4320"

    # SMTP for email alerts - Mailu acts as the SMTP server.
    signoz_smtp_enabled: 'true'
    signoz_smtp_host: 'mailu-postfix.bakery-ia.svc.cluster.local'
    signoz_smtp_port: '587'
    signoz_smtp_from: 'alerts@bakewise.ai'
    signoz_smtp_username: 'alerts@bakewise.ai'
    # The password is intentionally absent: set signoz_smtp_password via a secret.

  persistence:
    enabled: true
    # NOTE(review): global.storageClass is "microk8s-hostpath" - confirm that a
    # "standard" class exists in this cluster.
    storageClass: standard
    size: '20Gi'

  # Horizontal Pod Autoscaler: between 2 and 5 replicas, with utilisation
  # targets of 70% CPU and 80% memory.
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager: two replicas with persistent storage.
alertmanager:
  enabled: true
  replicaCount: 2

  image:
    repository: 'signoz/alertmanager'
    tag: '0.23.5'
    pullPolicy: 'IfNotPresent'

  service:
    type: 'ClusterIP'
    port: 9093

  resources:
    limits:
      cpu: '500m'
      memory: '512Mi'
    requests:
      cpu: '100m'
      memory: '128Mi'

  # Pod anti-affinity for HA: soft preference (weight 100) to keep pods labelled
  # app=signoz-alertmanager on different nodes.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            topologyKey: kubernetes.io/hostname
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values: [signoz-alertmanager]

  persistence:
    enabled: true
    # NOTE(review): global.storageClass is "microk8s-hostpath" - confirm that a
    # "standard" class exists in this cluster.
    storageClass: standard
    size: '5Gi'

  # Alertmanager routing and notification settings.
  # NOTE(review): upstream Alertmanager does not expand ${VAR} placeholders in its
  # config file - confirm the deploy pipeline substitutes ${SMTP_PASSWORD} and
  # ${SLACK_WEBHOOK_URL} before this reaches the pod.
  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: "mailu-postfix.bakery-ia.svc.cluster.local:587"
      smtp_from: "alerts@bakewise.ai"
      smtp_auth_username: "alerts@bakewise.ai"
      smtp_auth_password: "${SMTP_PASSWORD}"
      smtp_require_tls: true

    route:
      # Top-level receiver for the route tree.
      receiver: "critical-alerts"
      group_by:
        - "alertname"
        - "cluster"
        - "service"
        - "severity"
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      routes:
        - receiver: "critical-alerts"
          match:
            severity: critical
          continue: true
        - receiver: "warning-alerts"
          match:
            severity: warning

    receivers:
      # Critical alerts: email plus the Slack webhook.
      - name: "critical-alerts"
        email_configs:
          - to: "critical-alerts@bakewise.ai"
            headers:
              Subject: "[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA"
        slack_configs:
          - api_url: "${SLACK_WEBHOOK_URL}"
            channel: "#alerts-critical"
            title: "[CRITICAL] {{ .GroupLabels.alertname }}"
            text: "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}"

      # Warning alerts: email to the on-call address only.
      - name: "warning-alerts"
        email_configs:
          - to: "oncall@bakewise.ai"
            headers:
              Subject: "[WARNING] {{ .GroupLabels.alertname }} - Bakery IA"

# ClickHouse - the time series database behind SigNoz.
clickhouse:
  enabled: true
  installCustomStorageClass: false

  image:
    registry: 'docker.io'
    repository: 'clickhouse/clickhouse-server'
    # Updated to official recommended version
    tag: '25.5.6'
    pullPolicy: 'IfNotPresent'

  # Resource settings sit under a nested "clickhouse" key.
  clickhouse:
    resources:
      limits:
        cpu: '4000m'
        memory: '8Gi'
      requests:
        cpu: '1000m'
        memory: '2Gi'

  # Pod anti-affinity for HA: a hard (required) rule, so pods labelled
  # app=signoz-clickhouse are not co-scheduled on the same node.
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - topologyKey: kubernetes.io/hostname
          labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values: [signoz-clickhouse]

  persistence:
    enabled: true
    # NOTE(review): global.storageClass is "microk8s-hostpath" - confirm that a
    # "standard" class exists in this cluster.
    storageClass: standard
    size: '100Gi'

  # Cold storage configuration for better disk space management
  coldStorage:
    enabled: true
    # Keep 10 GiB free (10 * 1024^3 bytes). Quoted on purpose: Helm loads bare
    # YAML numbers as float64, so a large integer can be rendered in scientific
    # notation (1.073741824e+10) once it is templated into the ClickHouse config.
    defaultKeepFreeSpaceBytes: "10737418240"
    ttl:
      # NOTE(review): the key name indicates data is deleted after 30 days rather
      # than moved to cold storage - confirm the intended behaviour against the chart.
      deleteTTLDays: 30

# ZooKeeper - coordination service required by ClickHouse.
zookeeper:
  enabled: true
  # CRITICAL: Always use 3 replicas for production HA
  replicaCount: 3

  image:
    # Official recommended version
    tag: '3.7.1'

  resources:
    limits:
      cpu: '500m'
      memory: '512Mi'
    requests:
      cpu: '100m'
      memory: '256Mi'

  persistence:
    enabled: true
    # NOTE(review): global.storageClass is "microk8s-hostpath" - confirm that a
    # "standard" class exists in this cluster.
    storageClass: standard
    size: '10Gi'

# OpenTelemetry Collector (SigNoz distribution), deployed with the SigNoz chart.
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: 'signoz/signoz-otel-collector'
    # Updated to latest recommended version
    tag: 'v0.129.12'
    pullPolicy: 'IfNotPresent'

  # Init containers for the Otel Collector pod.
  # fix-postgres-tls copies the PostgreSQL TLS files from the read-only source
  # mount into a writable volume and tightens their permissions (600 for the
  # private key, 644 for the certificates).
  initContainers:
    fix-postgres-tls:
      enabled: true
      image:
        registry: docker.io
        repository: busybox
        # Quoted: a bare 1.35 is a YAML float, and float tags lose trailing
        # zeros (e.g. 1.30 would become 1.3).
        tag: "1.35"
        pullPolicy: IfNotPresent
      command:
        - sh
        - -c
        - |
          # Abort on the first failing command so a broken copy/chmod is not
          # hidden behind the final echo's zero exit status.
          set -e
          echo "Fixing PostgreSQL TLS file permissions..."
          cp /etc/postgres-tls-source/* /etc/postgres-tls/
          chmod 600 /etc/postgres-tls/server-key.pem
          chmod 644 /etc/postgres-tls/server-cert.pem
          chmod 644 /etc/postgres-tls/ca-cert.pem
          echo "PostgreSQL TLS permissions fixed"
      volumeMounts:
        # Source of the TLS files, mounted read-only.
        - name: postgres-tls-source
          mountPath: /etc/postgres-tls-source
          readOnly: true
        # Writable destination that receives the copied files.
        - name: postgres-tls-fixed
          mountPath: /etc/postgres-tls
          readOnly: false

  # ClusterIP service exposing OTLP (gRPC 4317, HTTP 4318) plus the ports named
  # "prometheus" (8889) and "metrics" (8888).
  service:
    type: 'ClusterIP'
    ports:
      - name: 'otlp-grpc'
        protocol: 'TCP'
        port: 4317
        targetPort: 4317
      - name: 'otlp-http'
        protocol: 'TCP'
        port: 4318
        targetPort: 4318
      - name: 'prometheus'
        protocol: 'TCP'
        port: 8889
        targetPort: 8889
      - name: 'metrics'
        protocol: 'TCP'
        port: 8888
        targetPort: 8888

  resources:
    limits:
      cpu: '2000m'
      memory: '2Gi'
    requests:
      cpu: '500m'
      memory: '512Mi'

  # Additional environment variables for receivers (read via ${env:...} in the
  # collector config).
  # Passwords are sourced from Kubernetes Secrets instead of being committed in
  # plain text. The literal values previously stored here should be treated as
  # leaked and rotated.
  # TODO(review): confirm every Secret name and key below against the objects in
  # the bakery-ia namespace before rollout - a wrong reference keeps the pod from
  # starting. "redis-secrets" and "rabbitmq-secrets" come from the receiver
  # comments in this file; the PostgreSQL secret name is a placeholder.
  additionalEnvs:
    POSTGRES_MONITOR_USER: "monitoring"
    POSTGRES_MONITOR_PASSWORD:
      valueFrom:
        secretKeyRef:
          name: postgres-monitor-secrets
          key: POSTGRES_MONITOR_PASSWORD
    REDIS_PASSWORD:
      valueFrom:
        secretKeyRef:
          name: redis-secrets
          key: REDIS_PASSWORD
    RABBITMQ_USER: "bakery"
    RABBITMQ_PASSWORD:
      valueFrom:
        secretKeyRef:
          name: rabbitmq-secrets
          key: RABBITMQ_PASSWORD

  # Extra volumes: TLS certificates for secure connections, a scratch volume for
  # the permission-fixed PostgreSQL files, and the node's pod log directory.
  extraVolumes:
    - name: redis-tls
      secret: {secretName: redis-tls-secret}
    - name: postgres-tls
      secret: {secretName: postgres-tls}
    - name: postgres-tls-fixed
      emptyDir: {}
    - name: varlogpods
      hostPath: {path: /var/log/pods}

  # Mount points for the volumes above. Only postgres-tls-fixed is writable.
  extraVolumeMounts:
    - mountPath: /etc/redis-tls
      name: redis-tls
      readOnly: true
    - mountPath: /etc/postgres-tls-source
      name: postgres-tls
      readOnly: true
    - mountPath: /etc/postgres-tls
      name: postgres-tls-fixed
      readOnly: false
    - mountPath: /var/log/pods
      name: varlogpods
      readOnly: true

  # Collector binary and extra CLI flags. --manager-config points at the OpAMP
  # config file used for dynamic configuration management.
  # NOTE(review): signoz.env sets signoz_opamp_server_enabled to "false" - confirm
  # the collector should still be started with --manager-config.
  command:
    name: '/signoz-otel-collector'
    extraArgs:
      - '--config=/conf/otel-collector-config.yaml'
      - '--manager-config=/conf/otel-collector-opamp-config.yaml'
      - '--feature-gates=-pkg.translator.prometheus.NormalizeName'

  # Full OTEL Collector configuration.
  config:
    # Connectors bridge one pipeline's output into another pipeline's input.
    connectors:
      signozmeter:
        metrics_flush_interval: 1h
        dimensions:
          - {name: service.name}
          - {name: deployment.environment}
          - {name: host.name}

    extensions:
      health_check:
        endpoint: '0.0.0.0:13133'
      zpages:
        endpoint: '0.0.0.0:55679'

    receivers:
      # OTLP receiver: gRPC on 4317 and HTTP on 4318, listening on all interfaces.
      otlp:
        protocols:
          grpc:
            endpoint: '0.0.0.0:4317'
            # Increased for larger payloads
            max_recv_msg_size_mib: 32
          http:
            endpoint: '0.0.0.0:4318'
            cors:
              allowed_origins:
                - 'https://monitoring.bakewise.ai'
                - 'https://*.bakewise.ai'

      # Filelog receiver for Kubernetes pod logs.
      # Reads container stdout/stderr files from /var/log/pods, parses each line,
      # derives namespace/pod/container from the file path and puts the raw log
      # text into the record body. Operators run in the order listed.
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          # Exclude SigNoz's own logs to avoid recursive collection
          - /var/log/pods/bakery-ia_signoz-*/*/*.log
        # The file path is kept as an attribute; the metadata regex below reads it.
        include_file_path: true
        include_file_name: false
        operators:
          # Parse CRI-O / containerd log format: "<time> <stream> <logtag> <log>".
          # The captured time is also used as the record timestamp.
          - type: regex_parser
            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
            timestamp:
              parse_from: attributes.time
              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
          # Rename the raw time string from attributes.time to attributes.timestamp.
          - type: move
            from: attributes.time
            to: attributes.timestamp
          # Extract Kubernetes metadata from the file path:
          # .../<namespace>_<pod_name>_<uid>/<container_name>/<restart_count>.log
          - type: regex_parser
            id: extract_metadata_from_filepath
            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
            parse_from: attributes["log.file.path"]
          # Promote namespace, pod and container names to resource attributes.
          # uid and restart_count are captured above but stay as plain attributes.
          # NOTE(review): confirm whether the k8sattributes processor needs
          # k8s.pod.uid on the resource to associate these file-based logs.
          - type: move
            from: attributes.namespace
            to: resource["k8s.namespace.name"]
          - type: move
            from: attributes.pod_name
            to: resource["k8s.pod.name"]
          - type: move
            from: attributes.container_name
            to: resource["k8s.container.name"]
          # The captured log text becomes the record body.
          - type: move
            from: attributes.log
            to: body

      # Kubernetes Cluster Receiver - collects cluster-level metrics about nodes,
      # namespaces, pods and other cluster resources every 30s.
      k8s_cluster:
        collection_interval: 30s
        node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable]
        allocatable_types_to_report: [cpu, memory, pods]

      # Prometheus receiver for scraping metrics.
      # The collector applies ${...} expansion to its whole config, so a literal
      # "$" inside Prometheus relabel rules must be escaped as "$$". An unescaped
      # ${1} is treated as an environment-variable reference, not a regex group.
      prometheus:
        config:
          scrape_configs:
            # cAdvisor metrics for every node, fetched through the API server's
            # node proxy endpoint.
            - job_name: 'kubernetes-nodes-cadvisor'
              scrape_interval: 30s
              scrape_timeout: 10s
              scheme: https
              tls_config:
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: node
              relabel_configs:
                # Copy node labels onto the scraped series.
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                # Send every scrape to the API server rather than the node itself.
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                # Build the per-node proxy path; $${1} is the captured node name.
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
            # API server metrics: keep only the default/kubernetes service's https port.
            - job_name: 'kubernetes-apiserver'
              scrape_interval: 30s
              scrape_timeout: 10s
              scheme: https
              tls_config:
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
                  action: keep
                  regex: default;kubernetes;https

      # Redis receiver for cache metrics
      # ENABLED: Using existing credentials from redis-secrets with TLS
      redis:
        endpoint: redis-service.bakery-ia:6379
        password: ${env:REDIS_PASSWORD}
        collection_interval: 60s
        transport: tcp
        tls:
          # The receiver's default is insecure: true, which disables client TLS
          # and ignores the certificate files below. Set explicitly, matching the
          # postgresql receivers in this file.
          insecure: false
          insecure_skip_verify: false
          cert_file: /etc/redis-tls/redis-cert.pem
          key_file: /etc/redis-tls/redis-key.pem
          ca_file: /etc/redis-tls/ca-cert.pem
        # Optional metrics switched on explicitly.
        metrics:
          redis.maxmemory:
            enabled: true
          redis.cmd.latency:
            enabled: true

      # RabbitMQ receiver - polls the management API on port 15672 every 30s.
      # ENABLED: Using existing credentials from rabbitmq-secrets
      rabbitmq:
        endpoint: 'http://rabbitmq-service.bakery-ia:15672'
        collection_interval: 30s
        username: '${env:RABBITMQ_USER}'
        password: '${env:RABBITMQ_PASSWORD}'

      # PostgreSQL receivers for database metrics - one receiver per service database.
      # Every receiver uses the monitoring credentials from the environment, a 60s
      # collection interval and the TLS files under /etc/postgres-tls.
      postgresql/auth:
        endpoint: 'auth-db-service.bakery-ia:5432'
        databases: [auth_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/inventory:
        endpoint: 'inventory-db-service.bakery-ia:5432'
        databases: [inventory_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/orders:
        endpoint: 'orders-db-service.bakery-ia:5432'
        databases: [orders_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/ai-insights:
        endpoint: 'ai-insights-db-service.bakery-ia:5432'
        databases: [ai_insights_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/alert-processor:
        endpoint: 'alert-processor-db-service.bakery-ia:5432'
        databases: [alert_processor_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/distribution:
        endpoint: 'distribution-db-service.bakery-ia:5432'
        databases: [distribution_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/external:
        endpoint: 'external-db-service.bakery-ia:5432'
        databases: [external_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/forecasting:
        endpoint: 'forecasting-db-service.bakery-ia:5432'
        databases: [forecasting_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/notification:
        endpoint: 'notification-db-service.bakery-ia:5432'
        databases: [notification_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/orchestrator:
        endpoint: 'orchestrator-db-service.bakery-ia:5432'
        databases: [orchestrator_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/pos:
        endpoint: 'pos-db-service.bakery-ia:5432'
        databases: [pos_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/procurement:
        endpoint: 'procurement-db-service.bakery-ia:5432'
        databases: [procurement_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/production:
        endpoint: 'production-db-service.bakery-ia:5432'
        databases: [production_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/recipes:
        endpoint: 'recipes-db-service.bakery-ia:5432'
        databases: [recipes_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/sales:
        endpoint: 'sales-db-service.bakery-ia:5432'
        databases: [sales_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/suppliers:
        endpoint: 'suppliers-db-service.bakery-ia:5432'
        databases: [suppliers_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/tenant:
        endpoint: 'tenant-db-service.bakery-ia:5432'
        databases: [tenant_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

      postgresql/training:
        endpoint: 'training-db-service.bakery-ia:5432'
        databases: [training_db]
        username: '${env:POSTGRES_MONITOR_USER}'
        password: '${env:POSTGRES_MONITOR_PASSWORD}'
        collection_interval: 60s
        tls:
          insecure: false
          ca_file: '/etc/postgres-tls/ca-cert.pem'
          cert_file: '/etc/postgres-tls/server-cert.pem'
          key_file: '/etc/postgres-tls/server-key.pem'

    processors:
      # High-performance batch processing (official recommendation).
      batch:
        # Reduced from 10s for faster processing
        timeout: 1s
        # Increased from 2048 (official recommendation for traces)
        send_batch_size: 50000
        send_batch_max_size: 50000

      # Separate batch settings for meter data.
      batch/meter:
        timeout: 1s
        send_batch_size: 20000
        send_batch_max_size: 25000

      memory_limiter:
        check_interval: 1s
        # Roughly 75% of the container memory limit (2Gi = ~2048Mi)
        limit_mib: 1500
        spike_limit_mib: 300

      # Resource detection for K8s.
      resourcedetection:
        timeout: 5s
        detectors:
          - env
          - system
          - docker

      # Stamp every signal with the environment and cluster name.
      resource:
        attributes:
          - action: upsert
            key: deployment.environment
            value: production
          - action: upsert
            key: cluster.name
            value: bakery-ia-prod

      # Kubernetes attributes processor - CRITICAL for logs.
      # Authenticates with the pod's service account and attaches the metadata
      # fields, labels and annotations listed below.
      k8sattributes:
        auth_type: 'serviceAccount'
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
            - k8s.container.name
          labels:
            - tag_name: 'app'
            - tag_name: 'pod-template-hash'
            - tag_name: 'version'
          annotations:
            - tag_name: 'description'

      # SigNoz span metrics processor with delta aggregation (recommended).
      # Generates RED metrics (Rate, Error, Duration) from trace spans.
      signozspanmetrics/delta:
        metrics_exporter: signozclickhousemetrics
        aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
        dimensions_cache_size: 100000
        latency_histogram_buckets: [
          100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms,
          1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s
        ]
        dimensions:
          - name: service.namespace
            default: default
          - name: deployment.environment
            default: production
          - name: signoz.collector.id

    # Exporters. ClickHouse credentials are read from the CLICKHOUSE_USER and
    # CLICKHOUSE_PASSWORD environment variables instead of being hard-coded in
    # every DSN, so a password rotation needs no change here and no secret is
    # committed to version control.
    # NOTE(review): the SigNoz chart is expected to inject both variables into the
    # collector pod - confirm with `kubectl describe pod` before rollout.
    exporters:
      # ClickHouse exporter for traces
      clickhousetraces:
        datasource: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/?database=signoz_traces"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for metrics
      signozclickhousemetrics:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for meter data (usage metrics); sending queue is off.
      signozclickhousemeter:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false

      # ClickHouse exporter for logs
      clickhouselogsexporter:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/?database=signoz_logs"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s

      # Metadata exporter for service metadata, using an in-memory cache.
      metadataexporter:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory

      # Debug exporter for debugging (optional)
      debug:
        verbosity: detailed
        sampling_initial: 5
        sampling_thereafter: 200

    # Pipeline wiring.
    # Processor order follows the collector guidance: memory_limiter first and
    # batch last. Batching earlier drops the client context that k8sattributes
    # relies on and makes every later processor work on re-grouped data. The
    # relative order of the remaining processors is unchanged.
    service:
      extensions: [health_check, zpages]
      pipelines:
        # Traces pipeline - exports to ClickHouse and the signozmeter connector
        traces:
          receivers: [otlp]
          processors:
            - memory_limiter
            - signozspanmetrics/delta
            - resourcedetection
            - resource
            - batch
          exporters: [clickhousetraces, metadataexporter, signozmeter]

        # Metrics pipeline - includes all infrastructure receivers
        metrics:
          receivers:
            - otlp
            - postgresql/auth
            - postgresql/inventory
            - postgresql/orders
            - postgresql/ai-insights
            - postgresql/alert-processor
            - postgresql/distribution
            - postgresql/external
            - postgresql/forecasting
            - postgresql/notification
            - postgresql/orchestrator
            - postgresql/pos
            - postgresql/procurement
            - postgresql/production
            - postgresql/recipes
            - postgresql/sales
            - postgresql/suppliers
            - postgresql/tenant
            - postgresql/training
            - redis
            - rabbitmq
            - k8s_cluster
            - prometheus
          processors:
            - memory_limiter
            - resourcedetection
            - resource
            - batch
          exporters: [signozclickhousemetrics]

        # Meter pipeline - receives from the signozmeter connector
        metrics/meter:
          receivers: [signozmeter]
          processors: [batch/meter]
          exporters: [signozclickhousemeter]

        # Logs pipeline - includes both OTLP and Kubernetes pod logs.
        # k8sattributes now runs before batch.
        logs:
          receivers: [otlp, filelog]
          processors:
            - memory_limiter
            - resourcedetection
            - resource
            - k8sattributes
            - batch
          exporters: [clickhouselogsexporter]

  # HPA for the OTEL Collector: between 2 and 10 replicas, with utilisation
  # targets of 70% CPU and 80% memory.
  autoscaling:
    enabled: true
    maxReplicas: 10
    minReplicas: 2
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

  # ClusterRole configuration for Kubernetes monitoring.
  # CRITICAL: the k8s_cluster receiver needs these permissions to read the
  # Kubernetes API; without them k8s metrics will not appear in the SigNoz UI.
  # Every rule grants read-only access (get, list, watch).
  clusterRole:
    create: true
    name: signoz-otel-collector-bakery-ia
    annotations: {}
    # Complete RBAC rules required by k8sclusterreceiver.
    # Based on OpenTelemetry and SigNoz official documentation.
    rules:
      # Core API group - fundamental Kubernetes resources
      - apiGroups:
          - ''
        resources:
          - events
          - namespaces
          - nodes
          - nodes/proxy
          - nodes/metrics
          - nodes/spec
          - pods
          - pods/status
          - replicationcontrollers
          - replicationcontrollers/status
          - resourcequotas
          - services
          - endpoints
        verbs:
          - get
          - list
          - watch
      # Apps API group - modern workload controllers
      - apiGroups:
          - apps
        resources:
          - deployments
          - daemonsets
          - statefulsets
          - replicasets
        verbs:
          - get
          - list
          - watch
      # Batch API group - job management
      - apiGroups:
          - batch
        resources:
          - jobs
          - cronjobs
        verbs:
          - get
          - list
          - watch
      # Autoscaling API group - HPA metrics (CRITICAL)
      - apiGroups:
          - autoscaling
        resources:
          - horizontalpodautoscalers
        verbs:
          - get
          - list
          - watch
      # Extensions API group - legacy support
      - apiGroups:
          - extensions
        resources:
          - deployments
          - daemonsets
          - replicasets
        verbs:
          - get
          - list
          - watch
      # Metrics API group - resource metrics
      - apiGroups:
          - metrics.k8s.io
        resources:
          - nodes
          - pods
        verbs:
          - get
          - list
          - watch
    clusterRoleBinding:
      name: signoz-otel-collector-bakery-ia
      annotations: {}

# Schema Migrator - applies ClickHouse schema migrations.
schemaMigrator:
  enabled: true

  image:
    repository: 'signoz/signoz-schema-migrator'
    # Updated to latest version
    tag: 'v0.129.12'
    pullPolicy: 'IfNotPresent'

  # Run the migrator through Helm hooks so upgrades are handled properly.
  upgradeHelmHooks: true

# Service account created for the release and named "signoz".
serviceAccount:
  create: true
  name: signoz
  annotations: {}

# Security context: run as non-root UID 1000 with fsGroup 1000.
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA: keep at least one pod of each component available.
# NOTE(review): "frontend" and "queryService" name the components that the
# BREAKING CHANGE note on the signoz block says were unified in v0.89.0+ -
# confirm the chart still reads these keys.
podDisruptionBudget:
  frontend: {enabled: true, minAvailable: 1}
  queryService: {enabled: true, minAvailable: 1}
  alertmanager: {enabled: true, minAvailable: 1}
  clickhouse: {enabled: true, minAvailable: 1}

# Network policies for security, covering both ingress and egress traffic.
networkPolicy:
  enabled: true
  policyTypes: [Ingress, Egress]

# Monitoring SigNoz itself
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s