Update monitoring packages to latest versions

- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1
- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
2026-01-08 19:25:52 +01:00
parent dfb7e4b237
commit 29d19087f1
129 changed files with 5718 additions and 1821 deletions
--- a/infrastructure/helm/signoz-values-dev.yaml
+++ b/infrastructure/helm/signoz-values-dev.yaml
@@ -6,7 +6,10 @@

 global:
  storageClass: "standard"
-  domain: "localhost"
+  domain: "monitoring.bakery-ia.local"
+  # Docker Hub credentials for pulling images
+  imagePullSecrets:
+    - name: dockerhub-creds

 # Frontend Configuration
 frontend:
@@ -27,7 +30,7 @@ frontend:
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
    hosts:
-      - host: localhost
+      - host: monitoring.bakery-ia.local
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
@@ -35,8 +38,8 @@ frontend:

  resources:
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 25m  # Reduced for local dev
+      memory: 64Mi  # Reduced for local dev
    limits:
      cpu: 200m
      memory: 256Mi
@@ -44,6 +47,8 @@ frontend:
  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"
+    - name: BASE_URL
+      value: "https://monitoring.bakery-ia.local/signoz"

 # Query Service Configuration
 queryService:
@@ -59,8 +64,8 @@ queryService:

  resources:
    requests:
-      cpu: 100m
-      memory: 256Mi
+      cpu: 50m  # Reduced for local dev
+      memory: 128Mi  # Reduced for local dev
    limits:
      cpu: 500m
      memory: 512Mi
@@ -90,8 +95,8 @@ alertmanager:

  resources:
    requests:
-      cpu: 50m
-      memory: 128Mi
+      cpu: 25m  # Reduced for local dev
+      memory: 64Mi  # Reduced for local dev
    limits:
      cpu: 200m
      memory: 256Mi
@@ -115,76 +120,59 @@ alertmanager:
        # Add email, slack, webhook configs here

 # ClickHouse Configuration - Time Series Database
+# Minimal resources for local development on a constrained Kind cluster
 clickhouse:
-  replicaCount: 1
-  image:
-    repository: clickhouse/clickhouse-server
-    tag: 24.1.2-alpine
-    pullPolicy: IfNotPresent
+  enabled: true
+  installCustomStorageClass: false

-  service:
-    type: ClusterIP
-    httpPort: 8123
-    tcpPort: 9000
+  # Reduce ClickHouse resource requests for local dev
+  clickhouse:
+    resources:
+      requests:
+        cpu: 200m  # Reduced from default 500m
+        memory: 512Mi
+      limits:
+        cpu: 1000m
+        memory: 1Gi

-  resources:
-    requests:
-      cpu: 500m
-      memory: 512Mi
-    limits:
-      cpu: 1000m
-      memory: 1Gi
-
-  persistence:
-    enabled: true
-    size: 10Gi
-    storageClass: "standard"
-
-  # ClickHouse configuration
-  config:
-    logger:
-      level: information
-    max_connections: 1024
-    max_concurrent_queries: 100
-    # Data retention (7 days for dev)
-    merge_tree:
-      parts_to_delay_insert: 150
-      parts_to_throw_insert: 300
-
-# OpenTelemetry Collector - Integrated with SigNoz
+# OpenTelemetry Collector - Data ingestion endpoint for all telemetry
 otelCollector:
  enabled: true
  replicaCount: 1
-  image:
-    repository: signoz/signoz-otel-collector
-    tag: 0.102.8
-    pullPolicy: IfNotPresent

+  # Service configuration - expose both gRPC and HTTP endpoints
  service:
    type: ClusterIP
    ports:
-      otlpGrpc: 4317
-      otlpHttp: 4318
-      metrics: 8888
-      healthCheck: 13133
+      # gRPC receivers
+      - name: otlp-grpc
+        port: 4317
+        targetPort: 4317
+        protocol: TCP
+      # HTTP receivers
+      - name: otlp-http
+        port: 4318
+        targetPort: 4318
+        protocol: TCP
+      # Prometheus remote write
+      - name: prometheus
+        port: 8889
+        targetPort: 8889
+        protocol: TCP

  resources:
    requests:
-      cpu: 100m
-      memory: 256Mi
+      cpu: 50m   # Reduced from 100m
+      memory: 128Mi  # Reduced from 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

-  # Full OTEL Collector Configuration
+  # OpenTelemetry Collector configuration
  config:
-    extensions:
-      health_check:
-        endpoint: 0.0.0.0:13133
-      zpages:
-        endpoint: 0.0.0.0:55679
-
    receivers:
+      # OTLP receivers for traces, metrics, and logs from applications
+      # All application telemetry is pushed via OTLP protocol
      otlp:
        protocols:
          grpc:
@@ -193,105 +181,119 @@ otelCollector:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
-                - "http://localhost"
-                - "https://localhost"
+                - "*"

-      # Prometheus receiver for scraping metrics
-      prometheus:
-        config:
-          scrape_configs:
-            - job_name: 'otel-collector'
-              scrape_interval: 30s
-              static_configs:
-                - targets: ['localhost:8888']
+      # PostgreSQL receivers for database metrics
+      # Collects metrics directly from PostgreSQL databases
+      postgresql/auth:
+        endpoint: auth-db-service.bakery-ia:5432
+        username: ${POSTGRES_MONITOR_USER}
+        password: ${POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - auth_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+
+      postgresql/inventory:
+        endpoint: inventory-db-service.bakery-ia:5432
+        username: ${POSTGRES_MONITOR_USER}
+        password: ${POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - inventory_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+
+      postgresql/orders:
+        endpoint: orders-db-service.bakery-ia:5432
+        username: ${POSTGRES_MONITOR_USER}
+        password: ${POSTGRES_MONITOR_PASSWORD}
+        databases:
+          - orders_db
+        collection_interval: 60s
+        tls:
+          insecure: false
+
+      # Add more PostgreSQL databases as needed
+      # postgresql/SERVICE:
+      #   endpoint: SERVICE-db-service.bakery-ia:5432
+      #   ...
+
+      # Redis receiver for cache metrics
+      redis:
+        endpoint: redis-service.bakery-ia:6379
+        password: ${REDIS_PASSWORD}
+        collection_interval: 60s
+        tls:
+          insecure: false
+          cert_file: /etc/redis-tls/redis-cert.pem
+          key_file: /etc/redis-tls/redis-key.pem
+          ca_file: /etc/redis-tls/ca-cert.pem
+
+      # RabbitMQ receiver via management API
+      rabbitmq:
+        endpoint: http://rabbitmq-service.bakery-ia:15672
+        username: ${RABBITMQ_USER}
+        password: ${RABBITMQ_PASSWORD}
+        collection_interval: 60s

    processors:
+      # Batch processor for better performance
      batch:
        timeout: 10s
        send_batch_size: 1024

+      # Memory limiter to prevent OOM
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

-      # Resource detection for K8s
+      # Resource detection
      resourcedetection:
-        detectors: [env, system, docker]
+        detectors: [env, system]
        timeout: 5s

-      # Add resource attributes
-      resource:
-        attributes:
-          - key: deployment.environment
-            value: development
-            action: upsert
-
    exporters:
-      # Export to SigNoz ClickHouse
+      # ClickHouse exporter for traces
      clickhousetraces:
-        datasource: tcp://clickhouse:9000/?database=signoz_traces
+        datasource: tcp://signoz-clickhouse:9000/?database=signoz_traces
        timeout: 10s

+      # ClickHouse exporter for metrics
      clickhousemetricswrite:
-        endpoint: tcp://clickhouse:9000/?database=signoz_metrics
+        endpoint: tcp://signoz-clickhouse:9000/?database=signoz_metrics
        timeout: 10s

+      # ClickHouse exporter for logs
      clickhouselogsexporter:
-        dsn: tcp://clickhouse:9000/?database=signoz_logs
+        dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs
        timeout: 10s

-      # Debug logging
+      # Logging exporter for debugging (optional; not referenced by any pipeline below, so currently inactive)
      logging:
        loglevel: info
-        sampling_initial: 5
-        sampling_thereafter: 200

    service:
-      extensions: [health_check, zpages]
      pipelines:
+        # Traces pipeline
        traces:
          receivers: [otlp]
-          processors: [memory_limiter, batch, resourcedetection, resource]
-          exporters: [clickhousetraces, logging]
+          processors: [memory_limiter, batch, resourcedetection]
+          exporters: [clickhousetraces]

+        # Metrics pipeline
        metrics:
-          receivers: [otlp, prometheus]
-          processors: [memory_limiter, batch, resourcedetection, resource]
+          receivers: [otlp, postgresql/auth, postgresql/inventory, postgresql/orders, redis, rabbitmq]
+          processors: [memory_limiter, batch, resourcedetection]
          exporters: [clickhousemetricswrite]

+        # Logs pipeline
        logs:
          receivers: [otlp]
-          processors: [memory_limiter, batch, resourcedetection, resource]
-          exporters: [clickhouselogsexporter, logging]
-
-# OpenTelemetry Collector Deployment Mode
-otelCollectorDeployment:
-  enabled: true
-  mode: deployment
-
-# Node Exporter for infrastructure metrics (optional)
-nodeExporter:
-  enabled: true
-  service:
-    type: ClusterIP
-    port: 9100
-
-  resources:
-    requests:
-      cpu: 50m
-      memory: 64Mi
-    limits:
-      cpu: 100m
-      memory: 128Mi
-
-# Schemamanager - Manages ClickHouse schema
-schemamanager:
-  enabled: true
-  image:
-    repository: signoz/signoz-schema-migrator
-    tag: 0.52.3
-    pullPolicy: IfNotPresent
+          processors: [memory_limiter, batch, resourcedetection]
+          exporters: [clickhouselogsexporter]

 # Additional Configuration
 serviceAccount: