Fix some issues

2026-01-25 20:07:37 +01:00
parent e0be1b22f9
commit 6c6a9fc58c
32 changed files with 1719 additions and 226 deletions
--- a/infrastructure/monitoring/signoz/k8s-infra-values-dev.yaml
+++ b/infrastructure/monitoring/signoz/k8s-infra-values-dev.yaml
@@ -0,0 +1,53 @@
+# SigNoz k8s-infra Helm Chart Values - Development Environment
+# Collects Kubernetes infrastructure metrics and sends them to SigNoz
+#
+# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
+# Install Command: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-dev.yaml
+
+# ============================================================================
+# OTEL COLLECTOR ENDPOINT
+# ============================================================================
+clusterName: 'bakery-ia-dev'  # Cluster name reported for the dev environment
+otelInsecure: true
+otelCollectorEndpoint: 'signoz-otel-collector.bakery-ia.svc.cluster.local:4317'
+
+# ============================================================================
+# PRESETS - Minimal configuration for development
+# ============================================================================
+presets:
+  hostMetrics:
+    enabled: true
+    collectionInterval: 60s  # Less frequent in dev
+
+  kubeletMetrics:
+    enabled: true
+    collectionInterval: 60s
+
+  kubernetesAttributes:
+    enabled: true
+
+  k8sEvents:  # Chart's key for the Kubernetes events preset
+    enabled: false  # Disabled in dev to reduce noise
+
+  logsCollection:
+    enabled: false
+
+# ============================================================================
+# OTEL AGENT - Minimal resources for dev
+# ============================================================================
+otelAgent:
+  enabled: true
+  resources:
+    limits:  # Upper bound for the agent in dev
+      cpu: '250m'
+      memory: '256Mi'
+    requests:  # Baseline reserved for the agent
+      cpu: '50m'
+      memory: '128Mi'
+
+otelDeployment:
+  enabled: false  # Collector Deployment is switched off in dev
+
+commonLabels:  # Labels identifying the stack and the environment
+  environment: 'development'
+  app.kubernetes.io/part-of: 'signoz'
--- a/infrastructure/monitoring/signoz/k8s-infra-values-prod.yaml
+++ b/infrastructure/monitoring/signoz/k8s-infra-values-prod.yaml
@@ -0,0 +1,76 @@
+# SigNoz k8s-infra Helm Chart Values - Production Environment
+# Collects ALL Kubernetes infrastructure metrics and sends them to SigNoz
+#
+# This chart makes the following exporters redundant:
+# - kube-state-metrics (delete after deploying this)
+# - node-exporter (delete after deploying this)
+#
+# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
+#
+# Install Command:
+#   helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
+#
+# After install, remove redundant exporters:
+#   helm uninstall kube-state-metrics -n bakery-ia
+#   helm uninstall node-exporter-prometheus-node-exporter -n bakery-ia
+#   (or: helm uninstall prometheus -n bakery-ia if installed via prometheus stack)
+
+# ============================================================================
+# CONNECTION TO SIGNOZ
+# ============================================================================
+clusterName: 'bakery-ia-prod'  # Cluster name reported for the prod environment
+otelInsecure: true
+otelCollectorEndpoint: 'signoz-otel-collector.bakery-ia.svc.cluster.local:4317'
+
+# ============================================================================
+# PRESETS - What metrics to collect
+# ============================================================================
+presets:
+  # Host metrics: CPU, memory, disk, filesystem, network, load
+  # Replaces node-exporter
+  hostMetrics:
+    enabled: true
+    collectionInterval: 30s
+
+  # Kubelet metrics: Pod/container CPU, memory usage
+  # Essential for seeing resource usage per pod in SigNoz
+  kubeletMetrics:
+    enabled: true
+    collectionInterval: 30s
+
+  # Kubernetes cluster metrics: deployments, pods, nodes status
+  # Replaces kube-state-metrics
+  clusterMetrics:
+    enabled: true
+    collectionInterval: 30s
+
+  # Enriches all telemetry with k8s metadata (pod name, namespace, etc.)
+  kubernetesAttributes:
+    enabled: true
+
+  # Kubernetes events (pod scheduled, failed, etc.); chart key is k8sEvents
+  k8sEvents:
+    enabled: true
+
+  # Container logs - disabled (apps send logs via OTLP directly)
+  logsCollection:
+    enabled: false
+
+# ============================================================================
+# OTEL AGENT (DaemonSet) - Runs on each node
+# ============================================================================
+otelAgent:
+  enabled: true
+  resources:
+    limits:  # Upper bound for each agent pod
+      cpu: '500m'
+      memory: '512Mi'
+    requests:  # Baseline reserved for each agent pod
+      cpu: '100m'
+      memory: '256Mi'
+
+# ============================================================================
+# OTEL DEPLOYMENT - Runs cluster metrics and Kubernetes events collection
+# ============================================================================
+otelDeployment:
+  enabled: true  # Needed: cluster-level receivers do not run in the DaemonSet
--- a/infrastructure/monitoring/signoz/signoz-values-prod.yaml
+++ b/infrastructure/monitoring/signoz/signoz-values-prod.yaml
@@ -3,18 +3,21 @@
 # DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by SigNoz Helm chart
 #
 # Official Chart: https://github.com/SigNoz/charts
-# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
+# Install Command: helm upgrade --install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
+#
+# IMPORTANT: This chart works together with k8s-infra chart for infrastructure monitoring
+# Deploy k8s-infra after this: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
+#
+# MEMORY OPTIMIZATION NOTES:
+# - ClickHouse memory increased to 8Gi to prevent OOM errors
+# - Retention reduced to 3 days for traces, 7 days for metrics/logs

 global:
-  storageClass: "microk8s-hostpath"  # For MicroK8s, use "microk8s-hostpath" or custom storage class
+  storageClass: "microk8s-hostpath"
  clusterName: "bakery-ia-prod"
  domain: "monitoring.bakewise.ai"
-  # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)

 # Ingress configuration for SigNoz Frontend
-# Configured to use HTTPS with TLS termination at ingress controller
-# NOTE: SigNoz Helm chart expects ingress under "signoz.ingress", not "frontend.ingress"
-# Reference: https://github.com/SigNoz/charts/blob/main/charts/signoz/values.yaml
 signoz:
  ingress:
    enabled: true
@@ -39,56 +42,50 @@ signoz:
          - monitoring.bakewise.ai
        secretName: bakery-ia-prod-tls-cert

-# Resource configuration for production
-# Optimized for 8 CPU core VPS deployment
+# ============================================================================
+# CLICKHOUSE CONFIGURATION
+# Increased memory to 8Gi to prevent OOM errors (was 4Gi, causing code 241 errors)
+# ============================================================================
 clickhouse:
  persistence:
    size: 20Gi
  resources:
    requests:
-      memory: "2Gi"
-      cpu: "500m"
-    limits:
      memory: "4Gi"
      cpu: "1000m"
-
-otelCollector:
-  resources:
-    requests:
-      memory: "1Gi"
-      cpu: "500m"
    limits:
-      memory: "2Gi"
-      cpu: "1000m"
-  # Additional config for Kubernetes infrastructure metrics scraping
-  config:
-    receivers:
-      prometheus:
-        config:
-          scrape_configs:
-            # Kube-state-metrics - Kubernetes object metrics
-            - job_name: 'kube-state-metrics'
-              static_configs:
-                - targets: ['kube-state-metrics.bakery-ia.svc.cluster.local:8080']
-              scrape_interval: 30s
-              metric_relabel_configs:
-                - source_labels: [__name__]
-                  regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset|replicaset|job|cronjob|persistentvolume|persistentvolumeclaim|resourcequota|service|configmap|secret).*'
-                  action: keep
-            # Node-exporter - Host-level metrics
-            - job_name: 'node-exporter'
-              static_configs:
-                - targets: ['node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100']
-              scrape_interval: 30s
-              metric_relabel_configs:
-                - source_labels: [__name__]
-                  regex: 'node_(cpu|memory|disk|filesystem|network|load).*'
-                  action: keep
-    service:
-      pipelines:
-        metrics:
-          receivers: [otlp, prometheus]
+      memory: "8Gi"
+      cpu: "2000m"

+  # Server-level settings only (NOT user-level settings like max_threads).
+  # Nested config sections are addressed with "section/setting" path keys.
+  settings:
+    # Max server memory usage: 6.4 GB (about 75% of the 8Gi container limit)
+    max_server_memory_usage: "6400000000"
+    # Mark cache size (256 MiB)
+    mark_cache_size: "268435456"
+    # Uncompressed cache (256 MiB)
+    uncompressed_cache_size: "268435456"
+    # Max concurrent queries
+    max_concurrent_queries: "100"
+    # MergeTree setting (not valid in a profile): max merged size, 1 GiB
+    merge_tree/max_bytes_to_merge_at_max_space_in_pool: "1073741824"
+
+  # User-level settings go in profiles, keyed as "<profile>/<setting>"
+  # (flat path keys as used by the ClickHouse operator, not nested maps).
+  profiles:
+    # Max memory per query: 2 GB
+    default/max_memory_usage: "2000000000"
+    # Max threads per query
+    default/max_threads: "4"
+
+  coldStorage:
+    enabled: false
+
+# ============================================================================
+# DATA RETENTION CONFIGURATION
+# Reduced retention to minimize storage and memory pressure
+# ============================================================================
 queryService:
  resources:
    requests:
@@ -97,7 +94,33 @@ queryService:
    limits:
      memory: "2Gi"
      cpu: "1000m"
+  # Retention via env vars. NOTE(review): confirm this chart version reads them.
+  configVars:
+    # Trace retention: 3 days (72 hours)
+    SIGNOZ_TRACE_TTL_DURATION_HOURS: "72"
+    # Logs retention: 7 days (168 hours)
+    SIGNOZ_LOGS_TTL_DURATION_HOURS: "168"
+    # Metrics retention: 7 days (168 hours)
+    SIGNOZ_METRICS_TTL_DURATION_HOURS: "168"

+# ============================================================================
+# OTEL COLLECTOR CONFIGURATION
+# This collector receives data from:
+# - Application services (traces, logs, metrics via OTLP)
+# - k8s-infra chart (infrastructure metrics)
+# ============================================================================
+otelCollector:
+  resources:
+    limits:  # Upper bound for the central collector
+      cpu: "1000m"
+      memory: "2Gi"
+    requests:  # Baseline reserved for the central collector
+      cpu: "500m"
+      memory: "1Gi"
+
+# ============================================================================
+# ALERTMANAGER CONFIGURATION
+# ============================================================================
 alertmanager:
  resources:
    requests:
@@ -106,3 +129,17 @@ alertmanager:
    limits:
      memory: "1Gi"
      cpu: "500m"
+
+# ============================================================================
+# ZOOKEEPER CONFIGURATION
+# ============================================================================
+zookeeper:
+  persistence:
+    size: 5Gi
+  resources:
+    limits:  # Upper bound for ZooKeeper
+      cpu: "500m"
+      memory: "1Gi"
+    requests:  # Baseline reserved for ZooKeeper
+      cpu: "250m"
+      memory: "512Mi"