Fix some issues

This commit is contained in:
2026-01-25 20:07:37 +01:00
parent e0be1b22f9
commit 6c6a9fc58c
32 changed files with 1719 additions and 226 deletions

View File

@@ -0,0 +1,53 @@
# SigNoz k8s-infra Helm Chart Values - Development Environment
# Collects Kubernetes infrastructure metrics and sends to SigNoz
#
# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
# Install Command: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-dev.yaml
# ============================================================================
# OTEL COLLECTOR ENDPOINT
# Target SigNoz collector address and the cluster name used for the
# development environment.
# ============================================================================
otelCollectorEndpoint: 'signoz-otel-collector.bakery-ia.svc.cluster.local:4317'
otelInsecure: true
clusterName: 'bakery-ia-dev'
# ============================================================================
# PRESETS - Minimal configuration for development
# ============================================================================
presets:
  hostMetrics:
    enabled: true
    collectionInterval: 60s  # Less frequent in dev
  kubeletMetrics:
    enabled: true
    collectionInterval: 60s
  kubernetesAttributes:
    enabled: true
  # The k8s-infra chart names the events preset `k8sEvents`; an override under
  # any other key is ignored and the chart default stays in effect.
  k8sEvents:
    enabled: false  # Disabled in dev to reduce noise
  logsCollection:
    enabled: false
# ============================================================================
# OTEL AGENT - Minimal resources for dev
# The agent is the only collector workload switched on in this environment;
# otelDeployment is turned off below.
# ============================================================================
otelAgent:
  enabled: true
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
    limits:
      cpu: 250m
      memory: 256Mi

otelDeployment:
  enabled: false
# Labels identifying these resources as part of the SigNoz stack in the
# development environment.
commonLabels:
  app.kubernetes.io/part-of: signoz
  environment: development

View File

@@ -0,0 +1,76 @@
# SigNoz k8s-infra Helm Chart Values - Production Environment
# Collects ALL Kubernetes infrastructure metrics and sends to SigNoz
#
# This chart REPLACES the need for:
# - kube-state-metrics (delete after deploying this)
# - node-exporter (delete after deploying this)
#
# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
#
# Install Command:
# helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
#
# After install, remove redundant exporters:
# helm uninstall kube-state-metrics -n bakery-ia
# helm uninstall node-exporter-prometheus-node-exporter -n bakery-ia
# (or: helm uninstall prometheus -n bakery-ia if installed via prometheus stack)
# ============================================================================
# CONNECTION TO SIGNOZ
# Target SigNoz collector address and the cluster name used for the
# production environment.
# ============================================================================
otelCollectorEndpoint: 'signoz-otel-collector.bakery-ia.svc.cluster.local:4317'
otelInsecure: true
clusterName: 'bakery-ia-prod'
# ============================================================================
# PRESETS - What metrics to collect
# NOTE: in the k8s-infra chart the clusterMetrics and k8sEvents presets are
# collected by the otelDeployment workload, so they only produce data when
# otelDeployment.enabled is true.
# ============================================================================
presets:
  # Host metrics: CPU, memory, disk, filesystem, network, load
  # Replaces node-exporter
  hostMetrics:
    enabled: true
    collectionInterval: 30s
  # Kubelet metrics: Pod/container CPU, memory usage
  # Essential for seeing resource usage per pod in SigNoz
  kubeletMetrics:
    enabled: true
    collectionInterval: 30s
  # Kubernetes cluster metrics: deployments, pods, nodes status
  # Replaces kube-state-metrics
  clusterMetrics:
    enabled: true
    collectionInterval: 30s
  # Enriches all telemetry with k8s metadata (pod name, namespace, etc.)
  kubernetesAttributes:
    enabled: true
  # Kubernetes events (pod scheduled, failed, etc.)
  # The chart names this preset `k8sEvents`; an override under any other key
  # is ignored.
  k8sEvents:
    enabled: true
  # Container logs - disabled (apps send logs via OTLP directly)
  logsCollection:
    enabled: false
# ============================================================================
# OTEL AGENT (DaemonSet) - Runs on each node
# Production sizing: twice the dev requests and limits.
# ============================================================================
otelAgent:
  enabled: true
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi
# ============================================================================
# OTEL DEPLOYMENT - Cluster-level collection
# In the k8s-infra chart the cluster metrics preset (the kube-state-metrics
# replacement) and the Kubernetes events preset are collected by this
# Deployment, not by the otelAgent DaemonSet. It must be enabled, otherwise
# those presets produce no data even though they are switched on.
# No resources are set here, so the chart defaults apply; add a `resources`
# section if explicit sizing is needed.
# ============================================================================
otelDeployment:
  enabled: true

View File

@@ -3,18 +3,21 @@
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by SigNoz Helm chart
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
# Install Command: helm upgrade --install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
#
# IMPORTANT: This chart works together with k8s-infra chart for infrastructure monitoring
# Deploy k8s-infra after this: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
#
# MEMORY OPTIMIZATION NOTES:
# - ClickHouse memory increased to 8Gi to prevent OOM errors
# - Retention reduced to 3 days for traces, 7 days for metrics/logs
global:
storageClass: "microk8s-hostpath" # For MicroK8s, use "microk8s-hostpath" or custom storage class
storageClass: "microk8s-hostpath"
clusterName: "bakery-ia-prod"
domain: "monitoring.bakewise.ai"
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
# Ingress configuration for SigNoz Frontend
# Configured to use HTTPS with TLS termination at ingress controller
# NOTE: SigNoz Helm chart expects ingress under "signoz.ingress", not "frontend.ingress"
# Reference: https://github.com/SigNoz/charts/blob/main/charts/signoz/values.yaml
signoz:
ingress:
enabled: true
@@ -39,56 +42,50 @@ signoz:
- monitoring.bakewise.ai
secretName: bakery-ia-prod-tls-cert
# Resource configuration for production
# Optimized for 8 CPU core VPS deployment
# ============================================================================
# CLICKHOUSE CONFIGURATION
# Increased memory to 8Gi to prevent OOM errors (was 4Gi, causing code 241 errors)
# ============================================================================
clickhouse:
persistence:
size: 20Gi
resources:
requests:
memory: "2Gi"
cpu: "500m"
limits:
memory: "4Gi"
cpu: "1000m"
otelCollector:
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# Additional config for Kubernetes infrastructure metrics scraping
config:
receivers:
prometheus:
config:
scrape_configs:
# Kube-state-metrics - Kubernetes object metrics
- job_name: 'kube-state-metrics'
static_configs:
- targets: ['kube-state-metrics.bakery-ia.svc.cluster.local:8080']
scrape_interval: 30s
metric_relabel_configs:
- source_labels: [__name__]
regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset|replicaset|job|cronjob|persistentvolume|persistentvolumeclaim|resourcequota|service|configmap|secret).*'
action: keep
# Node-exporter - Host-level metrics
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100']
scrape_interval: 30s
metric_relabel_configs:
- source_labels: [__name__]
regex: 'node_(cpu|memory|disk|filesystem|network|load).*'
action: keep
service:
pipelines:
metrics:
receivers: [otlp, prometheus]
memory: "8Gi"
cpu: "2000m"
# Server-level settings only (NOT user-level settings like max_threads)
# User-level settings must go in profiles section
settings:
# Max server memory usage: 6.4 GB (decimal), roughly 75% of the 8Gi container limit
max_server_memory_usage: "6400000000"
# Mark cache size (256MB)
mark_cache_size: "268435456"
# Uncompressed cache (256MB)
uncompressed_cache_size: "268435456"
# Max concurrent queries
max_concurrent_queries: "100"
# User-level settings go in profiles
profiles:
default:
# Max memory per query: 2GB
max_memory_usage: "2000000000"
# Max threads per query
max_threads: "4"
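# Cap on the total size of parts merged into a single part (1 GiB)
# NOTE(review): this is a MergeTree setting, not a memory limit - confirm it takes effect under profiles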
max_bytes_to_merge_at_max_space_in_pool: "1073741824"
coldStorage:
enabled: false
# ============================================================================
# DATA RETENTION CONFIGURATION
# Reduced retention to minimize storage and memory pressure
# ============================================================================
queryService:
resources:
requests:
@@ -97,7 +94,33 @@ queryService:
limits:
memory: "2Gi"
cpu: "1000m"
# Retention configuration via environment variables
configVars:
# Trace retention: 3 days (72 hours)
SIGNOZ_TRACE_TTL_DURATION_HOURS: "72"
# Logs retention: 7 days (168 hours)
SIGNOZ_LOGS_TTL_DURATION_HOURS: "168"
# Metrics retention: 7 days (168 hours)
SIGNOZ_METRICS_TTL_DURATION_HOURS: "168"
# ============================================================================
# OTEL COLLECTOR CONFIGURATION
# This collector receives data from:
# - Application services (traces, logs, metrics via OTLP)
# - k8s-infra chart (infrastructure metrics)
# ============================================================================
otelCollector:
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
# ============================================================================
# ALERTMANAGER CONFIGURATION
# ============================================================================
alertmanager:
resources:
requests:
@@ -106,3 +129,17 @@ alertmanager:
limits:
memory: "1Gi"
cpu: "500m"
# ============================================================================
# ZOOKEEPER CONFIGURATION
# ============================================================================
zookeeper:
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
persistence:
size: 5Gi