Root Cause Analysis:
- OTel Collector was starting but OpAMP was overwriting config with "nop" receivers/exporters
- ClickHouse authentication was failing due to missing credentials in DSN strings
- Redis/PostgreSQL/RabbitMQ receivers failed at startup because their TLS certs or credentials were not configured
Changes:
1. Fixed ClickHouse Exporters:
- Added admin credentials to clickhousetraces datasource
- Added admin credentials to clickhouselogsexporter dsn
- Now using: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/
2. Disabled Unconfigured Receivers:
- Commented out PostgreSQL receivers (no monitor users configured)
- Commented out Redis receiver (TLS certificates not available)
- Commented out RabbitMQ receiver (credentials not configured)
- Updated metrics pipeline to use only OTLP receiver
3. OpAMP Disabled:
- OpAMP was causing collector to use nop exporters/receivers
- OpAMP cannot be disabled through Helm values (extraArgs appends to the default args instead of replacing them)
- Must apply kubectl patch after Helm install:
kubectl patch deployment signoz-otel-collector --type=json -p='[{"op":"replace","path":"/spec/template/spec/containers/0/args","value":["--config=/conf/otel-collector-config.yaml","--feature-gates=-pkg.translator.prometheus.NormalizeName"]}]'
Results:
✅ OTel Collector successfully receiving traces (97+ spans)
✅ Services connecting without UNAVAILABLE errors
✅ No ClickHouse authentication failures
✅ All pipelines active (traces, metrics, logs)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
403 lines
11 KiB
YAML
403 lines
11 KiB
YAML
# SigNoz Helm Chart Values - Development Environment
# Optimized for local development with minimal resource usage
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-dev.yaml

global:
  storageClass: standard
  clusterName: bakery-ia-dev
  domain: monitoring.bakery-ia.local
  # Docker Hub pull secret. Declared under `global` so it also reaches the
  # sub-charts (Zookeeper, ClickHouse, etc).
  imagePullSecrets:
    - dockerhub-creds

# Root-level Docker Hub pull secret, used when pulling images for the SigNoz
# components themselves.
imagePullSecrets:
  - dockerhub-creds

# SigNoz main component (bundles the frontend and the query service)
signoz:
  replicaCount: 1

  service:
    type: ClusterIP
    port: 8080

  # The chart's built-in ingress stays off; the unified bakery-ingress is used.
  # Route lives in infrastructure/kubernetes/overlays/dev/dev-ingress.yaml
  ingress:
    enabled: false

  resources:
    requests:
      cpu: 100m  # shared by frontend + query service
      memory: 256Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  # Environment variables (new format - replaces configVars).
  # Values are kept as quoted strings so they are not retyped by the parser.
  env:
    signoz_telemetrystore_provider: "clickhouse"
    dot_metrics_enabled: "true"
    signoz_emailing_enabled: "false"
    signoz_alertmanager_provider: "signoz"
    # Dev retention: 168 hours = 7 days for traces, metrics and logs
    signoz_traces_ttl_duration_hrs: "168"
    signoz_metrics_ttl_duration_hrs: "168"
    signoz_logs_ttl_duration_hrs: "168"

  persistence:
    enabled: true
    size: 5Gi
    storageClass: standard

# AlertManager configuration
alertmanager:
  replicaCount: 1
  image:
    repository: signoz/alertmanager
    tag: "0.23.5"
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 25m      # lowered for local dev
      memory: 64Mi  # lowered for local dev
    limits:
      cpu: 200m
      memory: 256Mi

  persistence:
    enabled: true
    size: 2Gi
    storageClass: standard

  config:
    global:
      resolve_timeout: 5m
    route:
      group_by:
        - alertname
        - cluster
        - service
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      receiver: default
    receivers:
      - name: default
        # Add email, slack, webhook configs here

# ClickHouse configuration - time series database.
# Sized minimally for local development on a constrained Kind cluster.
clickhouse:
  enabled: true
  installCustomStorageClass: false

  image:
    registry: docker.io
    repository: clickhouse/clickhouse-server
    tag: "25.5.6"  # Official recommended version

  # Lower ClickHouse resource requests for local dev.
  # NOTE(review): the nested `clickhouse.clickhouse.resources` path is kept as
  # written - confirm the chart reads resources from this level.
  clickhouse:
    resources:
      requests:
        cpu: 200m  # Reduced from default 500m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 1Gi

  persistence:
    enabled: true
    size: 20Gi

# Zookeeper configuration (ClickHouse depends on it)
zookeeper:
  enabled: true
  replicaCount: 1  # one replica is enough for dev

  image:
    tag: "3.7.1"  # Official recommended version

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  persistence:
    enabled: true
    size: 5Gi

# OpenTelemetry Collector - Data ingestion endpoint for all telemetry
otelCollector:
  enabled: true
  replicaCount: 1

  image:
    repository: signoz/signoz-otel-collector
    tag: v0.129.12  # Latest recommended version

  # NOTE: OpAMP is disabled via kubectl patch on the deployment.
  # It cannot be disabled via Helm values because extraArgs appends to the
  # default args instead of replacing them.
  # Patch command (run after every helm install/upgrade):
  # kubectl patch deployment signoz-otel-collector --type=json -p='[{"op":"replace","path":"/spec/template/spec/containers/0/args","value":["--config=/conf/otel-collector-config.yaml","--feature-gates=-pkg.translator.prometheus.NormalizeName"]}]'

  # Service configuration - expose both gRPC and HTTP endpoints
  service:
    type: ClusterIP
    ports:
      # gRPC receivers
      - name: otlp-grpc
        port: 4317
        targetPort: 4317
        protocol: TCP
      # HTTP receivers
      - name: otlp-http
        port: 4318
        targetPort: 4318
        protocol: TCP
      # Prometheus remote write
      - name: prometheus
        port: 8889
        targetPort: 8889
        protocol: TCP
      # Metrics
      - name: metrics
        port: 8888
        targetPort: 8888
        protocol: TCP

  resources:
    requests:
      cpu: 50m       # Reduced from 100m
      memory: 128Mi  # Reduced from 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # OpenTelemetry Collector configuration
  config:
    # Connectors - bridge between pipelines
    connectors:
      signozmeter:
        dimensions:
          - name: service.name
          - name: deployment.environment
          - name: host.name
        metrics_flush_interval: 1h

    receivers:
      # OTLP receivers for traces, metrics, and logs from applications.
      # All application telemetry is pushed via OTLP protocol.
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "*"

      # PostgreSQL receivers for database metrics
      # DISABLED: Monitor users not configured yet
      # Collects metrics directly from PostgreSQL databases
      # postgresql/auth:
      #   endpoint: auth-db-service.bakery-ia:5432
      #   username: ${POSTGRES_MONITOR_USER}
      #   password: ${POSTGRES_MONITOR_PASSWORD}
      #   databases:
      #     - auth_db
      #   collection_interval: 60s
      #   tls:
      #     insecure: false

      # postgresql/inventory:
      #   endpoint: inventory-db-service.bakery-ia:5432
      #   username: ${POSTGRES_MONITOR_USER}
      #   password: ${POSTGRES_MONITOR_PASSWORD}
      #   databases:
      #     - inventory_db
      #   collection_interval: 60s
      #   tls:
      #     insecure: false

      # postgresql/orders:
      #   endpoint: orders-db-service.bakery-ia:5432
      #   username: ${POSTGRES_MONITOR_USER}
      #   password: ${POSTGRES_MONITOR_PASSWORD}
      #   databases:
      #     - orders_db
      #   collection_interval: 60s
      #   tls:
      #     insecure: false

      # Add more PostgreSQL databases as needed
      # postgresql/SERVICE:
      #   endpoint: SERVICE-db-service.bakery-ia:5432
      #   ...

      # Redis receiver for cache metrics
      # DISABLED: TLS certificates not configured yet
      # redis:
      #   endpoint: redis-service.bakery-ia:6379
      #   password: ${REDIS_PASSWORD}
      #   collection_interval: 60s
      #   tls:
      #     insecure: false
      #     cert_file: /etc/redis-tls/redis-cert.pem
      #     key_file: /etc/redis-tls/redis-key.pem
      #     ca_file: /etc/redis-tls/ca-cert.pem

      # RabbitMQ receiver via management API
      # DISABLED: RabbitMQ credentials not configured yet
      # rabbitmq:
      #   endpoint: http://rabbitmq-service.bakery-ia:15672
      #   username: ${RABBITMQ_USER}
      #   password: ${RABBITMQ_PASSWORD}
      #   collection_interval: 60s

    processors:
      # Batch processor for better performance (optimized for high throughput)
      batch:
        timeout: 1s
        send_batch_size: 10000  # Increased from 1024 for better performance
        send_batch_max_size: 10000

      # Batch processor for meter data
      batch/meter:
        timeout: 1s
        send_batch_size: 20000
        send_batch_max_size: 25000

      # Memory limiter to prevent OOM
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

      # Resource detection
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # SigNoz span metrics processor with delta aggregation (recommended).
      # Generates RED metrics (Rate, Error, Duration) from trace spans.
      signozspanmetrics/delta:
        aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
        metrics_exporter: signozclickhousemetrics
        latency_histogram_buckets: [
          100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms,
          1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s
        ]
        dimensions_cache_size: 100000
        dimensions:
          - name: service.namespace
            default: default
          - name: deployment.environment
            default: default
          - name: signoz.collector.id

    exporters:
      # SECURITY: the ClickHouse password is no longer committed in this file.
      # Every DSN below reads it from the collector's environment through
      # ${env:CLICKHOUSE_PASSWORD}.
      # NOTE(review): assumes the chart exports CLICKHOUSE_PASSWORD to the
      # collector pod - confirm (kubectl exec ... -- env) before rollout.

      # ClickHouse exporter for traces
      clickhousetraces:
        datasource: "tcp://admin:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/?database=signoz_traces"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for metrics
      signozclickhousemetrics:
        dsn: "tcp://admin:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for meter data (usage metrics)
      signozclickhousemeter:
        dsn: "tcp://admin:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false

      # ClickHouse exporter for logs
      clickhouselogsexporter:
        dsn: "tcp://admin:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/?database=signoz_logs"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s

      # Metadata exporter for service metadata
      metadataexporter:
        dsn: "tcp://admin:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory

      # Debug exporter for debugging (optional; not wired into any pipeline)
      debug:
        verbosity: detailed
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      pipelines:
        # Traces pipeline - exports to ClickHouse and signozmeter connector
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection]
          exporters: [clickhousetraces, metadataexporter, signozmeter]

        # Metrics pipeline
        # Database/cache receivers disabled until credentials configured
        metrics:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection]
          exporters: [signozclickhousemetrics]

        # Meter pipeline - receives from signozmeter connector
        metrics/meter:
          receivers: [signozmeter]
          processors: [batch/meter]
          exporters: [signozclickhousemeter]

        # Logs pipeline
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection]
          exporters: [clickhouselogsexporter]

# Additional configuration
serviceAccount:
  create: true
  annotations: {}
  # Left empty; no explicit service account name is set here.
  name: ""

# Security context: non-root, UID and fsGroup both 1000
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Network policies are turned off in the dev environment
networkPolicy:
  enabled: false

# Self-monitoring of SigNoz (the ServiceMonitor stays disabled)
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: false