# File: bakery-ia/infrastructure/monitoring/signoz/signoz-values-prod.yaml
#
# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
# Values shared by the SigNoz chart and its sub-charts.
global:
  # For MicroK8s, use "microk8s-hostpath" or a custom storage class
  storageClass: "microk8s-hostpath"
  clusterName: "bakery-ia-prod"
  domain: "monitoring.bakewise.ai"
  # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
  imagePullSecrets:
    - dockerhub-creds
# Docker Hub credentials for pulling images (root level for SigNoz components).
# This is a root-level key, distinct from global.imagePullSecrets.
imagePullSecrets:
  - dockerhub-creds

# SigNoz Main Component (unified frontend + query service)
# BREAKING CHANGE: v0.89.0+ uses unified component instead of separate frontend/queryService
signoz:
  replicaCount: 2

  image:
    repository: signoz/signoz
    tag: v0.106.0  # Pinned SigNoz release
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 8080  # HTTP/API port
    internalPort: 8085  # Internal gRPC port

  # DISABLE built-in ingress - using unified bakery-ingress-prod instead
  # Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
  ingress:
    enabled: false

  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 2000m
      memory: 4Gi

  # Pod Anti-affinity for HA: prefer spreading SigNoz replicas across nodes.
  # The selector targets the unified "signoz" component; the former
  # "query-service" component label no longer exists from v0.89.0 onwards,
  # so selecting on it would never match and no spreading would happen.
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchLabels:
                app.kubernetes.io/component: signoz
            topologyKey: kubernetes.io/hostname

  # Environment variables (new format - replaces configVars)
  env:
    signoz_telemetrystore_provider: "clickhouse"
    dot_metrics_enabled: "true"
    signoz_emailing_enabled: "true"
    signoz_alertmanager_provider: "signoz"

    # Retention configuration (30 days for prod; 720 h = 30 d)
    signoz_traces_ttl_duration_hrs: "720"
    signoz_metrics_ttl_duration_hrs: "720"
    signoz_logs_ttl_duration_hrs: "720"

    # OpAMP Server Configuration
    # WARNING: OpAMP can cause gRPC instability and collector reloads
    # Only enable if you have a stable OpAMP backend server
    signoz_opamp_server_enabled: "false"
    # signoz_opamp_server_endpoint: "0.0.0.0:4320"

    # SMTP configuration for email alerts - now using Mailu as SMTP server
    signoz_smtp_enabled: "true"
    signoz_smtp_host: "mailu-postfix.bakery-ia.svc.cluster.local"
    signoz_smtp_port: "587"
    signoz_smtp_from: "alerts@bakewise.ai"
    signoz_smtp_username: "alerts@bakewise.ai"
    # Password should be set via secret: signoz_smtp_password
    # NOTE(review): no secret reference is wired up in this file - confirm
    # where signoz_smtp_password is injected.

  persistence:
    enabled: true
    size: 20Gi
    # NOTE(review): global.storageClass is "microk8s-hostpath" but this volume
    # requests "standard" - confirm that class exists on the target cluster.
    storageClass: "standard"

  # Horizontal Pod Autoscaler
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

# AlertManager Configuration
# NOTE(review): signoz.env sets signoz_alertmanager_provider to "signoz";
# confirm the chart version in use still deploys this standalone component.
alertmanager:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/alertmanager
    tag: 0.23.5
    pullPolicy: IfNotPresent

  service:
    type: ClusterIP
    port: 9093

  resources:
    requests:
      cpu: 100m
      memory: 128Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Pod Anti-affinity for HA: prefer one Alertmanager replica per node
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 100
          podAffinityTerm:
            labelSelector:
              matchExpressions:
                - key: app
                  operator: In
                  values:
                    - signoz-alertmanager
            topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 5Gi
    # NOTE(review): global.storageClass is "microk8s-hostpath" - confirm that
    # a "standard" class exists on the target cluster.
    storageClass: "standard"

  config:
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'mailu-postfix.bakery-ia.svc.cluster.local:587'
      smtp_from: 'alerts@bakewise.ai'
      smtp_auth_username: 'alerts@bakewise.ai'
      # NOTE(review): Alertmanager does not expand environment variables in its
      # configuration file. Confirm that the deployment substitutes
      # ${SMTP_PASSWORD} before the config is loaded, otherwise the literal
      # placeholder is sent as the password.
      smtp_auth_password: '${SMTP_PASSWORD}'
      smtp_require_tls: true

    route:
      group_by: ['alertname', 'cluster', 'service', 'severity']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      # Default receiver: alerts that match none of the routes below (i.e. any
      # severity other than critical/warning) are delivered here.
      receiver: 'critical-alerts'
      routes:
        - match:
            severity: critical
          receiver: 'critical-alerts'
          # Keep evaluating the sibling routes after this one matched
          continue: true
        - match:
            severity: warning
          receiver: 'warning-alerts'

    receivers:
      - name: 'critical-alerts'
        email_configs:
          - to: 'critical-alerts@bakewise.ai'
            headers:
              Subject: '[CRITICAL] {{ .GroupLabels.alertname }} - Bakery IA'
        # Slack webhook for critical alerts
        # NOTE(review): same placeholder caveat as smtp_auth_password applies
        # to ${SLACK_WEBHOOK_URL}.
        slack_configs:
          - api_url: '${SLACK_WEBHOOK_URL}'
            channel: '#alerts-critical'
            title: '[CRITICAL] {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

      - name: 'warning-alerts'
        email_configs:
          - to: 'oncall@bakewise.ai'
            headers:
              Subject: '[WARNING] {{ .GroupLabels.alertname }} - Bakery IA'

# ClickHouse Configuration - Time Series Database
clickhouse:
  enabled: true
  installCustomStorageClass: false

  image:
    registry: docker.io
    repository: clickhouse/clickhouse-server
    tag: 25.5.6  # Updated to official recommended version
    pullPolicy: IfNotPresent

  # ClickHouse resources (nested config)
  # NOTE(review): kept nested as clickhouse.clickhouse.resources as written;
  # confirm the chart reads resources from this path and not clickhouse.resources.
  clickhouse:
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 4000m
        memory: 8Gi

  # Pod Anti-affinity for HA: never co-locate two ClickHouse pods on one node
  affinity:
    podAntiAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        - labelSelector:
            matchExpressions:
              - key: app
                operator: In
                values:
                  - signoz-clickhouse
          topologyKey: kubernetes.io/hostname

  persistence:
    enabled: true
    size: 100Gi
    # NOTE(review): global.storageClass is "microk8s-hostpath" - confirm that
    # a "standard" class exists on the target cluster.
    storageClass: "standard"

  # Cold storage configuration for better disk space management
  # NOTE(review): cold storage is enabled without an endpoint or credentials
  # in this file - confirm they are supplied elsewhere, or disable it.
  coldStorage:
    enabled: true
    defaultKeepFreeSpaceBytes: 10737418240  # Keep 10 GiB free
    ttl:
      deleteTTLDays: 30  # Delete data older than 30 days

  # Zookeeper Configuration (required by ClickHouse for coordination)
  # Nested under clickhouse: the SigNoz chart configures ZooKeeper through
  # clickhouse.zookeeper.
  zookeeper:
    enabled: true
    replicaCount: 3  # CRITICAL: Always use 3 replicas for production HA

    image:
      tag: 3.7.1  # Official recommended version

    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi

    persistence:
      enabled: true
      size: 10Gi
      storageClass: "standard"

# OpenTelemetry Collector - Integrated with SigNoz
# NOTE(review): this collector runs 2-10 replicas (replicaCount / autoscaling
# below) and every replica loads the same config, so the pull-based receivers
# (filelog, k8s_cluster, prometheus, redis, rabbitmq, postgresql/*) presumably
# run once per replica - confirm duplicate collection is acceptable.
otelCollector:
  enabled: true
  replicaCount: 2

  image:
    repository: signoz/signoz-otel-collector
    tag: v0.129.12  # Updated to latest recommended version
    pullPolicy: IfNotPresent

  # Init containers for the Otel Collector pod
  initContainers:
    # Copies the PostgreSQL TLS material from the read-only secret volume into
    # a writable emptyDir and tightens the file modes.
    fix-postgres-tls:
      enabled: true
      image:
        registry: docker.io
        repository: busybox
        tag: "1.35"  # quoted so YAML does not read the tag as a float
        pullPolicy: IfNotPresent
      command:
        - sh
        - -c
        - |
          echo "Fixing PostgreSQL TLS file permissions..."
          cp /etc/postgres-tls-source/* /etc/postgres-tls/
          chmod 600 /etc/postgres-tls/server-key.pem
          chmod 644 /etc/postgres-tls/server-cert.pem
          chmod 644 /etc/postgres-tls/ca-cert.pem
          echo "PostgreSQL TLS permissions fixed"
      volumeMounts:
        # Must reference a volume declared in extraVolumes: the secret volume
        # is named "postgres-tls" (there is no "postgres-tls-source" volume).
        - name: postgres-tls
          mountPath: /etc/postgres-tls-source
          readOnly: true
        - name: postgres-tls-fixed
          mountPath: /etc/postgres-tls
          readOnly: false

  service:
    type: ClusterIP
    ports:
      - name: otlp-grpc
        port: 4317
        targetPort: 4317
        protocol: TCP
      - name: otlp-http
        port: 4318
        targetPort: 4318
        protocol: TCP
      - name: prometheus
        port: 8889
        targetPort: 8889
        protocol: TCP
      - name: metrics
        port: 8888
        targetPort: 8888
        protocol: TCP

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 2000m
      memory: 2Gi

  # Additional environment variables for receivers
  # SECURITY / NOTE(review): these credentials are committed in plain text.
  # The receiver comments below refer to existing redis-secrets and
  # rabbitmq-secrets; rotate these values and source them from Kubernetes
  # Secrets instead (secret key names are not visible from this file).
  additionalEnvs:
    POSTGRES_MONITOR_USER: "monitoring"
    POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
    REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
    RABBITMQ_USER: "bakery"
    RABBITMQ_PASSWORD: "forecast123"

  # Mount TLS certificates for secure connections
  extraVolumes:
    - name: redis-tls
      secret:
        secretName: redis-tls-secret
    - name: postgres-tls
      secret:
        secretName: postgres-tls
    # Writable copy of the PostgreSQL TLS files, filled by fix-postgres-tls
    - name: postgres-tls-fixed
      emptyDir: {}
    # Node-local pod logs, read by the filelog receiver
    - name: varlogpods
      hostPath:
        path: /var/log/pods

  extraVolumeMounts:
    - name: redis-tls
      mountPath: /etc/redis-tls
      readOnly: true
    - name: postgres-tls
      mountPath: /etc/postgres-tls-source
      readOnly: true
    - name: postgres-tls-fixed
      mountPath: /etc/postgres-tls
      readOnly: false
    - name: varlogpods
      mountPath: /var/log/pods
      readOnly: true

  # Enable OpAMP for dynamic configuration management
  # NOTE(review): signoz.env sets signoz_opamp_server_enabled to "false" while
  # --manager-config is still passed here - confirm this combination is intended.
  command:
    name: /signoz-otel-collector
    extraArgs:
      - --config=/conf/otel-collector-config.yaml
      - --manager-config=/conf/otel-collector-opamp-config.yaml
      - --feature-gates=-pkg.translator.prometheus.NormalizeName

  # Full OTEL Collector Configuration
  config:
    # Connectors - bridge between pipelines
    connectors:
      signozmeter:
        dimensions:
          - name: service.name
          - name: deployment.environment
          - name: host.name
        metrics_flush_interval: 1h

    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
            max_recv_msg_size_mib: 32  # Increased for larger payloads
          http:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "https://monitoring.bakewise.ai"
                - "https://*.bakewise.ai"

      # Filelog receiver for Kubernetes pod logs
      # Collects container stdout/stderr from /var/log/pods
      filelog:
        include:
          - /var/log/pods/*/*/*.log
        exclude:
          # Exclude SigNoz's own logs to avoid recursive collection
          - /var/log/pods/bakery-ia_signoz-*/*/*.log
        include_file_path: true
        include_file_name: false
        operators:
          # Parse CRI-O / containerd log format
          - type: regex_parser
            regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
            timestamp:
              parse_from: attributes.time
              layout: '%Y-%m-%dT%H:%M:%S.%LZ'
          # Keep the raw time string under attributes.timestamp
          - type: move
            from: attributes.time
            to: attributes.timestamp
          # Extract Kubernetes metadata from file path
          - type: regex_parser
            id: extract_metadata_from_filepath
            regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
            parse_from: attributes["log.file.path"]
          # Move metadata to resource attributes
          - type: move
            from: attributes.namespace
            to: resource["k8s.namespace.name"]
          - type: move
            from: attributes.pod_name
            to: resource["k8s.pod.name"]
          - type: move
            from: attributes.container_name
            to: resource["k8s.container.name"]
          # The parsed log message becomes the record body
          - type: move
            from: attributes.log
            to: body

      # Kubernetes Cluster Receiver - Collects cluster-level metrics
      # Provides information about nodes, namespaces, pods, and other cluster resources
      k8s_cluster:
        collection_interval: 30s
        node_conditions_to_report:
          - Ready
          - MemoryPressure
          - DiskPressure
          - PIDPressure
          - NetworkUnavailable
        allocatable_types_to_report:
          - cpu
          - memory
          - pods

      # Prometheus receiver for scraping metrics
      prometheus:
        config:
          scrape_configs:
            # cAdvisor metrics, scraped through the API server node proxy
            - job_name: 'kubernetes-nodes-cadvisor'
              scrape_interval: 30s
              scrape_timeout: 10s
              scheme: https
              tls_config:
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: node
              relabel_configs:
                - action: labelmap
                  regex: __meta_kubernetes_node_label_(.+)
                - target_label: __address__
                  replacement: kubernetes.default.svc:443
                - source_labels: [__meta_kubernetes_node_name]
                  regex: (.+)
                  target_label: __metrics_path__
                  replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

            # API server metrics: keep only the default/kubernetes https endpoint
            - job_name: 'kubernetes-apiserver'
              scrape_interval: 30s
              scrape_timeout: 10s
              scheme: https
              tls_config:
                insecure_skip_verify: true
              bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
              kubernetes_sd_configs:
                - role: endpoints
              relabel_configs:
                - source_labels:
                    - __meta_kubernetes_namespace
                    - __meta_kubernetes_service_name
                    - __meta_kubernetes_endpoint_port_name
                  action: keep
                  regex: default;kubernetes;https

      # Redis receiver for cache metrics
      # ENABLED: Using existing credentials from redis-secrets with TLS
      redis:
        endpoint: redis-service.bakery-ia:6379
        password: ${env:REDIS_PASSWORD}
        collection_interval: 60s
        transport: tcp
        tls:
          insecure_skip_verify: false
          cert_file: /etc/redis-tls/redis-cert.pem
          key_file: /etc/redis-tls/redis-key.pem
          ca_file: /etc/redis-tls/ca-cert.pem
        metrics:
          redis.maxmemory:
            enabled: true
          redis.cmd.latency:
            enabled: true

      # RabbitMQ receiver via management API
      # ENABLED: Using existing credentials from rabbitmq-secrets
      rabbitmq:
        endpoint: http://rabbitmq-service.bakery-ia:15672
        username: ${env:RABBITMQ_USER}
        password: ${env:RABBITMQ_PASSWORD}
        collection_interval: 30s

      # PostgreSQL receivers for database metrics
      # Monitor all databases with proper TLS configuration.
      # One receiver per service database; all share the monitoring user and
      # the TLS files prepared under /etc/postgres-tls.
      postgresql/auth:
        endpoint: auth-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - auth_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/inventory:
        endpoint: inventory-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - inventory_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/orders:
        endpoint: orders-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - orders_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/ai-insights:
        endpoint: ai-insights-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - ai_insights_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/alert-processor:
        endpoint: alert-processor-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - alert_processor_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/distribution:
        endpoint: distribution-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - distribution_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/external:
        endpoint: external-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - external_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/forecasting:
        endpoint: forecasting-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - forecasting_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/notification:
        endpoint: notification-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - notification_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/orchestrator:
        endpoint: orchestrator-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - orchestrator_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/pos:
        endpoint: pos-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - pos_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/procurement:
        endpoint: procurement-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - procurement_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/production:
        endpoint: production-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - production_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/recipes:
        endpoint: recipes-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - recipes_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/sales:
        endpoint: sales-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - sales_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/suppliers:
        endpoint: suppliers-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - suppliers_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/tenant:
        endpoint: tenant-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - tenant_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

      postgresql/training:
        endpoint: training-db-service.bakery-ia:5432
        username: ${env:POSTGRES_MONITOR_USER}
        password: ${env:POSTGRES_MONITOR_PASSWORD}
        databases:
          - training_db
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/postgres-tls/server-cert.pem
          key_file: /etc/postgres-tls/server-key.pem
          ca_file: /etc/postgres-tls/ca-cert.pem

    processors:
      # High-performance batch processing (official recommendation)
      batch:
        timeout: 1s  # Reduced from 10s for faster processing
        send_batch_size: 50000  # Increased from 2048 (official recommendation for traces)
        send_batch_max_size: 50000

      # Batch processor for meter data
      batch/meter:
        timeout: 1s
        send_batch_size: 20000
        send_batch_max_size: 25000

      memory_limiter:
        check_interval: 1s
        limit_mib: 1500  # ~73% of the 2Gi (2048Mi) container memory limit
        spike_limit_mib: 300

      # Resource detection for K8s
      resourcedetection:
        detectors: [env, system, docker]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: production
            action: upsert
          - key: cluster.name
            value: bakery-ia-prod
            action: upsert

      # Kubernetes attributes processor - CRITICAL for logs
      # Enriches telemetry with pod, namespace and container metadata
      k8sattributes:
        auth_type: "serviceAccount"
        passthrough: false
        extract:
          metadata:
            - k8s.pod.name
            - k8s.pod.uid
            - k8s.deployment.name
            - k8s.namespace.name
            - k8s.node.name
            - k8s.container.name
          # Each extractor needs the source label/annotation key; tag_name
          # only names the resulting attribute.
          labels:
            - tag_name: "app"
              key: "app"
              from: pod
            - tag_name: "pod-template-hash"
              key: "pod-template-hash"
              from: pod
            - tag_name: "version"
              key: "version"
              from: pod
          annotations:
            - tag_name: "description"
              key: "description"
              from: pod

      # SigNoz span metrics processor with delta aggregation (recommended)
      # Generates RED metrics (Rate, Error, Duration) from trace spans
      signozspanmetrics/delta:
        aggregation_temporality: AGGREGATION_TEMPORALITY_DELTA
        metrics_exporter: signozclickhousemetrics
        latency_histogram_buckets: [100us, 1ms, 2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms,
                                    1000ms, 1400ms, 2000ms, 5s, 10s, 20s, 40s, 60s]
        dimensions_cache_size: 100000
        dimensions:
          - name: service.namespace
            default: default
          - name: deployment.environment
            default: production
          - name: signoz.collector.id

    exporters:
      # Credentials come from the CLICKHOUSE_USER / CLICKHOUSE_PASSWORD
      # environment variables that the SigNoz chart injects into the collector
      # pod, instead of being hard-coded in every DSN.

      # ClickHouse exporter for traces
      clickhousetraces:
        datasource: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/?database=signoz_traces
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for metrics
      signozclickhousemetrics:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_metrics"
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s
          max_elapsed_time: 300s

      # ClickHouse exporter for meter data (usage metrics)
      signozclickhousemeter:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_meter"
        timeout: 45s
        sending_queue:
          enabled: false

      # ClickHouse exporter for logs
      clickhouselogsexporter:
        dsn: tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/?database=signoz_logs
        timeout: 10s
        retry_on_failure:
          enabled: true
          initial_interval: 5s
          max_interval: 30s

      # Metadata exporter for service metadata
      metadataexporter:
        dsn: "tcp://${env:CLICKHOUSE_USER}:${env:CLICKHOUSE_PASSWORD}@signoz-clickhouse:9000/signoz_metadata"
        timeout: 10s
        cache:
          provider: in_memory

      # Debug exporter for debugging (optional; not referenced by any pipeline)
      debug:
        verbosity: detailed
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check, zpages]
      # Processor order in every pipeline: memory_limiter first, enrichment
      # next, batch last. k8sattributes has to run before batch, because
      # batching drops the per-request client context it uses to find the pod.
      pipelines:
        # Traces pipeline - exports to ClickHouse and signozmeter connector
        traces:
          receivers: [otlp]
          processors:
            - memory_limiter
            - resourcedetection
            - resource
            - signozspanmetrics/delta
            - batch
          exporters: [clickhousetraces, metadataexporter, signozmeter]

        # Metrics pipeline - includes all infrastructure receivers
        metrics:
          receivers:
            - otlp
            - postgresql/auth
            - postgresql/inventory
            - postgresql/orders
            - postgresql/ai-insights
            - postgresql/alert-processor
            - postgresql/distribution
            - postgresql/external
            - postgresql/forecasting
            - postgresql/notification
            - postgresql/orchestrator
            - postgresql/pos
            - postgresql/procurement
            - postgresql/production
            - postgresql/recipes
            - postgresql/sales
            - postgresql/suppliers
            - postgresql/tenant
            - postgresql/training
            - redis
            - rabbitmq
            - k8s_cluster
            - prometheus
          processors: [memory_limiter, resourcedetection, resource, batch]
          exporters: [signozclickhousemetrics]

        # Meter pipeline - receives from signozmeter connector
        metrics/meter:
          receivers: [signozmeter]
          processors: [batch/meter]
          exporters: [signozclickhousemeter]

        # Logs pipeline - includes both OTLP and Kubernetes pod logs
        logs:
          receivers: [otlp, filelog]
          processors: [memory_limiter, k8sattributes, resourcedetection, resource, batch]
          exporters: [clickhouselogsexporter]

  # HPA for OTEL Collector
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
    targetMemoryUtilizationPercentage: 80

  # ClusterRole configuration for Kubernetes monitoring
  # CRITICAL: Required for k8s_cluster receiver to access Kubernetes API
  # Without these permissions, k8s metrics will not appear in SigNoz UI
  clusterRole:
    create: true
    name: "signoz-otel-collector-bakery-ia"
    annotations: {}
    # Complete RBAC rules required by k8sclusterreceiver
    # Based on OpenTelemetry and SigNoz official documentation
    rules:
      # Core API group - fundamental Kubernetes resources
      - apiGroups: [""]
        resources:
          - "events"
          - "namespaces"
          - "nodes"
          - "nodes/proxy"
          - "nodes/metrics"
          - "nodes/spec"
          - "pods"
          - "pods/status"
          - "replicationcontrollers"
          - "replicationcontrollers/status"
          - "resourcequotas"
          - "services"
          - "endpoints"
        verbs: ["get", "list", "watch"]
      # Apps API group - modern workload controllers
      - apiGroups: ["apps"]
        resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
        verbs: ["get", "list", "watch"]
      # Batch API group - job management
      - apiGroups: ["batch"]
        resources: ["jobs", "cronjobs"]
        verbs: ["get", "list", "watch"]
      # Autoscaling API group - HPA metrics (CRITICAL)
      - apiGroups: ["autoscaling"]
        resources: ["horizontalpodautoscalers"]
        verbs: ["get", "list", "watch"]
      # Extensions API group - legacy support
      - apiGroups: ["extensions"]
        resources: ["deployments", "daemonsets", "replicasets"]
        verbs: ["get", "list", "watch"]
      # Metrics API group - resource metrics
      - apiGroups: ["metrics.k8s.io"]
        resources: ["nodes", "pods"]
        verbs: ["get", "list", "watch"]
    clusterRoleBinding:
      annotations: {}
      name: "signoz-otel-collector-bakery-ia"

# Schema Migrator - Manages ClickHouse schema migrations
schemaMigrator:
  enabled: true

  image:
    repository: signoz/signoz-schema-migrator
    # Same version as otelCollector.image.tag
    tag: v0.129.12
    pullPolicy: IfNotPresent

  # Enable Helm hooks for proper upgrade handling
  upgradeHelmHooks: true

# Additional Configuration
# Service account created for (and named after) the SigNoz release
serviceAccount:
  create: true
  annotations: {}
  name: "signoz"

# Security Context: run as the unprivileged UID 1000, with group 1000
# owning mounted volumes
securityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 1000

# Pod Disruption Budgets for HA: keep at least one pod of each component
# available during voluntary disruptions.
# NOTE(review): "frontend" and "queryService" predate the unified signoz
# component (v0.89.0+, see the signoz section) - confirm these entries are
# still honoured by the chart.
podDisruptionBudget:
  frontend:
    enabled: true
    minAvailable: 1
  queryService:
    enabled: true
    minAvailable: 1
  alertmanager:
    enabled: true
    minAvailable: 1
  clickhouse:
    enabled: true
    minAvailable: 1

# Network Policies for security: policies cover both traffic directions
networkPolicy:
  enabled: true
  policyTypes:
    - Ingress
    - Egress

# Monitoring SigNoz itself: ServiceMonitor with a 30-second interval
selfMonitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s