Improve monitoring 2

This commit is contained in:
Urtzi Alfaro
2026-01-09 07:26:11 +01:00
parent 4af860c010
commit 8ca5d9c100
39 changed files with 1035 additions and 376 deletions

View File

@@ -1,11 +1,13 @@
# SigNoz Helm Chart Values - Production Environment
# High-availability configuration with resource optimization
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
#
# Official Chart: https://github.com/SigNoz/charts
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
global:
storageClass: "standard"
storageClass: "standard" # For MicroK8s, use "microk8s-hostpath" or custom storage class
clusterName: "bakery-ia-prod"
domain: "monitoring.bakewise.ai"
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
imagePullSecrets:
@@ -15,43 +17,33 @@ global:
imagePullSecrets:
- dockerhub-creds
# Frontend Configuration
frontend:
# SigNoz Main Component (unified frontend + query service)
# BREAKING CHANGE: v0.89.0+ uses unified component instead of separate frontend/queryService
signoz:
replicaCount: 2
image:
repository: signoz/frontend
tag: 0.52.3
repository: signoz/signoz
tag: v0.106.0 # Latest stable version
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 3301
port: 8080 # HTTP/API port
internalPort: 8085 # Internal gRPC port
# DISABLE built-in ingress - using unified bakery-ingress-prod instead
# Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
ingress:
enabled: true
className: nginx
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
cert-manager.io/cluster-issuer: "letsencrypt-prod"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
hosts:
- host: monitoring.bakewise.ai
paths:
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
tls:
- secretName: signoz-tls
hosts:
- monitoring.bakewise.ai
enabled: false
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 500m
memory: 1Gi
limits:
cpu: 2000m
memory: 4Gi
# Pod Anti-affinity for HA
affinity:
@@ -60,58 +52,27 @@ frontend:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-frontend
matchLabels:
app.kubernetes.io/component: query-service
topologyKey: kubernetes.io/hostname
# Environment variables (new format - replaces configVars)
env:
- name: FRONTEND_REFRESH_INTERVAL
value: "30000"
# Query Service Configuration
queryService:
replicaCount: 2
image:
repository: signoz/query-service
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 8080
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 1000m
memory: 2Gi
# Pod Anti-affinity for HA
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- signoz-query-service
topologyKey: kubernetes.io/hostname
env:
- name: DEPLOYMENT_TYPE
value: "kubernetes-helm"
- name: SIGNOZ_LOCAL_DB_PATH
value: "/var/lib/signoz"
- name: RETENTION_DAYS
value: "30"
signoz_telemetrystore_provider: "clickhouse"
dot_metrics_enabled: "true"
signoz_emailing_enabled: "true"
signoz_alertmanager_provider: "signoz"
# Retention configuration (30 days for prod)
signoz_traces_ttl_duration_hrs: "720"
signoz_metrics_ttl_duration_hrs: "720"
signoz_logs_ttl_duration_hrs: "720"
# SMTP configuration for email alerts
signoz_smtp_enabled: "true"
signoz_smtp_host: "smtp.gmail.com"
signoz_smtp_port: "587"
signoz_smtp_from: "alerts@bakewise.ai"
signoz_smtp_username: "alerts@bakewise.ai"
# Password should be set via secret: signoz_smtp_password
persistence:
enabled: true
@@ -128,7 +89,9 @@ queryService:
# AlertManager Configuration
alertmanager:
enabled: true
replicaCount: 2
image:
repository: signoz/alertmanager
tag: 0.23.5
@@ -140,11 +103,11 @@ alertmanager:
resources:
requests:
cpu: 250m
memory: 512Mi
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 1Gi
memory: 512Mi
# Pod Anti-affinity for HA
affinity:
@@ -210,24 +173,24 @@ alertmanager:
# ClickHouse Configuration - Time Series Database
clickhouse:
replicaCount: 2
enabled: true
installCustomStorageClass: false
image:
registry: docker.io
repository: clickhouse/clickhouse-server
tag: 24.1.2-alpine
tag: 25.5.6 # Updated to official recommended version
pullPolicy: IfNotPresent
service:
type: ClusterIP
httpPort: 8123
tcpPort: 9000
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
cpu: 2000m
memory: 4Gi
# ClickHouse resources (nested config)
clickhouse:
resources:
requests:
cpu: 1000m
memory: 2Gi
limits:
cpu: 4000m
memory: 8Gi
# Pod Anti-affinity for HA
affinity:
@@ -246,50 +209,63 @@ clickhouse:
size: 100Gi
storageClass: "standard"
# ClickHouse configuration
config:
logger:
level: information
max_connections: 4096
max_concurrent_queries: 500
# Data retention (30 days for prod)
merge_tree:
parts_to_delay_insert: 150
parts_to_throw_insert: 300
# Performance tuning
max_memory_usage: 10000000000
max_bytes_before_external_group_by: 20000000000
# Backup configuration
backup:
# Cold storage configuration for better disk space management
coldStorage:
enabled: true
schedule: "0 2 * * *"
retention: 7
defaultKeepFreeSpaceBytes: 10737418240 # Keep 10GB free
ttl:
deleteTTLDays: 30 # Delete data older than 30 days (delete TTL; not a move to cold storage)
# Zookeeper Configuration (required by ClickHouse for coordination)
zookeeper:
enabled: true
replicaCount: 3 # CRITICAL: Always use 3 replicas for production HA
image:
tag: 3.7.1 # Official recommended version
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
persistence:
enabled: true
size: 10Gi
storageClass: "standard"
# OpenTelemetry Collector - Integrated with SigNoz
otelCollector:
enabled: true
replicaCount: 2
image:
repository: signoz/signoz-otel-collector
tag: 0.102.8
tag: v0.129.12 # Updated to latest recommended version
pullPolicy: IfNotPresent
service:
type: ClusterIP
ports:
otlpGrpc: 4317
otlpHttp: 4318
metrics: 8888
healthCheck: 13133
- name: otlp-grpc
port: 4317
- name: otlp-http
port: 4318
- name: metrics
port: 8888
- name: healthcheck
port: 13133
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
cpu: 2000m
memory: 2Gi
# Full OTEL Collector Configuration
config:
@@ -304,7 +280,7 @@ otelCollector:
protocols:
grpc:
endpoint: 0.0.0.0:4317
max_recv_msg_size_mib: 16
max_recv_msg_size_mib: 32 # Increased for larger payloads
http:
endpoint: 0.0.0.0:4318
cors:
@@ -322,19 +298,20 @@ otelCollector:
- targets: ['localhost:8888']
processors:
# High-performance batch processing (official recommendation)
batch:
timeout: 10s
send_batch_size: 2048
send_batch_max_size: 4096
timeout: 1s # Reduced from 10s for faster processing
send_batch_size: 50000 # Increased from 2048 (official recommendation for traces)
send_batch_max_size: 50000
memory_limiter:
check_interval: 1s
limit_mib: 800
spike_limit_mib: 200
limit_mib: 1500 # ~73% of the container memory limit (2Gi = 2048Mi)
spike_limit_mib: 300
# Resource detection for K8s
resourcedetection:
detectors: [env, system, docker]
detectors: [env, system, docker, kubernetes]
timeout: 5s
# Add resource attributes
@@ -347,6 +324,12 @@ otelCollector:
value: bakery-ia-prod
action: upsert
# Span metrics processor for automatic service performance metrics
spanmetrics:
metrics_exporter: signozclickhousemetrics
latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s]
dimensions_cache_size: 100000
exporters:
# Export to SigNoz ClickHouse
clickhousetraces:
@@ -387,8 +370,8 @@ otelCollector:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousetraces, debug]
processors: [memory_limiter, batch, spanmetrics, resourcedetection, resource]
exporters: [clickhousetraces]
metrics:
receivers: [otlp, prometheus]
@@ -398,12 +381,7 @@ otelCollector:
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhouselogsexporter, debug]
# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
enabled: true
mode: deployment
exporters: [clickhouselogsexporter]
# HPA for OTEL Collector
autoscaling:
@@ -413,29 +391,18 @@ otelCollectorDeployment:
targetCPUUtilizationPercentage: 70
targetMemoryUtilizationPercentage: 80
# Node Exporter for infrastructure metrics
nodeExporter:
# Schema Migrator - Manages ClickHouse schema migrations
schemaMigrator:
enabled: true
service:
type: ClusterIP
port: 9100
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# Schemamanager - Manages ClickHouse schema
schemamanager:
enabled: true
image:
repository: signoz/signoz-schema-migrator
tag: 0.52.3
tag: v0.129.12 # Updated to latest version
pullPolicy: IfNotPresent
# Enable Helm hooks for proper upgrade handling
upgradeHelmHooks: true
# Additional Configuration
serviceAccount:
create: true