Improve monitoring 2
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
# SigNoz Helm Chart Values - Production Environment
|
||||
# High-availability configuration with resource optimization
|
||||
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by bakery-ingress-prod
|
||||
#
|
||||
# Official Chart: https://github.com/SigNoz/charts
|
||||
# Install Command: helm install signoz signoz/signoz -n signoz --create-namespace -f signoz-values-prod.yaml
|
||||
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
|
||||
|
||||
global:
|
||||
storageClass: "standard"
|
||||
storageClass: "standard" # For MicroK8s, use "microk8s-hostpath" or custom storage class
|
||||
clusterName: "bakery-ia-prod"
|
||||
domain: "monitoring.bakewise.ai"
|
||||
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
|
||||
imagePullSecrets:
|
||||
@@ -15,43 +17,33 @@ global:
|
||||
imagePullSecrets:
|
||||
- dockerhub-creds
|
||||
|
||||
# Frontend Configuration
|
||||
frontend:
|
||||
# SigNoz Main Component (unified frontend + query service)
|
||||
# BREAKING CHANGE: v0.89.0+ uses unified component instead of separate frontend/queryService
|
||||
signoz:
|
||||
replicaCount: 2
|
||||
|
||||
image:
|
||||
repository: signoz/frontend
|
||||
tag: 0.52.3
|
||||
repository: signoz/signoz
|
||||
tag: v0.106.0 # Latest stable version
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 3301
|
||||
port: 8080 # HTTP/API port
|
||||
internalPort: 8085 # Internal gRPC port
|
||||
|
||||
# DISABLE built-in ingress - using unified bakery-ingress-prod instead
|
||||
# Route configured in infrastructure/kubernetes/overlays/prod/prod-ingress.yaml
|
||||
ingress:
|
||||
enabled: true
|
||||
className: nginx
|
||||
annotations:
|
||||
nginx.ingress.kubernetes.io/rewrite-target: /$2
|
||||
nginx.ingress.kubernetes.io/use-regex: "true"
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
hosts:
|
||||
- host: monitoring.bakewise.ai
|
||||
paths:
|
||||
- path: /signoz(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
tls:
|
||||
- secretName: signoz-tls
|
||||
hosts:
|
||||
- monitoring.bakewise.ai
|
||||
enabled: false
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 2000m
|
||||
memory: 4Gi
|
||||
|
||||
# Pod Anti-affinity for HA
|
||||
affinity:
|
||||
@@ -60,58 +52,27 @@ frontend:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- signoz-frontend
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: query-service
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
||||
# Environment variables (new format - replaces configVars)
|
||||
env:
|
||||
- name: FRONTEND_REFRESH_INTERVAL
|
||||
value: "30000"
|
||||
|
||||
# Query Service Configuration
|
||||
queryService:
|
||||
replicaCount: 2
|
||||
image:
|
||||
repository: signoz/query-service
|
||||
tag: 0.52.3
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 8080
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
|
||||
# Pod Anti-affinity for HA
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- weight: 100
|
||||
podAffinityTerm:
|
||||
labelSelector:
|
||||
matchExpressions:
|
||||
- key: app
|
||||
operator: In
|
||||
values:
|
||||
- signoz-query-service
|
||||
topologyKey: kubernetes.io/hostname
|
||||
|
||||
env:
|
||||
- name: DEPLOYMENT_TYPE
|
||||
value: "kubernetes-helm"
|
||||
- name: SIGNOZ_LOCAL_DB_PATH
|
||||
value: "/var/lib/signoz"
|
||||
- name: RETENTION_DAYS
|
||||
value: "30"
|
||||
signoz_telemetrystore_provider: "clickhouse"
|
||||
dot_metrics_enabled: "true"
|
||||
signoz_emailing_enabled: "true"
|
||||
signoz_alertmanager_provider: "signoz"
|
||||
# Retention configuration (30 days for prod)
|
||||
signoz_traces_ttl_duration_hrs: "720"
|
||||
signoz_metrics_ttl_duration_hrs: "720"
|
||||
signoz_logs_ttl_duration_hrs: "720"
|
||||
# SMTP configuration for email alerts
|
||||
signoz_smtp_enabled: "true"
|
||||
signoz_smtp_host: "smtp.gmail.com"
|
||||
signoz_smtp_port: "587"
|
||||
signoz_smtp_from: "alerts@bakewise.ai"
|
||||
signoz_smtp_username: "alerts@bakewise.ai"
|
||||
# Password should be set via secret: signoz_smtp_password
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
@@ -128,7 +89,9 @@ queryService:
|
||||
|
||||
# AlertManager Configuration
|
||||
alertmanager:
|
||||
enabled: true
|
||||
replicaCount: 2
|
||||
|
||||
image:
|
||||
repository: signoz/alertmanager
|
||||
tag: 0.23.5
|
||||
@@ -140,11 +103,11 @@ alertmanager:
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 250m
|
||||
memory: 512Mi
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 1Gi
|
||||
memory: 512Mi
|
||||
|
||||
# Pod Anti-affinity for HA
|
||||
affinity:
|
||||
@@ -210,24 +173,24 @@ alertmanager:
|
||||
|
||||
# ClickHouse Configuration - Time Series Database
|
||||
clickhouse:
|
||||
replicaCount: 2
|
||||
enabled: true
|
||||
installCustomStorageClass: false
|
||||
|
||||
image:
|
||||
registry: docker.io
|
||||
repository: clickhouse/clickhouse-server
|
||||
tag: 24.1.2-alpine
|
||||
tag: 25.5.6 # Updated to official recommended version
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
httpPort: 8123
|
||||
tcpPort: 9000
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
cpu: 2000m
|
||||
memory: 4Gi
|
||||
# ClickHouse resources (nested config)
|
||||
clickhouse:
|
||||
resources:
|
||||
requests:
|
||||
cpu: 1000m
|
||||
memory: 2Gi
|
||||
limits:
|
||||
cpu: 4000m
|
||||
memory: 8Gi
|
||||
|
||||
# Pod Anti-affinity for HA
|
||||
affinity:
|
||||
@@ -246,50 +209,63 @@ clickhouse:
|
||||
size: 100Gi
|
||||
storageClass: "standard"
|
||||
|
||||
# ClickHouse configuration
|
||||
config:
|
||||
logger:
|
||||
level: information
|
||||
max_connections: 4096
|
||||
max_concurrent_queries: 500
|
||||
# Data retention (30 days for prod)
|
||||
merge_tree:
|
||||
parts_to_delay_insert: 150
|
||||
parts_to_throw_insert: 300
|
||||
# Performance tuning
|
||||
max_memory_usage: 10000000000
|
||||
max_bytes_before_external_group_by: 20000000000
|
||||
|
||||
# Backup configuration
|
||||
backup:
|
||||
# Cold storage configuration for better disk space management
|
||||
coldStorage:
|
||||
enabled: true
|
||||
schedule: "0 2 * * *"
|
||||
retention: 7
|
||||
defaultKeepFreeSpaceBytes: 10737418240 # Keep 10GB free
|
||||
ttl:
|
||||
deleteTTLDays: 30 # Delete data older than 30 days
|
||||
|
||||
# Zookeeper Configuration (required by ClickHouse for coordination)
|
||||
zookeeper:
|
||||
enabled: true
|
||||
replicaCount: 3 # CRITICAL: Always use 3 replicas for production HA
|
||||
|
||||
image:
|
||||
tag: 3.7.1 # Official recommended version
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
size: 10Gi
|
||||
storageClass: "standard"
|
||||
|
||||
# OpenTelemetry Collector - Integrated with SigNoz
|
||||
otelCollector:
|
||||
enabled: true
|
||||
replicaCount: 2
|
||||
|
||||
image:
|
||||
repository: signoz/signoz-otel-collector
|
||||
tag: 0.102.8
|
||||
tag: v0.129.12 # Updated to latest recommended version
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
service:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
otlpGrpc: 4317
|
||||
otlpHttp: 4318
|
||||
metrics: 8888
|
||||
healthCheck: 13133
|
||||
- name: otlp-grpc
|
||||
port: 4317
|
||||
- name: otlp-http
|
||||
port: 4318
|
||||
- name: metrics
|
||||
port: 8888
|
||||
- name: healthcheck
|
||||
port: 13133
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 1Gi
|
||||
cpu: 2000m
|
||||
memory: 2Gi
|
||||
|
||||
# Full OTEL Collector Configuration
|
||||
config:
|
||||
@@ -304,7 +280,7 @@ otelCollector:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
max_recv_msg_size_mib: 16
|
||||
max_recv_msg_size_mib: 32 # Increased for larger payloads
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
cors:
|
||||
@@ -322,19 +298,20 @@ otelCollector:
|
||||
- targets: ['localhost:8888']
|
||||
|
||||
processors:
|
||||
# High-performance batch processing (official recommendation)
|
||||
batch:
|
||||
timeout: 10s
|
||||
send_batch_size: 2048
|
||||
send_batch_max_size: 4096
|
||||
timeout: 1s # Reduced from 10s for faster processing
|
||||
send_batch_size: 50000 # Increased from 2048 (official recommendation for traces)
|
||||
send_batch_max_size: 50000
|
||||
|
||||
memory_limiter:
|
||||
check_interval: 1s
|
||||
limit_mib: 800
|
||||
spike_limit_mib: 200
|
||||
limit_mib: 1500 # ~75% of the container memory limit (2Gi = 2048Mi)
|
||||
spike_limit_mib: 300
|
||||
|
||||
# Resource detection for K8s
|
||||
resourcedetection:
|
||||
detectors: [env, system, docker]
|
||||
detectors: [env, system, docker, kubernetes]
|
||||
timeout: 5s
|
||||
|
||||
# Add resource attributes
|
||||
@@ -347,6 +324,12 @@ otelCollector:
|
||||
value: bakery-ia-prod
|
||||
action: upsert
|
||||
|
||||
# Span metrics processor for automatic service performance metrics
|
||||
spanmetrics:
|
||||
metrics_exporter: signozclickhousemetrics
|
||||
latency_histogram_buckets: [2ms, 4ms, 6ms, 8ms, 10ms, 50ms, 100ms, 200ms, 400ms, 800ms, 1s, 1400ms, 2s, 5s, 10s, 15s]
|
||||
dimensions_cache_size: 100000
|
||||
|
||||
exporters:
|
||||
# Export to SigNoz ClickHouse
|
||||
clickhousetraces:
|
||||
@@ -387,8 +370,8 @@ otelCollector:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch, resourcedetection, resource]
|
||||
exporters: [clickhousetraces, debug]
|
||||
processors: [memory_limiter, batch, spanmetrics, resourcedetection, resource]
|
||||
exporters: [clickhousetraces]
|
||||
|
||||
metrics:
|
||||
receivers: [otlp, prometheus]
|
||||
@@ -398,12 +381,7 @@ otelCollector:
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch, resourcedetection, resource]
|
||||
exporters: [clickhouselogsexporter, debug]
|
||||
|
||||
# OpenTelemetry Collector Deployment Mode
|
||||
otelCollectorDeployment:
|
||||
enabled: true
|
||||
mode: deployment
|
||||
exporters: [clickhouselogsexporter]
|
||||
|
||||
# HPA for OTEL Collector
|
||||
autoscaling:
|
||||
@@ -413,29 +391,18 @@ otelCollectorDeployment:
|
||||
targetCPUUtilizationPercentage: 70
|
||||
targetMemoryUtilizationPercentage: 80
|
||||
|
||||
# Node Exporter for infrastructure metrics
|
||||
nodeExporter:
|
||||
# Schema Migrator - Manages ClickHouse schema migrations
|
||||
schemaMigrator:
|
||||
enabled: true
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 9100
|
||||
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
|
||||
# Schemamanager - Manages ClickHouse schema
|
||||
schemamanager:
|
||||
enabled: true
|
||||
image:
|
||||
repository: signoz/signoz-schema-migrator
|
||||
tag: 0.52.3
|
||||
tag: v0.129.12 # Updated to latest version
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
# Enable Helm hooks for proper upgrade handling
|
||||
upgradeHelmHooks: true
|
||||
|
||||
# Additional Configuration
|
||||
serviceAccount:
|
||||
create: true
|
||||
|
||||
Reference in New Issue
Block a user