Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -48,9 +48,9 @@ signoz:
signoz_traces_ttl_duration_hrs: "168"
signoz_metrics_ttl_duration_hrs: "168"
signoz_logs_ttl_duration_hrs: "168"
# OpAMP Server Configuration
signoz_opamp_server_enabled: "true"
signoz_opamp_server_endpoint: "0.0.0.0:4320"
# OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
persistence:
enabled: true
@@ -149,9 +149,10 @@ otelCollector:
repository: signoz/signoz-otel-collector
tag: v0.129.12 # Latest recommended version
# OpAMP Configuration - Enabled for dynamic configuration management
# Note: OpAMP allows remote configuration management via SigNoz backend
# This replaces the manual kubectl patch approach
# OpAMP Configuration - DISABLED for development
# OpAMP is designed for production with remote config management
# In dev, it causes gRPC instability and collector reloads
# We use static configuration instead
# Init containers for the Otel Collector pod
initContainers:
@@ -231,6 +232,9 @@ otelCollector:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
# Enable OpAMP for dynamic configuration management
# Disable OpAMP - use static configuration only
# Use 'args' instead of 'extraArgs' to completely override the command
command:
name: /signoz-otel-collector
extraArgs:
args:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
allowed_origins:
- "*"
# Filelog receiver for Kubernetes pod logs
# Tails container stdout/stderr files under /var/log/pods and turns each line
# into a log record whose body is the application message and whose resource
# carries the namespace, pod and container names taken from the file path.
filelog:
include:
- /var/log/pods/*/*/*.log
exclude:
# Exclude SigNoz's own logs to avoid recursive collection
# (pattern is tied to the bakery-ia namespace)
- /var/log/pods/bakery-ia_signoz-*/*/*.log
# The file path is kept as an attribute because the metadata parser below
# reads it from attributes["log.file.path"]
include_file_path: true
include_file_name: false
# Operators run in the order listed
operators:
# Parse CRI-O / containerd log format: "<time> <stream> <logtag> <log>"
- type: regex_parser
regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
# Set the record timestamp from the captured time field
timestamp:
parse_from: attributes.time
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
# Rename the captured time string to attributes.timestamp
# (a plain move; the timestamp itself was already parsed above)
- type: move
from: attributes.time
to: attributes.timestamp
# Extract Kubernetes metadata from file path:
# <namespace>_<pod_name>_<uid>/<container_name>/<restart_count>.log
- type: regex_parser
id: extract_metadata_from_filepath
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
parse_from: attributes["log.file.path"]
# Move metadata to resource attributes
# (uid and restart_count are not moved and remain log attributes)
- type: move
from: attributes.namespace
to: resource["k8s.namespace.name"]
- type: move
from: attributes.pod_name
to: resource["k8s.pod.name"]
- type: move
from: attributes.container_name
to: resource["k8s.container.name"]
# Use the application's own message as the log body
- type: move
from: attributes.log
to: body
# Kubernetes Cluster Receiver - cluster-level metrics
# Reports on nodes, namespaces, pods and other cluster resources
k8s_cluster:
  collection_interval: 30s
  # Node conditions reported for every node
  node_conditions_to_report: [Ready, MemoryPressure, DiskPressure, PIDPressure, NetworkUnavailable]
  # Allocatable resource types reported for every node
  allocatable_types_to_report: [cpu, memory, pods]
# PostgreSQL receivers for database metrics
# ENABLED: Monitor users configured and credentials stored in secrets
# Collects metrics directly from PostgreSQL databases with proper TLS
@@ -538,6 +602,43 @@ otelCollector:
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# Prometheus Receiver - Scrapes metrics from Kubernetes API
# Simplified configuration using only Kubernetes API metrics. Both jobs
# authenticate with the pod's service account token.
# NOTE: the collector expands `${...}` in its configuration as environment
# variables, so a literal `$` inside Prometheus relabel rules must be written
# as `$$` (see the cAdvisor metrics path below).
prometheus:
  config:
    scrape_configs:
      # Per-node cAdvisor metrics, fetched through the API server's node proxy
      - job_name: 'kubernetes-nodes-cadvisor'
        scrape_interval: 30s
        scrape_timeout: 10s
        scheme: https
        tls_config:
          # Skips server certificate verification.
          # NOTE(review): consider ca_file with the service account ca.crt
          # instead - confirm it validates in the target cluster.
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # Copy node labels onto the scraped series
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Send every scrape to the API server instead of the node itself
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          # Build the per-node proxy path; `$$` yields a literal `$`, so
          # Prometheus receives the `${1}` regex backreference
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
      # API server metrics from the default/kubernetes service endpoints
      - job_name: 'kubernetes-apiserver'
        scrape_interval: 30s
        scrape_timeout: 10s
        scheme: https
        tls_config:
          # Skips server certificate verification (see note on the job above)
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          # Keep only the https port of the kubernetes service in default
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
processors:
# Batch processor for better performance (optimized for high throughput)
batch:
@@ -562,6 +663,25 @@ otelCollector:
detectors: [env, system, docker]
timeout: 5s
# Kubernetes attributes processor - CRITICAL for logs
# Enriches telemetry with pod, namespace, node and workload metadata looked up
# from the Kubernetes API for the pod each record is associated with.
k8sattributes:
  auth_type: "serviceAccount"
  passthrough: false
  # Rules for matching a record to a pod, tried in order. Without explicit
  # rules only the connection IP is used, which cannot identify the source
  # pod for logs read from files by the filelog receiver.
  pod_association:
    # Pod logs tailed from /var/log/pods carry these resource attributes
    - sources:
        - from: resource_attribute
          name: k8s.pod.name
        - from: resource_attribute
          name: k8s.namespace.name
    # Senders that put the pod UID or IP on their resource
    - sources:
        - from: resource_attribute
          name: k8s.pod.uid
    - sources:
        - from: resource_attribute
          name: k8s.pod.ip
    # Fallback: peer address of the incoming connection (the default rule)
    - sources:
        - from: connection
  extract:
    # Metadata fields added as resource attributes
    metadata:
      - k8s.pod.name
      - k8s.pod.uid
      - k8s.deployment.name
      - k8s.namespace.name
      - k8s.node.name
      - k8s.container.name
    # Pod labels copied onto the resource
    labels:
      - tag_name: "app"
      - tag_name: "pod-template-hash"
    # Pod annotations copied onto the resource
    annotations:
      - tag_name: "description"
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq]
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection]
exporters: [signozclickhousemetrics]
@@ -653,17 +773,38 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, k8sattributes]
exporters: [clickhouselogsexporter]
# Additional Configuration
serviceAccount:
create: true
annotations: {}
name: ""
name: "signoz-otel-collector"
# RBAC Configuration for Kubernetes monitoring
# Read-only access to the Kubernetes API for the collector's service account.
# Needed by the k8s_cluster receiver, the prometheus receiver's Kubernetes
# service discovery and API server scraping, the k8sattributes processor and,
# if enabled, the kubeletstats receiver.
# NOTE(review): confirm the chart version in use reads `rbac.rules`; some
# chart versions take these rules under a `clusterRole` key instead.
rbac:
  create: true
  rules:
    # Core objects; events, replicationcontrollers and resourcequotas are
    # watched by the k8s_cluster receiver
    - apiGroups: [""]
      resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services",
                  "endpoints", "namespaces", "events",
                  "replicationcontrollers", "resourcequotas"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["apps"]
      resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["batch"]
      resources: ["jobs", "cronjobs"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["extensions"]
      resources: ["deployments", "daemonsets", "replicasets"]
      verbs: ["get", "list", "watch"]
    # Horizontal pod autoscalers are watched by the k8s_cluster receiver
    - apiGroups: ["autoscaling"]
      resources: ["horizontalpodautoscalers"]
      verbs: ["get", "list", "watch"]
    - apiGroups: ["metrics.k8s.io"]
      resources: ["nodes", "pods"]
      verbs: ["get", "list", "watch"]
    # Lets the kubernetes-apiserver scrape job read the API server's /metrics
    - nonResourceURLs: ["/metrics"]
      verbs: ["get"]
# Security Context
securityContext: