Improve monitoring 5
This commit is contained in:
@@ -48,9 +48,9 @@ signoz:
|
||||
signoz_traces_ttl_duration_hrs: "168"
|
||||
signoz_metrics_ttl_duration_hrs: "168"
|
||||
signoz_logs_ttl_duration_hrs: "168"
|
||||
# OpAMP Server Configuration
|
||||
signoz_opamp_server_enabled: "true"
|
||||
signoz_opamp_server_endpoint: "0.0.0.0:4320"
|
||||
# OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
|
||||
signoz_opamp_server_enabled: "false"
|
||||
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
|
||||
|
||||
persistence:
|
||||
enabled: true
|
||||
@@ -149,9 +149,10 @@ otelCollector:
|
||||
repository: signoz/signoz-otel-collector
|
||||
tag: v0.129.12 # Latest recommended version
|
||||
|
||||
# OpAMP Configuration - Enabled for dynamic configuration management
|
||||
# Note: OpAMP allows remote configuration management via SigNoz backend
|
||||
# This replaces the manual kubectl patch approach
|
||||
# OpAMP Configuration - DISABLED for development
|
||||
# OpAMP is designed for production with remote config management
|
||||
# In dev, it causes gRPC instability and collector reloads
|
||||
# We use static configuration instead
|
||||
|
||||
# Init containers for the Otel Collector pod
|
||||
initContainers:
|
||||
@@ -231,6 +232,9 @@ otelCollector:
|
||||
secretName: postgres-tls
|
||||
- name: postgres-tls-fixed
|
||||
emptyDir: {}
|
||||
- name: varlogpods
|
||||
hostPath:
|
||||
path: /var/log/pods
|
||||
|
||||
extraVolumeMounts:
|
||||
- name: redis-tls
|
||||
@@ -242,13 +246,16 @@ otelCollector:
|
||||
- name: postgres-tls-fixed
|
||||
mountPath: /etc/postgres-tls
|
||||
readOnly: false
|
||||
- name: varlogpods
|
||||
mountPath: /var/log/pods
|
||||
readOnly: true
|
||||
|
||||
# Enable OpAMP for dynamic configuration management
|
||||
# Disable OpAMP - use static configuration only
|
||||
# Use 'args' instead of 'extraArgs' to completely override the command
|
||||
command:
|
||||
name: /signoz-otel-collector
|
||||
extraArgs:
|
||||
args:
|
||||
- --config=/conf/otel-collector-config.yaml
|
||||
- --manager-config=/conf/otel-collector-opamp-config.yaml
|
||||
- --feature-gates=-pkg.translator.prometheus.NormalizeName
|
||||
|
||||
# OpenTelemetry Collector configuration
|
||||
@@ -275,6 +282,63 @@ otelCollector:
|
||||
allowed_origins:
|
||||
- "*"
|
||||
|
||||
# Filelog receiver for Kubernetes pod logs
|
||||
# Collects container stdout/stderr from /var/log/pods
|
||||
filelog:
|
||||
include:
|
||||
- /var/log/pods/*/*/*.log
|
||||
exclude:
|
||||
# Exclude SigNoz's own logs to avoid recursive collection
|
||||
- /var/log/pods/bakery-ia_signoz-*/*/*.log
|
||||
include_file_path: true
|
||||
include_file_name: false
|
||||
operators:
|
||||
# Parse CRI-O / containerd log format
|
||||
- type: regex_parser
|
||||
regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
|
||||
timestamp:
|
||||
parse_from: attributes.time
|
||||
layout: '%Y-%m-%dT%H:%M:%S.%LZ'
|
||||
# Keep the raw CRI time string by renaming attributes.time to attributes.timestamp
|
||||
- type: move
|
||||
from: attributes.time
|
||||
to: attributes.timestamp
|
||||
# Extract Kubernetes metadata from file path
|
||||
- type: regex_parser
|
||||
id: extract_metadata_from_filepath
|
||||
regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
|
||||
parse_from: attributes["log.file.path"]
|
||||
# Move metadata to resource attributes
|
||||
- type: move
|
||||
from: attributes.namespace
|
||||
to: resource["k8s.namespace.name"]
|
||||
- type: move
|
||||
from: attributes.pod_name
|
||||
to: resource["k8s.pod.name"]
|
||||
- type: move
|
||||
from: attributes.container_name
|
||||
to: resource["k8s.container.name"]
|
||||
- type: move
|
||||
from: attributes.log
|
||||
to: body
|
||||
|
||||
# Kubernetes Cluster Receiver - Collects cluster-level metrics
|
||||
# Provides information about nodes, namespaces, pods, and other cluster resources
|
||||
k8s_cluster:
|
||||
collection_interval: 30s
|
||||
node_conditions_to_report:
|
||||
- Ready
|
||||
- MemoryPressure
|
||||
- DiskPressure
|
||||
- PIDPressure
|
||||
- NetworkUnavailable
|
||||
allocatable_types_to_report:
|
||||
- cpu
|
||||
- memory
|
||||
- pods
|
||||
|
||||
|
||||
|
||||
# PostgreSQL receivers for database metrics
|
||||
# ENABLED: Monitor users configured and credentials stored in secrets
|
||||
# Collects metrics directly from PostgreSQL databases with proper TLS
|
||||
@@ -538,6 +602,43 @@ otelCollector:
|
||||
password: ${env:RABBITMQ_PASSWORD}
|
||||
collection_interval: 30s
|
||||
|
||||
# Prometheus Receiver - Scrapes metrics from Kubernetes API
|
||||
# Simplified configuration using only Kubernetes API metrics
|
||||
prometheus:
|
||||
config:
|
||||
scrape_configs:
|
||||
- job_name: 'kubernetes-nodes-cadvisor'
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
scheme: https
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
kubernetes_sd_configs:
|
||||
- role: node
|
||||
relabel_configs:
|
||||
- action: labelmap
|
||||
regex: __meta_kubernetes_node_label_(.+)
|
||||
- target_label: __address__
|
||||
replacement: kubernetes.default.svc:443
|
||||
- source_labels: [__meta_kubernetes_node_name]
|
||||
regex: (.+)
|
||||
target_label: __metrics_path__
|
||||
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
|
||||
- job_name: 'kubernetes-apiserver'
|
||||
scrape_interval: 30s
|
||||
scrape_timeout: 10s
|
||||
scheme: https
|
||||
tls_config:
|
||||
insecure_skip_verify: true
|
||||
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
kubernetes_sd_configs:
|
||||
- role: endpoints
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
|
||||
action: keep
|
||||
regex: default;kubernetes;https
|
||||
|
||||
processors:
|
||||
# Batch processor for better performance (optimized for high throughput)
|
||||
batch:
|
||||
@@ -562,6 +663,25 @@ otelCollector:
|
||||
detectors: [env, system, docker]
|
||||
timeout: 5s
|
||||
|
||||
# Kubernetes attributes processor - CRITICAL for logs
|
||||
# Enriches telemetry with pod, namespace, and container metadata looked up via the Kubernetes API
|
||||
k8sattributes:
|
||||
auth_type: "serviceAccount"
|
||||
passthrough: false
|
||||
extract:
|
||||
metadata:
|
||||
- k8s.pod.name
|
||||
- k8s.pod.uid
|
||||
- k8s.deployment.name
|
||||
- k8s.namespace.name
|
||||
- k8s.node.name
|
||||
- k8s.container.name
|
||||
labels:
|
||||
- tag_name: "app"
|
||||
- tag_name: "pod-template-hash"
|
||||
annotations:
|
||||
- tag_name: "description"
|
||||
|
||||
# SigNoz span metrics processor with delta aggregation (recommended)
|
||||
# Generates RED metrics (Rate, Error, Duration) from trace spans
|
||||
signozspanmetrics/delta:
|
||||
@@ -643,7 +763,7 @@ otelCollector:
|
||||
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
|
||||
postgresql/production, postgresql/recipes, postgresql/sales,
|
||||
postgresql/suppliers, postgresql/tenant, postgresql/training,
|
||||
redis, rabbitmq]
|
||||
redis, rabbitmq, k8s_cluster, prometheus]
|
||||
processors: [memory_limiter, batch, resourcedetection]
|
||||
exporters: [signozclickhousemetrics]
|
||||
|
||||
@@ -653,17 +773,38 @@ otelCollector:
|
||||
processors: [batch/meter]
|
||||
exporters: [signozclickhousemeter]
|
||||
|
||||
# Logs pipeline
|
||||
# Logs pipeline - includes both OTLP and Kubernetes pod logs
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch, resourcedetection]
|
||||
receivers: [otlp, filelog]
|
||||
processors: [memory_limiter, batch, resourcedetection, k8sattributes]
|
||||
exporters: [clickhouselogsexporter]
|
||||
|
||||
# Additional Configuration
|
||||
serviceAccount:
|
||||
create: true
|
||||
annotations: {}
|
||||
name: ""
|
||||
name: "signoz-otel-collector"
|
||||
|
||||
# RBAC Configuration for Kubernetes monitoring
|
||||
# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
|
||||
rbac:
|
||||
create: true
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: ["apps"]
|
||||
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs", "cronjobs"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: ["extensions"]
|
||||
resources: ["deployments", "daemonsets", "replicasets"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: ["metrics.k8s.io"]
|
||||
resources: ["nodes", "pods"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
|
||||
# Security Context
|
||||
securityContext:
|
||||
|
||||
Reference in New Issue
Block a user