Improve monitoring 5

This commit is contained in:
Urtzi Alfaro
2026-01-09 23:14:12 +01:00
parent 22dab143ba
commit c05538cafb
23 changed files with 4737 additions and 1932 deletions

View File

@@ -48,9 +48,9 @@ signoz:
signoz_traces_ttl_duration_hrs: "168"
signoz_metrics_ttl_duration_hrs: "168"
signoz_logs_ttl_duration_hrs: "168"
# OpAMP Server Configuration
signoz_opamp_server_enabled: "true"
signoz_opamp_server_endpoint: "0.0.0.0:4320"
# OpAMP Server Configuration - DISABLED for dev (causes gRPC instability)
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
persistence:
enabled: true
@@ -149,9 +149,10 @@ otelCollector:
repository: signoz/signoz-otel-collector
tag: v0.129.12 # Latest recommended version
# OpAMP Configuration - Enabled for dynamic configuration management
# Note: OpAMP allows remote configuration management via SigNoz backend
# This replaces the manual kubectl patch approach
# OpAMP Configuration - DISABLED for development
# OpAMP is designed for production with remote config management
# In dev, it causes gRPC instability and collector reloads
# We use static configuration instead
# Init containers for the Otel Collector pod
initContainers:
@@ -231,6 +232,9 @@ otelCollector:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
@@ -242,13 +246,16 @@ otelCollector:
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
# Enable OpAMP for dynamic configuration management
# Disable OpAMP - use static configuration only
# Use 'args' instead of 'extraArgs' to completely override the command
command:
name: /signoz-otel-collector
extraArgs:
args:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# OpenTelemetry Collector configuration
@@ -275,6 +282,63 @@ otelCollector:
allowed_origins:
- "*"
# Filelog receiver for Kubernetes pod logs
# Collects container stdout/stderr from /var/log/pods
filelog:
  include:
    - /var/log/pods/*/*/*.log
  exclude:
    # Exclude SigNoz's own logs to avoid recursive collection
    - /var/log/pods/bakery-ia_signoz-*/*/*.log
  include_file_path: true
  include_file_name: false
  operators:
    # Parse the CRI (CRI-O / containerd) line format
    #   <time> <stream> <logtag> <log>
    # and use the leading field as the record timestamp.
    - type: regex_parser
      regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
      timestamp:
        parse_from: attributes.time
        layout: '%Y-%m-%dT%H:%M:%S.%LZ'
    # Keep the raw time string, renamed to attributes.timestamp
    # (the timestamp itself was already parsed by the operator above)
    - type: move
      from: attributes.time
      to: attributes.timestamp
    # Extract Kubernetes metadata from the file path:
    #   /var/log/pods/<namespace>_<pod>_<uid>/<container>/<restart_count>.log
    - type: regex_parser
      id: extract_metadata_from_filepath
      regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
      parse_from: attributes["log.file.path"]
    # Move every captured field to its k8s resource attribute so none of them
    # is left behind as a generic log attribute
    - type: move
      from: attributes.namespace
      to: resource["k8s.namespace.name"]
    - type: move
      from: attributes.pod_name
      to: resource["k8s.pod.name"]
    - type: move
      from: attributes.uid
      to: resource["k8s.pod.uid"]
    - type: move
      from: attributes.container_name
      to: resource["k8s.container.name"]
    - type: move
      from: attributes.restart_count
      to: resource["k8s.container.restart_count"]
    # The remaining CRI payload becomes the log body
    - type: move
      from: attributes.log
      to: body
# Kubernetes Cluster Receiver - Collects cluster-level metrics
# Provides information about nodes, namespaces, pods, and other cluster resources
k8s_cluster:
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- pods
# PostgreSQL receivers for database metrics
# ENABLED: Monitor users configured and credentials stored in secrets
# Collects metrics directly from PostgreSQL databases with proper TLS
@@ -538,6 +602,43 @@ otelCollector:
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# Prometheus Receiver - Scrapes metrics from Kubernetes API
# Simplified configuration using only Kubernetes API metrics
prometheus:
config:
scrape_configs:
- job_name: 'kubernetes-nodes-cadvisor'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
  # Copy every Kubernetes node label onto the scraped series
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  # Scrape through the API server proxy instead of hitting the kubelet directly
  - target_label: __address__
    replacement: kubernetes.default.svc:443
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    # "$$" is the collector's escape for a literal "$": an unescaped "${1}"
    # is treated as a config/env expansion before Prometheus ever sees the
    # capture-group reference.
    replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
processors:
# Batch processor for better performance (optimized for high throughput)
batch:
@@ -562,6 +663,25 @@ otelCollector:
detectors: [env, system, docker]
timeout: 5s
# Kubernetes attributes processor - CRITICAL for logs
# Enriches records with pod / namespace / deployment / node metadata looked up
# from the Kubernetes API (authenticated with the pod's service account)
k8sattributes:
  auth_type: "serviceAccount"
  passthrough: false
  # How a record is matched to a pod. Rules are tried in order and a rule is
  # skipped when the record lacks one of its attributes. File-based logs have
  # no peer connection, so they must be matched on resource attributes; the
  # last two rules keep the IP-based matching used for OTLP senders.
  pod_association:
    - sources:
        - from: resource_attribute
          name: k8s.pod.uid
    - sources:
        - from: resource_attribute
          name: k8s.pod.name
        - from: resource_attribute
          name: k8s.namespace.name
    - sources:
        - from: resource_attribute
          name: k8s.pod.ip
    - sources:
        - from: connection
  extract:
    metadata:
      - k8s.pod.name
      - k8s.pod.uid
      - k8s.deployment.name
      - k8s.namespace.name
      - k8s.node.name
      - k8s.container.name
    # Each rule names the pod label to read (key) and the attribute to write
    # (tag_name)
    labels:
      - tag_name: "app"
        key: "app"
        from: pod
      - tag_name: "pod-template-hash"
        key: "pod-template-hash"
        from: pod
    annotations:
      - tag_name: "description"
        key: "description"
        from: pod
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -643,7 +763,7 @@ otelCollector:
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq]
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection]
exporters: [signozclickhousemetrics]
@@ -653,17 +773,38 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, k8sattributes]
exporters: [clickhouselogsexporter]
# Additional Configuration
serviceAccount:
create: true
annotations: {}
name: ""
name: "signoz-otel-collector"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster and kubeletstats receivers to access Kubernetes API
rbac:
create: true
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
resources: ["deployments", "daemonsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
# Security Context
securityContext:

View File

@@ -66,6 +66,11 @@ signoz:
signoz_traces_ttl_duration_hrs: "720"
signoz_metrics_ttl_duration_hrs: "720"
signoz_logs_ttl_duration_hrs: "720"
# OpAMP Server Configuration
# WARNING: OpAMP can cause gRPC instability and collector reloads
# Only enable if you have a stable OpAMP backend server
signoz_opamp_server_enabled: "false"
# signoz_opamp_server_endpoint: "0.0.0.0:4320"
# SMTP configuration for email alerts
signoz_smtp_enabled: "true"
signoz_smtp_host: "smtp.gmail.com"
@@ -247,17 +252,52 @@ otelCollector:
tag: v0.129.12 # Updated to latest recommended version
pullPolicy: IfNotPresent
# Init containers for the Otel Collector pod
initContainers:
fix-postgres-tls:
enabled: true
image:
registry: docker.io
repository: busybox
tag: 1.35
pullPolicy: IfNotPresent
command:
- sh
- -c
- |
echo "Fixing PostgreSQL TLS file permissions..."
cp /etc/postgres-tls-source/* /etc/postgres-tls/
chmod 600 /etc/postgres-tls/server-key.pem
chmod 644 /etc/postgres-tls/server-cert.pem
chmod 644 /etc/postgres-tls/ca-cert.pem
echo "PostgreSQL TLS permissions fixed"
volumeMounts:
- name: postgres-tls-source
mountPath: /etc/postgres-tls-source
readOnly: true
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
service:
type: ClusterIP
ports:
- name: otlp-grpc
port: 4317
targetPort: 4317
protocol: TCP
- name: otlp-http
port: 4318
targetPort: 4318
protocol: TCP
- name: prometheus
port: 8889
targetPort: 8889
protocol: TCP
- name: metrics
port: 8888
- name: healthcheck
port: 13133
targetPort: 8888
protocol: TCP
resources:
requests:
@@ -267,6 +307,50 @@ otelCollector:
cpu: 2000m
memory: 2Gi
# Additional environment variables for receivers
# These are read by the receiver configs through ${env:...} substitution.
# SECURITY(review): the passwords below are plaintext values committed with
# this file. Presumably they should be injected from Kubernetes Secrets
# instead - confirm, and rotate any value that has already been pushed.
additionalEnvs:
POSTGRES_MONITOR_USER: "monitoring"
POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
RABBITMQ_USER: "bakery"
RABBITMQ_PASSWORD: "forecast123"
# Mount TLS certificates for secure connections
extraVolumes:
- name: redis-tls
secret:
secretName: redis-tls-secret
- name: postgres-tls
secret:
secretName: postgres-tls
- name: postgres-tls-fixed
emptyDir: {}
- name: varlogpods
hostPath:
path: /var/log/pods
extraVolumeMounts:
- name: redis-tls
mountPath: /etc/redis-tls
readOnly: true
- name: postgres-tls
mountPath: /etc/postgres-tls-source
readOnly: true
- name: postgres-tls-fixed
mountPath: /etc/postgres-tls
readOnly: false
- name: varlogpods
mountPath: /var/log/pods
readOnly: true
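# Collector start-up arguments
# NOTE(review): --manager-config points at the OpAMP config although
# signoz_opamp_server_enabled is "false" in this file - confirm OpAMP is meant to stay on here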
command:
name: /signoz-otel-collector
extraArgs:
- --config=/conf/otel-collector-config.yaml
- --manager-config=/conf/otel-collector-opamp-config.yaml
- --feature-gates=-pkg.translator.prometheus.NormalizeName
# Full OTEL Collector Configuration
config:
# Connectors - bridge between pipelines
@@ -297,14 +381,358 @@ otelCollector:
- "https://monitoring.bakewise.ai"
- "https://*.bakewise.ai"
# Filelog receiver for Kubernetes pod logs
# Collects container stdout/stderr from /var/log/pods
filelog:
  include:
    - /var/log/pods/*/*/*.log
  exclude:
    # Exclude SigNoz's own logs to avoid recursive collection
    - /var/log/pods/bakery-ia_signoz-*/*/*.log
  include_file_path: true
  include_file_name: false
  operators:
    # Parse the CRI (CRI-O / containerd) line format
    #   <time> <stream> <logtag> <log>
    # and use the leading field as the record timestamp.
    - type: regex_parser
      regex: '^(?P<time>[^ ]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) (?P<log>.*)$'
      timestamp:
        parse_from: attributes.time
        layout: '%Y-%m-%dT%H:%M:%S.%LZ'
    # Keep the raw time string, renamed to attributes.timestamp
    # (the timestamp itself was already parsed by the operator above)
    - type: move
      from: attributes.time
      to: attributes.timestamp
    # Extract Kubernetes metadata from the file path:
    #   /var/log/pods/<namespace>_<pod>_<uid>/<container>/<restart_count>.log
    - type: regex_parser
      id: extract_metadata_from_filepath
      regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[^\/]+)\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
      parse_from: attributes["log.file.path"]
    # Move every captured field to its k8s resource attribute so none of them
    # is left behind as a generic log attribute
    - type: move
      from: attributes.namespace
      to: resource["k8s.namespace.name"]
    - type: move
      from: attributes.pod_name
      to: resource["k8s.pod.name"]
    - type: move
      from: attributes.uid
      to: resource["k8s.pod.uid"]
    - type: move
      from: attributes.container_name
      to: resource["k8s.container.name"]
    - type: move
      from: attributes.restart_count
      to: resource["k8s.container.restart_count"]
    # The remaining CRI payload becomes the log body
    - type: move
      from: attributes.log
      to: body
# Kubernetes Cluster Receiver - Collects cluster-level metrics
# Provides information about nodes, namespaces, pods, and other cluster resources
k8s_cluster:
collection_interval: 30s
node_conditions_to_report:
- Ready
- MemoryPressure
- DiskPressure
- PIDPressure
- NetworkUnavailable
allocatable_types_to_report:
- cpu
- memory
- pods
# Prometheus receiver for scraping metrics
prometheus:
config:
scrape_configs:
- job_name: 'otel-collector'
- job_name: 'kubernetes-nodes-cadvisor'
scrape_interval: 30s
static_configs:
- targets: ['localhost:8888']
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
  # Copy every Kubernetes node label onto the scraped series
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  # Scrape through the API server proxy instead of hitting the kubelet directly
  - target_label: __address__
    replacement: kubernetes.default.svc:443
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    # "$$" is the collector's escape for a literal "$": an unescaped "${1}"
    # is treated as a config/env expansion before Prometheus ever sees the
    # capture-group reference.
    replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiserver'
scrape_interval: 30s
scrape_timeout: 10s
scheme: https
tls_config:
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
# Redis receiver for cache metrics
# ENABLED: Using existing credentials from redis-secrets with TLS
redis:
endpoint: redis-service.bakery-ia:6379
password: ${env:REDIS_PASSWORD}
collection_interval: 60s
transport: tcp
tls:
insecure_skip_verify: false
cert_file: /etc/redis-tls/redis-cert.pem
key_file: /etc/redis-tls/redis-key.pem
ca_file: /etc/redis-tls/ca-cert.pem
metrics:
redis.maxmemory:
enabled: true
redis.cmd.latency:
enabled: true
# RabbitMQ receiver via management API
# ENABLED: Using existing credentials from rabbitmq-secrets
rabbitmq:
endpoint: http://rabbitmq-service.bakery-ia:15672
username: ${env:RABBITMQ_USER}
password: ${env:RABBITMQ_PASSWORD}
collection_interval: 30s
# PostgreSQL receivers for database metrics
# Monitor all databases with proper TLS configuration
postgresql/auth:
endpoint: auth-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- auth_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/inventory:
endpoint: inventory-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- inventory_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/orders:
endpoint: orders-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- orders_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/ai-insights:
endpoint: ai-insights-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- ai_insights_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/alert-processor:
endpoint: alert-processor-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- alert_processor_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/distribution:
endpoint: distribution-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- distribution_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/external:
endpoint: external-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- external_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/forecasting:
endpoint: forecasting-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- forecasting_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/notification:
endpoint: notification-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- notification_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/orchestrator:
endpoint: orchestrator-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- orchestrator_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/pos:
endpoint: pos-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- pos_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/procurement:
endpoint: procurement-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- procurement_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/production:
endpoint: production-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- production_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/recipes:
endpoint: recipes-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- recipes_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/sales:
endpoint: sales-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- sales_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/suppliers:
endpoint: suppliers-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- suppliers_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/tenant:
endpoint: tenant-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- tenant_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
postgresql/training:
endpoint: training-db-service.bakery-ia:5432
username: ${env:POSTGRES_MONITOR_USER}
password: ${env:POSTGRES_MONITOR_PASSWORD}
databases:
- training_db
collection_interval: 60s
tls:
insecure: false
cert_file: /etc/postgres-tls/server-cert.pem
key_file: /etc/postgres-tls/server-key.pem
ca_file: /etc/postgres-tls/ca-cert.pem
processors:
# High-performance batch processing (official recommendation)
@@ -326,7 +754,7 @@ otelCollector:
# Resource detection for K8s
resourcedetection:
detectors: [env, system, docker, kubernetes]
detectors: [env, system, docker]
timeout: 5s
# Add resource attributes
@@ -339,6 +767,26 @@ otelCollector:
value: bakery-ia-prod
action: upsert
# Kubernetes attributes processor - CRITICAL for logs
# Enriches records with pod / namespace / deployment / node metadata looked up
# from the Kubernetes API (authenticated with the pod's service account)
k8sattributes:
  auth_type: "serviceAccount"
  passthrough: false
  # How a record is matched to a pod. Rules are tried in order and a rule is
  # skipped when the record lacks one of its attributes. File-based logs have
  # no peer connection, so they must be matched on resource attributes; the
  # last two rules keep the IP-based matching used for OTLP senders.
  pod_association:
    - sources:
        - from: resource_attribute
          name: k8s.pod.uid
    - sources:
        - from: resource_attribute
          name: k8s.pod.name
        - from: resource_attribute
          name: k8s.namespace.name
    - sources:
        - from: resource_attribute
          name: k8s.pod.ip
    - sources:
        - from: connection
  extract:
    metadata:
      - k8s.pod.name
      - k8s.pod.uid
      - k8s.deployment.name
      - k8s.namespace.name
      - k8s.node.name
      - k8s.container.name
    # Each rule names the pod label to read (key) and the attribute to write
    # (tag_name)
    labels:
      - tag_name: "app"
        key: "app"
        from: pod
      - tag_name: "pod-template-hash"
        key: "pod-template-hash"
        from: pod
      - tag_name: "version"
        key: "version"
        from: pod
    annotations:
      - tag_name: "description"
        key: "description"
        from: pod
# SigNoz span metrics processor with delta aggregation (recommended)
# Generates RED metrics (Rate, Error, Duration) from trace spans
signozspanmetrics/delta:
@@ -354,9 +802,9 @@ otelCollector:
- name: signoz.collector.id
exporters:
# Export to SigNoz ClickHouse
# ClickHouse exporter for traces
# SECURITY(review): the DSN carries the ClickHouse admin password inline, and
# the same credential is repeated in the other ClickHouse exporters of this
# section. Consider ${env:...} substitution (already used for the Redis,
# RabbitMQ and PostgreSQL receivers) - confirm and rotate the password.
clickhousetraces:
datasource: tcp://clickhouse:9000/?database=signoz_traces
datasource: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_traces
timeout: 10s
retry_on_failure:
enabled: true
@@ -364,8 +812,9 @@ otelCollector:
max_interval: 30s
max_elapsed_time: 300s
# ClickHouse exporter for metrics
signozclickhousemetrics:
endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
timeout: 10s
retry_on_failure:
enabled: true
@@ -375,32 +824,32 @@ otelCollector:
# ClickHouse exporter for meter data (usage metrics)
signozclickhousemeter:
dsn: "tcp://clickhouse:9000/?database=signoz_meter"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_meter"
timeout: 45s
sending_queue:
enabled: false
# ClickHouse exporter for logs
clickhouselogsexporter:
dsn: tcp://clickhouse:9000/?database=signoz_logs
dsn: tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/?database=signoz_logs
timeout: 10s
retry_on_failure:
enabled: true
initial_interval: 5s
max_interval: 30s
max_elapsed_time: 300s
# Metadata exporter for service metadata
metadataexporter:
dsn: "tcp://clickhouse:9000/?database=signoz_metadata"
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metadata"
timeout: 10s
cache:
provider: in_memory
# Debug exporter for debugging (replaces deprecated logging exporter)
# Debug exporter for debugging (optional)
debug:
verbosity: detailed
sampling_initial: 2
sampling_thereafter: 500
sampling_initial: 5
sampling_thereafter: 200
service:
extensions: [health_check, zpages]
@@ -411,9 +860,16 @@ otelCollector:
processors: [memory_limiter, batch, signozspanmetrics/delta, resourcedetection, resource]
exporters: [clickhousetraces, metadataexporter, signozmeter]
# Metrics pipeline
# Metrics pipeline - includes all infrastructure receivers
metrics:
receivers: [otlp, prometheus]
receivers: [otlp,
postgresql/auth, postgresql/inventory, postgresql/orders,
postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
postgresql/external, postgresql/forecasting, postgresql/notification,
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
postgresql/production, postgresql/recipes, postgresql/sales,
postgresql/suppliers, postgresql/tenant, postgresql/training,
redis, rabbitmq, k8s_cluster, prometheus]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [signozclickhousemetrics]
@@ -423,10 +879,10 @@ otelCollector:
processors: [batch/meter]
exporters: [signozclickhousemeter]
# Logs pipeline
# Logs pipeline - includes both OTLP and Kubernetes pod logs
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
receivers: [otlp, filelog]
processors: [memory_limiter, batch, resourcedetection, resource, k8sattributes]
exporters: [clickhouselogsexporter]
# HPA for OTEL Collector
@@ -455,6 +911,27 @@ serviceAccount:
annotations: {}
name: "signoz"
# RBAC Configuration for Kubernetes monitoring
# Required for k8s_cluster receiver to access Kubernetes API
rbac:
create: true
rules:
- apiGroups: [""]
resources: ["nodes", "nodes/proxy", "nodes/metrics", "pods", "services", "endpoints", "namespaces"]
verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
resources: ["deployments", "daemonsets", "statefulsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
resources: ["deployments", "daemonsets", "replicasets"]
verbs: ["get", "list", "watch"]
- apiGroups: ["metrics.k8s.io"]
resources: ["nodes", "pods"]
verbs: ["get", "list", "watch"]
# Security Context
securityContext:
runAsNonRoot: true

View File

@@ -15,9 +15,13 @@ data:
LOG_LEVEL: "INFO"
# Observability Settings - SigNoz enabled
# Note: Detailed OTEL configuration is in the OBSERVABILITY section below
ENABLE_TRACING: "true"
ENABLE_METRICS: "true"
ENABLE_LOGS: "true"
ENABLE_OTEL_METRICS: "true"
ENABLE_SYSTEM_METRICS: "true"
OTEL_LOGS_EXPORTER: "otlp"
# Database initialization settings
# IMPORTANT: Services NEVER run migrations - they only verify DB is ready
@@ -384,15 +388,44 @@ data:
# ================================================================
# OBSERVABILITY - SigNoz (Unified Monitoring)
# ================================================================
# OpenTelemetry Configuration - Direct to SigNoz
# IMPORTANT: gRPC endpoints should NOT include http:// prefix
# OpenTelemetry Configuration - Direct to SigNoz OTel Collector
#
# ENDPOINT CONFIGURATION:
# - OTEL_EXPORTER_OTLP_ENDPOINT: Base gRPC endpoint (host:port format, NO http:// prefix)
# Used by traces and metrics (gRPC) by default
# Format: "host:4317" (gRPC port)
#
# PROTOCOL USAGE:
# - Traces: gRPC (port 4317) - High performance, low latency
# - Metrics: gRPC (port 4317) - Efficient batch export
# - Logs: HTTP (port 4318) - The monitoring library always exports logs over OTLP/HTTP
#
# The monitoring library automatically handles:
# - Converting gRPC endpoint (4317) to HTTP endpoint (4318) for logs
# - Adding proper paths (/v1/traces, /v1/metrics, /v1/logs)
# - Protocol prefixes (http:// for HTTP, none for gRPC)
#
# Base OTLP endpoint (gRPC format - used by traces and metrics)
OTEL_EXPORTER_OTLP_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
# Protocol configuration (gRPC is recommended for better performance)
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
# Optional: Signal-specific endpoint overrides (if different from base)
# OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
# OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
# OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: "http://signoz-otel-collector.bakery-ia.svc.cluster.local:4318"
# Optional: Protocol overrides per signal
# OTEL_EXPORTER_OTLP_TRACES_PROTOCOL: "grpc"
# OTEL_EXPORTER_OTLP_METRICS_PROTOCOL: "grpc"
# Note: Logs always use HTTP protocol regardless of this setting
# Resource attributes (added to all telemetry signals)
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
OTEL_LOGS_EXPORTER: "otlp"
# SigNoz Endpoints (v0.106.0+ unified service)
# SigNoz service endpoints (for UI and API access)
SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"

View File

@@ -1,104 +1,170 @@
{
"dashboard": {
"title": "Bakery IA - Alert Management",
"description": "Alert monitoring and management dashboard",
"tags": ["alerts", "monitoring", "management"],
"panels": [
{
"title": "Active Alerts",
"type": "stat",
"query": {
"metric": "alerts_active",
"aggregate": "sum",
"filters": [
{
"key": "severity",
"operator": "=",
"value": "${severity}"
},
{
"key": "status",
"operator": "=",
"value": "firing"
}
]
},
"unit": "number"
},
{
"title": "Alert Rate",
"type": "timeseries",
"query": {
"metric": "alerts_total",
"aggregate": "rate",
"filters": [
{
"key": "severity",
"operator": "=",
"value": "${severity}"
}
]
},
"unit": "alerts/s"
},
{
"title": "Alerts by Severity",
"type": "pie",
"query": {
"metric": "alerts_total",
"aggregate": "sum",
"groupBy": ["severity"],
"filters": [
{
"key": "severity",
"operator": "=",
"value": "${severity}"
}
]
}
},
{
"title": "Alerts by Status",
"type": "pie",
"query": {
"metric": "alerts_total",
"aggregate": "sum",
"groupBy": ["status"],
"filters": [
{
"key": "status",
"operator": "=",
"value": "${status}"
}
]
}
}
],
"variables": [
{
"name": "severity",
"label": "Severity",
"type": "dropdown",
"default": "*",
"values": ["*", "critical", "high", "medium", "low"]
},
{
"name": "status",
"label": "Status",
"type": "dropdown",
"default": "*",
"values": ["*", "firing", "resolved", "acknowledged"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Alert monitoring and management dashboard",
"tags": ["alerts", "monitoring", "management"],
"name": "bakery-ia-alert-management",
"title": "Bakery IA - Alert Management",
"uploadedGrafana": false,
"uuid": "bakery-ia-alerts-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "active-alerts",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "alert-rate",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'alerts_active' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "active-alerts",
"title": "Active Alerts",
"description": "Number of currently active alerts",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "alerts_active",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Active Alerts",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "alert-rate",
"title": "Alert Rate",
"description": "Rate of alerts over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "alerts_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "alerts/s"
}
]
}

View File

@@ -1,102 +1,351 @@
{
"dashboard": {
"title": "Bakery IA - API Performance",
"description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
"tags": ["api", "performance", "rest", "graphql"],
"panels": [
{
"title": "Request Volume",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["api"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
}
]
},
"unit": "req/s"
},
{
"title": "Error Rate",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["api", "status"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
},
{
"key": "status",
"operator": "=~",
"value": "5.."
}
]
},
"unit": "req/s"
},
{
"title": "Average Response Time",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_sum",
"aggregate": "avg",
"groupBy": ["api", "endpoint"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
}
]
},
"unit": "seconds"
},
{
"title": "P95 Latency",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_bucket",
"aggregate": "histogram_quantile",
"quantile": 0.95,
"groupBy": ["api", "endpoint"],
"filters": [
{
"key": "api",
"operator": "=",
"value": "${api}"
}
]
},
"unit": "seconds"
}
],
"variables": [
{
"name": "api",
"label": "API Service",
"type": "dropdown",
"default": "*",
"values": ["*", "gateway-api", "auth-api", "inventory-api", "production-api", "forecasting-api", "procurement-api"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
"tags": ["api", "performance", "rest", "graphql"],
"name": "bakery-ia-api-performance",
"title": "Bakery IA - API Performance",
"uploadedGrafana": false,
"uuid": "bakery-ia-api-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "request-volume",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "error-rate",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "avg-response-time",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "p95-latency",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by API service",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'http_server_requests_seconds_count' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "request-volume",
"title": "Request Volume",
"description": "API request volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{api.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "req/s"
},
{
"id": "error-rate",
"title": "Error Rate",
"description": "API error rate by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=~",
"value": "5.."
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
{
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{api.name}} - {{status_code}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "req/s"
},
{
"id": "avg-response-time",
"title": "Average Response Time",
"description": "Average API response time by endpoint",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "http_server_requests_seconds_sum",
"dataType": "float64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
{
"key": "endpoint",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{api.name}} - {{endpoint}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
},
{
"id": "p95-latency",
"title": "P95 Latency",
"description": "95th percentile latency by endpoint",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "histogram_quantile",
"aggregateAttribute": {
"key": "http_server_requests_seconds_bucket",
"dataType": "float64",
"type": "Histogram",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "api.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
{
"key": "endpoint",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{api.name}} - {{endpoint}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
}
]
}

View File

@@ -1,101 +1,333 @@
{
"dashboard": {
"title": "Bakery IA - Application Performance",
"description": "Application performance monitoring dashboard for Bakery IA microservices",
"tags": ["application", "performance", "apm"],
"panels": [
{
"title": "Request Rate",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "req/s"
},
{
"title": "Error Rate",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service", "status"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "status",
"operator": "=~",
"value": "5.."
}
]
},
"unit": "req/s"
},
{
"title": "Average Response Time",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_sum",
"aggregate": "avg",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "seconds"
},
{
"title": "Throughput",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "rate",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "req/s"
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "forecasting-service", "inventory-service", "production-service", "procurement-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Application performance monitoring dashboard using distributed traces and metrics",
"tags": ["application", "performance", "traces", "apm"],
"name": "bakery-ia-application-performance",
"title": "Bakery IA - Application Performance (APM)",
"uploadedGrafana": false,
"uuid": "bakery-ia-apm-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "latency-p99",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-30m",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "request-rate",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "error-rate",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "avg-duration",
"moved": false,
"static": false
}
}
}
],
"variables": {
"service_name": {
"id": "service-var",
"name": "service_name",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(serviceName) FROM signoz_traces.distributed_signoz_index_v2 ORDER BY serviceName",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "latency-p99",
"title": "P99 Latency",
"description": "99th percentile latency for selected service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "p99",
"aggregateAttribute": {
"key": "duration_ns",
"dataType": "float64",
"type": "",
"isColumn": true
},
"timeAggregation": "avg",
"spaceAggregation": "p99",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "ns"
},
{
"id": "request-rate",
"title": "Request Rate",
"description": "Requests per second for the service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "count",
"aggregateAttribute": {
"key": "",
"dataType": "",
"type": "",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "reqps"
},
{
"id": "error-rate",
"title": "Error Rate",
"description": "Rate of error spans (status_code = STATUS_CODE_ERROR) per second for the service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "count",
"aggregateAttribute": {
"key": "",
"dataType": "",
"type": "",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "STATUS_CODE_ERROR"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "reqps"
},
{
"id": "avg-duration",
"title": "Average Duration",
"description": "Average request duration",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "traces",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "duration_ns",
"dataType": "float64",
"type": "",
"isColumn": true
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service_name}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "ns"
}
]
}

View File

@@ -1,101 +1,425 @@
{
"dashboard": {
"title": "Bakery IA - Database Performance",
"description": "Comprehensive database performance monitoring for PostgreSQL and Redis",
"tags": ["database", "postgresql", "redis", "performance"],
"panels": [
{
"title": "Database Connections",
"type": "timeseries",
"query": {
"metric": "pg_stat_activity_count",
"aggregate": "sum",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
}
]
},
"unit": "number"
},
{
"title": "Active Queries",
"type": "timeseries",
"query": {
"metric": "pg_stat_activity_count",
"aggregate": "sum",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
},
{
"key": "state",
"operator": "=",
"value": "active"
}
]
},
"unit": "number"
},
{
"title": "Database Size",
"type": "timeseries",
"query": {
"metric": "pg_database_size_bytes",
"aggregate": "sum",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
}
]
},
"unit": "bytes"
},
{
"title": "Query Execution Time",
"type": "timeseries",
"query": {
"metric": "pg_stat_statements_total_time",
"aggregate": "avg",
"groupBy": ["datname"],
"filters": [
{
"key": "datname",
"operator": "=",
"value": "${database}"
}
]
},
"unit": "seconds"
}
],
"variables": [
{
"name": "database",
"label": "Database",
"type": "dropdown",
"default": "*",
"values": ["*", "postgresql", "redis"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive database performance monitoring for PostgreSQL, Redis, and RabbitMQ",
"tags": ["database", "postgresql", "redis", "rabbitmq", "performance"],
"name": "bakery-ia-database-performance",
"title": "Bakery IA - Database Performance",
"uploadedGrafana": false,
"uuid": "bakery-ia-db-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "pg-connections",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "pg-db-size",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "redis-connected-clients",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "redis-memory",
"moved": false,
"static": false
},
{
"x": 0,
"y": 6,
"w": 6,
"h": 3,
"i": "rabbitmq-messages",
"moved": false,
"static": false
},
{
"x": 6,
"y": 6,
"w": 6,
"h": 3,
"i": "rabbitmq-consumers",
"moved": false,
"static": false
}
}
}
],
"variables": {
"database": {
"id": "database-var",
"name": "database",
"description": "Filter by PostgreSQL database name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['postgresql.database.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'postgresql.db_size' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "pg-connections",
"title": "PostgreSQL - Active Connections",
"description": "Number of active PostgreSQL connections",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "postgresql.backends",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.database}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{postgresql.database.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "pg-db-size",
"title": "PostgreSQL - Database Size",
"description": "Size of PostgreSQL databases in bytes",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "postgresql.db_size",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.database}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "postgresql.database.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{postgresql.database.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "bytes"
},
{
"id": "redis-connected-clients",
"title": "Redis - Connected Clients",
"description": "Number of clients connected to Redis",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "redis.clients.connected",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "host.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{host.name}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "redis-memory",
"title": "Redis - Memory Usage",
"description": "Redis memory usage in bytes",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "redis.memory.used",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "host.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{host.name}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "bytes"
},
{
"id": "rabbitmq-messages",
"title": "RabbitMQ - Current Messages",
"description": "Number of messages currently in RabbitMQ queues",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "rabbitmq.message.current",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "queue",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "Queue: {{queue}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "rabbitmq-consumers",
"title": "RabbitMQ - Consumer Count",
"description": "Number of consumers per queue",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "rabbitmq.consumer.count",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "queue",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "Queue: {{queue}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
]
}

View File

@@ -1,105 +1,348 @@
{
"dashboard": {
"title": "Bakery IA - Error Tracking",
"description": "Comprehensive error tracking and analysis dashboard",
"tags": ["errors", "exceptions", "tracking"],
"panels": [
{
"title": "Total Errors",
"type": "stat",
"query": {
"metric": "error_total",
"aggregate": "sum",
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "Error Rate",
"type": "timeseries",
"query": {
"metric": "error_total",
"aggregate": "rate",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "errors/s"
},
{
"title": "HTTP 5xx Errors",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service", "status"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "status",
"operator": "=~",
"value": "5.."
}
]
},
"unit": "number"
},
{
"title": "HTTP 4xx Errors",
"type": "timeseries",
"query": {
"metric": "http_server_requests_seconds_count",
"aggregate": "sum",
"groupBy": ["service", "status"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "status",
"operator": "=~",
"value": "4.."
}
]
},
"unit": "number"
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive error tracking and analysis dashboard",
"tags": ["errors", "exceptions", "tracking"],
"name": "bakery-ia-error-tracking",
"title": "Bakery IA - Error Tracking",
"uploadedGrafana": false,
"uuid": "bakery-ia-errors-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "total-errors",
"moved": false,
"static": false
},
"refresh": "15s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "error-rate",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "http-5xx",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "http-4xx",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'error_total' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "total-errors",
"title": "Total Errors",
"description": "Total number of errors for the service(s) selected in the service filter",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "error_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Total Errors",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "error-rate",
"title": "Error Rate",
"description": "Error rate over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "error_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "errors/s"
},
{
"id": "http-5xx",
"title": "HTTP 5xx Errors",
"description": "Server errors (5xx status codes)",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=~",
"value": "5.."
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
{
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{serviceName}} - {{status_code}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "number"
},
{
"id": "http-4xx",
"title": "HTTP 4xx Errors",
"description": "Client errors (4xx status codes)",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "http_server_requests_seconds_count",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=~",
"value": "4.."
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
{
"key": "status_code",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{serviceName}} - {{status_code}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "number"
}
]
}

View File

@@ -1,105 +1,423 @@
{
"dashboard": {
"title": "Bakery IA - Infrastructure Monitoring",
"description": "Comprehensive infrastructure monitoring dashboard for Bakery IA system",
"tags": ["infrastructure", "system", "kubernetes"],
"panels": [
{
"title": "CPU Usage",
"type": "timeseries",
"query": {
"metric": "container_cpu_usage_seconds_total",
"aggregate": "sum",
"groupBy": ["namespace"],
"filters": [
"description": "Comprehensive infrastructure monitoring dashboard for Bakery IA Kubernetes cluster",
"tags": ["infrastructure", "kubernetes", "k8s", "system"],
"name": "bakery-ia-infrastructure-monitoring",
"title": "Bakery IA - Infrastructure Monitoring",
"uploadedGrafana": false,
"uuid": "bakery-ia-infra-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "pod-count",
"moved": false,
"static": false
},
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "pod-phase",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "container-restarts",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "node-condition",
"moved": false,
"static": false
},
{
"x": 0,
"y": 6,
"w": 12,
"h": 3,
"i": "deployment-status",
"moved": false,
"static": false
}
],
"variables": {
"namespace": {
"id": "namespace-var",
"name": "namespace",
"description": "Filter by Kubernetes namespace",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'k8s.pod.phase' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": "bakery-ia"
}
},
"widgets": [
{
"id": "pod-count",
"title": "Total Pods",
"description": "Total number of pods in the namespace",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "count",
"aggregateAttribute": {
"key": "k8s.pod.phase",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Total Pods",
"reduceTo": "sum"
}
]
],
"queryFormulas": []
},
"unit": "percent",
"yAxis": {
"min": 0,
"max": 100
}
"queryType": "builder"
},
{
"title": "Memory Usage",
"type": "timeseries",
"query": {
"metric": "container_memory_working_set_bytes",
"aggregate": "sum",
"groupBy": ["namespace"],
"filters": [
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "pod-phase",
"title": "Pod Phase Distribution",
"description": "Pods by phase (Running, Pending, Failed, etc.)",
"isStacked": true,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "k8s.pod.phase",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "phase",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{phase}}",
"reduceTo": "sum"
}
]
],
"queryFormulas": []
},
"unit": "bytes"
"queryType": "builder"
},
{
"title": "Network Traffic",
"type": "timeseries",
"query": {
"metric": "container_network_receive_bytes_total",
"aggregate": "sum",
"groupBy": ["namespace"],
"filters": [
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "container-restarts",
"title": "Container Restarts",
"description": "Container restart count over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "k8s.container.restarts",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "increase",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.pod.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.pod.name}}",
"reduceTo": "sum"
}
]
],
"queryFormulas": []
},
"unit": "bytes"
"queryType": "builder"
},
{
"title": "Pod Status",
"type": "stat",
"query": {
"metric": "kube_pod_status_phase",
"aggregate": "count",
"groupBy": ["phase"],
"filters": [
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "node-condition",
"title": "Node Conditions",
"description": "Node condition status (Ready, MemoryPressure, DiskPressure, etc.)",
"isStacked": true,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"key": "namespace",
"operator": "=",
"value": "bakery-ia"
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "k8s.node.condition_ready",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.node.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.node.name}} Ready",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "deployment-status",
"title": "Deployment Status (Desired vs Available)",
"description": "Deployment replicas: desired vs available",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "k8s.deployment.desired",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.deployment.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.deployment.name}} (desired)",
"reduceTo": "avg"
},
{
"key": "phase",
"operator": "=",
"value": "Running"
"dataSource": "metrics",
"queryName": "B",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "k8s.deployment.available",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "B",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "k8s.deployment.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{k8s.deployment.name}} (available)",
"reduceTo": "avg"
}
]
],
"queryFormulas": []
},
"unit": "number"
}
],
"variables": [
{
"name": "namespace",
"label": "Namespace",
"type": "dropdown",
"default": "bakery-ia",
"values": ["bakery-ia", "default", "kube-system"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
}
}
]
}

View File

@@ -1,99 +1,333 @@
{
"dashboard": {
"title": "Bakery IA - Log Analysis",
"description": "Comprehensive log analysis and search dashboard",
"tags": ["logs", "analysis", "search"],
"panels": [
{
"title": "Log Volume",
"type": "timeseries",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "logs/s"
},
{
"title": "Error Logs",
"type": "timeseries",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
},
{
"key": "level",
"operator": "=",
"value": "error"
}
]
},
"unit": "logs/s"
},
{
"title": "Logs by Level",
"type": "pie",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["level"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
}
},
{
"title": "Logs by Service",
"type": "pie",
"query": {
"metric": "log_lines_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
}
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive log analysis and search dashboard",
"tags": ["logs", "analysis", "search"],
"name": "bakery-ia-log-analysis",
"title": "Bakery IA - Log Analysis",
"uploadedGrafana": false,
"uuid": "bakery-ia-logs-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "log-volume",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "error-logs",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-level",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "logs-by-service",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'log_lines_total' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "log-volume",
"title": "Log Volume",
"description": "Total log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "error-logs",
"title": "Error Logs",
"description": "Error log volume by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "rate",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
},
{
"key": {
"key": "level",
"dataType": "string",
"type": "tag",
"isColumn": false
},
"op": "=",
"value": "error"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}} (errors)",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "logs/s"
},
{
"id": "logs-by-level",
"title": "Logs by Level",
"description": "Distribution of logs by severity level",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "level",
"dataType": "string",
"type": "tag",
"isColumn": false
}
],
"legend": "{{level}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "logs-by-service",
"title": "Logs by Service",
"description": "Distribution of logs by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "pie",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "log_lines_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
}
]
}

View File

@@ -1,92 +1,295 @@
{
"dashboard": {
"title": "Bakery IA - System Health",
"description": "Comprehensive system health monitoring dashboard",
"tags": ["system", "health", "monitoring"],
"panels": [
{
"title": "System Availability",
"type": "stat",
"query": {
"metric": "system_availability",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "percent"
},
{
"title": "Service Health Score",
"type": "stat",
"query": {
"metric": "service_health_score",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "number"
},
{
"title": "CPU Usage",
"type": "timeseries",
"query": {
"metric": "system_cpu_usage",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "percent"
},
{
"title": "Memory Usage",
"type": "timeseries",
"query": {
"metric": "system_memory_usage",
"aggregate": "avg",
"filters": [
{
"key": "namespace",
"operator": "=",
"value": "${namespace}"
}
]
},
"unit": "percent"
}
],
"variables": [
{
"name": "namespace",
"label": "Namespace",
"type": "dropdown",
"default": "bakery-ia",
"values": ["bakery-ia", "default"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "Comprehensive system health monitoring dashboard",
"tags": ["system", "health", "monitoring"],
"name": "bakery-ia-system-health",
"title": "Bakery IA - System Health",
"uploadedGrafana": false,
"uuid": "bakery-ia-health-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "system-availability",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "health-score",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "cpu-usage",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "memory-usage",
"moved": false,
"static": false
}
}
],
"variables": {
"namespace": {
"id": "namespace-var",
"name": "namespace",
"description": "Filter by Kubernetes namespace",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['k8s.namespace.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'system_availability' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": "bakery-ia"
}
},
"widgets": [
{
"id": "system-availability",
"title": "System Availability",
"description": "Overall system availability percentage",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_availability",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "System Availability",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "health-score",
"title": "Service Health Score",
"description": "Overall service health score",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "value",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "service_health_score",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Health Score",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "cpu-usage",
"title": "CPU Usage",
"description": "System CPU usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_cpu_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "CPU Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
},
{
"id": "memory-usage",
"title": "Memory Usage",
"description": "System memory usage over time",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "system_memory_usage",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "k8s.namespace.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.namespace}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [],
"legend": "Memory Usage",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "percent"
}
]
}

View File

@@ -1,96 +1,323 @@
{
"dashboard": {
"title": "Bakery IA - User Activity",
"description": "User activity and behavior monitoring dashboard",
"tags": ["user", "activity", "behavior"],
"panels": [
{
"title": "Active Users",
"type": "timeseries",
"query": {
"metric": "active_users",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "User Sessions",
"type": "timeseries",
"query": {
"metric": "user_sessions_total",
"aggregate": "sum",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "API Calls per User",
"type": "timeseries",
"query": {
"metric": "api_calls_per_user",
"aggregate": "avg",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "number"
},
{
"title": "Session Duration",
"type": "timeseries",
"query": {
"metric": "session_duration_seconds",
"aggregate": "avg",
"groupBy": ["service"],
"filters": [
{
"key": "service",
"operator": "=",
"value": "${service}"
}
]
},
"unit": "seconds"
}
],
"variables": [
{
"name": "service",
"label": "Service",
"type": "dropdown",
"default": "*",
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service"]
}
],
"layout": {
"type": "grid",
"columns": 12,
"gap": [16, 16]
"description": "User activity and behavior monitoring dashboard",
"tags": ["user", "activity", "behavior"],
"name": "bakery-ia-user-activity",
"title": "Bakery IA - User Activity",
"uploadedGrafana": false,
"uuid": "bakery-ia-user-01",
"version": "v4",
"collapsableRowsMigrated": true,
"layout": [
{
"x": 0,
"y": 0,
"w": 6,
"h": 3,
"i": "active-users",
"moved": false,
"static": false
},
"refresh": "30s",
"time": {
"from": "now-1h",
"to": "now"
{
"x": 6,
"y": 0,
"w": 6,
"h": 3,
"i": "user-sessions",
"moved": false,
"static": false
},
{
"x": 0,
"y": 3,
"w": 6,
"h": 3,
"i": "api-calls-per-user",
"moved": false,
"static": false
},
{
"x": 6,
"y": 3,
"w": 6,
"h": 3,
"i": "session-duration",
"moved": false,
"static": false
}
}
],
"variables": {
"service": {
"id": "service-var",
"name": "service",
"description": "Filter by service name",
"type": "QUERY",
"queryValue": "SELECT DISTINCT(resource_attrs['service.name']) as value FROM signoz_metrics.distributed_time_series_v4_1day WHERE metric_name = 'active_users' AND value != '' ORDER BY value",
"customValue": "",
"textboxValue": "",
"showALLOption": true,
"multiSelect": false,
"order": 1,
"modificationUUID": "",
"sort": "ASC",
"selectedValue": null
}
},
"widgets": [
{
"id": "active-users",
"title": "Active Users",
"description": "Number of active users by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "active_users",
"dataType": "int64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "latest",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "service.name",
"dataType": "string",
"type": "resource",
"isColumn": false
}
],
"legend": "{{service.name}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "user-sessions",
"title": "User Sessions",
"description": "Total user sessions by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "sum",
"aggregateAttribute": {
"key": "user_sessions_total",
"dataType": "int64",
"type": "Counter",
"isColumn": false
},
"timeAggregation": "sum",
"spaceAggregation": "sum",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "sum"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "api-calls-per-user",
"title": "API Calls per User",
"description": "Average API calls per user by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "api_calls_per_user",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "none"
},
{
"id": "session-duration",
"title": "Session Duration",
"description": "Average session duration by service",
"isStacked": false,
"nullZeroValues": "zero",
"opacity": "1",
"panelTypes": "graph",
"query": {
"builder": {
"queryData": [
{
"dataSource": "metrics",
"queryName": "A",
"aggregateOperator": "avg",
"aggregateAttribute": {
"key": "session_duration_seconds",
"dataType": "float64",
"type": "Gauge",
"isColumn": false
},
"timeAggregation": "avg",
"spaceAggregation": "avg",
"functions": [],
"filters": {
"items": [
{
"key": {
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
},
"op": "=",
"value": "{{.service}}"
}
],
"op": "AND"
},
"expression": "A",
"disabled": false,
"having": [],
"stepInterval": 60,
"limit": null,
"orderBy": [],
"groupBy": [
{
"key": "serviceName",
"dataType": "string",
"type": "tag",
"isColumn": true
}
],
"legend": "{{serviceName}}",
"reduceTo": "avg"
}
],
"queryFormulas": []
},
"queryType": "builder"
},
"fillSpans": false,
"yAxisUnit": "seconds"
}
]
}