Imporve monitoring 4
This commit is contained in:
@@ -48,6 +48,9 @@ signoz:
|
|||||||
signoz_traces_ttl_duration_hrs: "168"
|
signoz_traces_ttl_duration_hrs: "168"
|
||||||
signoz_metrics_ttl_duration_hrs: "168"
|
signoz_metrics_ttl_duration_hrs: "168"
|
||||||
signoz_logs_ttl_duration_hrs: "168"
|
signoz_logs_ttl_duration_hrs: "168"
|
||||||
|
# OpAMP Server Configuration
|
||||||
|
signoz_opamp_server_enabled: "true"
|
||||||
|
signoz_opamp_server_endpoint: "0.0.0.0:4320"
|
||||||
|
|
||||||
persistence:
|
persistence:
|
||||||
enabled: true
|
enabled: true
|
||||||
@@ -146,9 +149,36 @@ otelCollector:
|
|||||||
repository: signoz/signoz-otel-collector
|
repository: signoz/signoz-otel-collector
|
||||||
tag: v0.129.12 # Latest recommended version
|
tag: v0.129.12 # Latest recommended version
|
||||||
|
|
||||||
# NOTE: OpAMP is disabled via kubectl patch on the deployment
|
# OpAMP Configuration - Enabled for dynamic configuration management
|
||||||
# Cannot disable via Helm values as extraArgs appends instead of replaces
|
# Note: OpAMP allows remote configuration management via SigNoz backend
|
||||||
# Patch command: kubectl patch deployment signoz-otel-collector --type=json -p='[{"op":"replace","path":"/spec/template/spec/containers/0/args","value":["--config=/conf/otel-collector-config.yaml","--feature-gates=-pkg.translator.prometheus.NormalizeName"]}]'
|
# This replaces the manual kubectl patch approach
|
||||||
|
|
||||||
|
# Init containers for the Otel Collector pod
|
||||||
|
initContainers:
|
||||||
|
fix-postgres-tls:
|
||||||
|
enabled: true
|
||||||
|
image:
|
||||||
|
registry: docker.io
|
||||||
|
repository: busybox
|
||||||
|
tag: 1.35
|
||||||
|
pullPolicy: IfNotPresent
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
echo "Fixing PostgreSQL TLS file permissions..."
|
||||||
|
cp /etc/postgres-tls-source/* /etc/postgres-tls/
|
||||||
|
chmod 600 /etc/postgres-tls/server-key.pem
|
||||||
|
chmod 644 /etc/postgres-tls/server-cert.pem
|
||||||
|
chmod 644 /etc/postgres-tls/ca-cert.pem
|
||||||
|
echo "PostgreSQL TLS permissions fixed"
|
||||||
|
volumeMounts:
|
||||||
|
- name: postgres-tls-source
|
||||||
|
mountPath: /etc/postgres-tls-source
|
||||||
|
readOnly: true
|
||||||
|
- name: postgres-tls-fixed
|
||||||
|
mountPath: /etc/postgres-tls
|
||||||
|
readOnly: false
|
||||||
|
|
||||||
# Service configuration - expose both gRPC and HTTP endpoints
|
# Service configuration - expose both gRPC and HTTP endpoints
|
||||||
service:
|
service:
|
||||||
@@ -183,6 +213,44 @@ otelCollector:
|
|||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 512Mi
|
memory: 512Mi
|
||||||
|
|
||||||
|
# Additional environment variables for receivers
|
||||||
|
additionalEnvs:
|
||||||
|
POSTGRES_MONITOR_USER: "monitoring"
|
||||||
|
POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca"
|
||||||
|
REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k"
|
||||||
|
RABBITMQ_USER: "bakery"
|
||||||
|
RABBITMQ_PASSWORD: "forecast123"
|
||||||
|
|
||||||
|
# Mount TLS certificates for secure connections
|
||||||
|
extraVolumes:
|
||||||
|
- name: redis-tls
|
||||||
|
secret:
|
||||||
|
secretName: redis-tls-secret
|
||||||
|
- name: postgres-tls
|
||||||
|
secret:
|
||||||
|
secretName: postgres-tls
|
||||||
|
- name: postgres-tls-fixed
|
||||||
|
emptyDir: {}
|
||||||
|
|
||||||
|
extraVolumeMounts:
|
||||||
|
- name: redis-tls
|
||||||
|
mountPath: /etc/redis-tls
|
||||||
|
readOnly: true
|
||||||
|
- name: postgres-tls
|
||||||
|
mountPath: /etc/postgres-tls-source
|
||||||
|
readOnly: true
|
||||||
|
- name: postgres-tls-fixed
|
||||||
|
mountPath: /etc/postgres-tls
|
||||||
|
readOnly: false
|
||||||
|
|
||||||
|
# Enable OpAMP for dynamic configuration management
|
||||||
|
command:
|
||||||
|
name: /signoz-otel-collector
|
||||||
|
extraArgs:
|
||||||
|
- --config=/conf/otel-collector-config.yaml
|
||||||
|
- --manager-config=/conf/otel-collector-opamp-config.yaml
|
||||||
|
- --feature-gates=-pkg.translator.prometheus.NormalizeName
|
||||||
|
|
||||||
# OpenTelemetry Collector configuration
|
# OpenTelemetry Collector configuration
|
||||||
config:
|
config:
|
||||||
# Connectors - bridge between pipelines
|
# Connectors - bridge between pipelines
|
||||||
@@ -208,62 +276,267 @@ otelCollector:
|
|||||||
- "*"
|
- "*"
|
||||||
|
|
||||||
# PostgreSQL receivers for database metrics
|
# PostgreSQL receivers for database metrics
|
||||||
# DISABLED: Monitor users not configured yet
|
# ENABLED: Monitor users configured and credentials stored in secrets
|
||||||
# Collects metrics directly from PostgreSQL databases
|
# Collects metrics directly from PostgreSQL databases with proper TLS
|
||||||
# postgresql/auth:
|
postgresql/auth:
|
||||||
# endpoint: auth-db-service.bakery-ia:5432
|
endpoint: auth-db-service.bakery-ia:5432
|
||||||
# username: ${POSTGRES_MONITOR_USER}
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
# password: ${POSTGRES_MONITOR_PASSWORD}
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
# databases:
|
databases:
|
||||||
# - auth_db
|
- auth_db
|
||||||
# collection_interval: 60s
|
collection_interval: 60s
|
||||||
# tls:
|
tls:
|
||||||
# insecure: false
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
# postgresql/inventory:
|
postgresql/inventory:
|
||||||
# endpoint: inventory-db-service.bakery-ia:5432
|
endpoint: inventory-db-service.bakery-ia:5432
|
||||||
# username: ${POSTGRES_MONITOR_USER}
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
# password: ${POSTGRES_MONITOR_PASSWORD}
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
# databases:
|
databases:
|
||||||
# - inventory_db
|
- inventory_db
|
||||||
# collection_interval: 60s
|
collection_interval: 60s
|
||||||
# tls:
|
tls:
|
||||||
# insecure: false
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
# postgresql/orders:
|
postgresql/orders:
|
||||||
# endpoint: orders-db-service.bakery-ia:5432
|
endpoint: orders-db-service.bakery-ia:5432
|
||||||
# username: ${POSTGRES_MONITOR_USER}
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
# password: ${POSTGRES_MONITOR_PASSWORD}
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
# databases:
|
databases:
|
||||||
# - orders_db
|
- orders_db
|
||||||
# collection_interval: 60s
|
collection_interval: 60s
|
||||||
# tls:
|
tls:
|
||||||
# insecure: false
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
# Add more PostgreSQL databases as needed
|
postgresql/ai-insights:
|
||||||
# postgresql/SERVICE:
|
endpoint: ai-insights-db-service.bakery-ia:5432
|
||||||
# endpoint: SERVICE-db-service.bakery-ia:5432
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
# ...
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- ai_insights_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/alert-processor:
|
||||||
|
endpoint: alert-processor-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- alert_processor_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/distribution:
|
||||||
|
endpoint: distribution-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- distribution_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/external:
|
||||||
|
endpoint: external-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- external_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/forecasting:
|
||||||
|
endpoint: forecasting-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- forecasting_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/notification:
|
||||||
|
endpoint: notification-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- notification_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/orchestrator:
|
||||||
|
endpoint: orchestrator-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- orchestrator_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/pos:
|
||||||
|
endpoint: pos-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- pos_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/procurement:
|
||||||
|
endpoint: procurement-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- procurement_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/production:
|
||||||
|
endpoint: production-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- production_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/recipes:
|
||||||
|
endpoint: recipes-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- recipes_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/sales:
|
||||||
|
endpoint: sales-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- sales_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/suppliers:
|
||||||
|
endpoint: suppliers-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- suppliers_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/tenant:
|
||||||
|
endpoint: tenant-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- tenant_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
|
postgresql/training:
|
||||||
|
endpoint: training-db-service.bakery-ia:5432
|
||||||
|
username: ${env:POSTGRES_MONITOR_USER}
|
||||||
|
password: ${env:POSTGRES_MONITOR_PASSWORD}
|
||||||
|
databases:
|
||||||
|
- training_db
|
||||||
|
collection_interval: 60s
|
||||||
|
tls:
|
||||||
|
insecure: false
|
||||||
|
cert_file: /etc/postgres-tls/server-cert.pem
|
||||||
|
key_file: /etc/postgres-tls/server-key.pem
|
||||||
|
ca_file: /etc/postgres-tls/ca-cert.pem
|
||||||
|
|
||||||
# Redis receiver for cache metrics
|
# Redis receiver for cache metrics
|
||||||
# DISABLED: TLS certificates not configured yet
|
# ENABLED: Using existing credentials from redis-secrets with TLS
|
||||||
# redis:
|
redis:
|
||||||
# endpoint: redis-service.bakery-ia:6379
|
endpoint: redis-service.bakery-ia:6379
|
||||||
# password: ${REDIS_PASSWORD}
|
password: ${env:REDIS_PASSWORD}
|
||||||
# collection_interval: 60s
|
collection_interval: 60s
|
||||||
# tls:
|
transport: tcp
|
||||||
# insecure: false
|
tls:
|
||||||
# cert_file: /etc/redis-tls/redis-cert.pem
|
insecure_skip_verify: false
|
||||||
# key_file: /etc/redis-tls/redis-key.pem
|
cert_file: /etc/redis-tls/redis-cert.pem
|
||||||
# ca_file: /etc/redis-tls/ca-cert.pem
|
key_file: /etc/redis-tls/redis-key.pem
|
||||||
|
ca_file: /etc/redis-tls/ca-cert.pem
|
||||||
|
metrics:
|
||||||
|
redis.maxmemory:
|
||||||
|
enabled: true
|
||||||
|
redis.cmd.latency:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
# RabbitMQ receiver via management API
|
# RabbitMQ receiver via management API
|
||||||
# DISABLED: RabbitMQ credentials not configured yet
|
# ENABLED: Using existing credentials from rabbitmq-secrets
|
||||||
# rabbitmq:
|
rabbitmq:
|
||||||
# endpoint: http://rabbitmq-service.bakery-ia:15672
|
endpoint: http://rabbitmq-service.bakery-ia:15672
|
||||||
# username: ${RABBITMQ_USER}
|
username: ${env:RABBITMQ_USER}
|
||||||
# password: ${RABBITMQ_PASSWORD}
|
password: ${env:RABBITMQ_PASSWORD}
|
||||||
# collection_interval: 60s
|
collection_interval: 30s
|
||||||
|
|
||||||
processors:
|
processors:
|
||||||
# Batch processor for better performance (optimized for high throughput)
|
# Batch processor for better performance (optimized for high throughput)
|
||||||
@@ -363,7 +636,14 @@ otelCollector:
|
|||||||
|
|
||||||
# Metrics pipeline
|
# Metrics pipeline
|
||||||
metrics:
|
metrics:
|
||||||
receivers: [otlp] # Database/cache receivers disabled until credentials configured
|
receivers: [otlp,
|
||||||
|
postgresql/auth, postgresql/inventory, postgresql/orders,
|
||||||
|
postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution,
|
||||||
|
postgresql/external, postgresql/forecasting, postgresql/notification,
|
||||||
|
postgresql/orchestrator, postgresql/pos, postgresql/procurement,
|
||||||
|
postgresql/production, postgresql/recipes, postgresql/sales,
|
||||||
|
postgresql/suppliers, postgresql/tenant, postgresql/training,
|
||||||
|
redis, rabbitmq]
|
||||||
processors: [memory_limiter, batch, resourcedetection]
|
processors: [memory_limiter, batch, resourcedetection]
|
||||||
exporters: [signozclickhousemetrics]
|
exporters: [signozclickhousemetrics]
|
||||||
|
|
||||||
|
|||||||
@@ -390,6 +390,7 @@ data:
|
|||||||
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
|
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc"
|
||||||
OTEL_SERVICE_NAME: "bakery-ia"
|
OTEL_SERVICE_NAME: "bakery-ia"
|
||||||
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
|
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
|
||||||
|
OTEL_LOGS_EXPORTER: "otlp"
|
||||||
|
|
||||||
# SigNoz Endpoints (v0.106.0+ unified service)
|
# SigNoz Endpoints (v0.106.0+ unified service)
|
||||||
SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
|
SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080"
|
||||||
|
|||||||
@@ -8,6 +8,26 @@ metadata:
|
|||||||
app.kubernetes.io/part-of: bakery-ia
|
app.kubernetes.io/part-of: bakery-ia
|
||||||
data:
|
data:
|
||||||
init.sql: |
|
init.sql: |
|
||||||
|
-- Create required extensions
|
||||||
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||||||
CREATE EXTENSION IF NOT EXISTS "pg_stat_statements";
|
CREATE EXTENSION IF NOT EXISTS "pg_stat_statements";
|
||||||
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
|
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
|
||||||
|
|
||||||
|
-- Create monitoring user for SigNoz metrics collection
|
||||||
|
-- This user will be created only if it doesn't already exist
|
||||||
|
DO $$
|
||||||
|
BEGIN
|
||||||
|
IF NOT EXISTS (SELECT FROM pg_catalog.pg_user WHERE usename = 'monitoring') THEN
|
||||||
|
CREATE USER monitoring WITH PASSWORD 'monitoring_369f9c001f242b07ef9e2826e17169ca';
|
||||||
|
GRANT pg_monitor TO monitoring;
|
||||||
|
GRANT SELECT ON pg_stat_database TO monitoring;
|
||||||
|
RAISE NOTICE 'Created monitoring user for SigNoz metrics collection';
|
||||||
|
ELSE
|
||||||
|
-- User already exists, ensure it has the correct password and permissions
|
||||||
|
ALTER USER monitoring WITH PASSWORD 'monitoring_369f9c001f242b07ef9e2826e17169ca';
|
||||||
|
GRANT pg_monitor TO monitoring;
|
||||||
|
GRANT SELECT ON pg_stat_database TO monitoring;
|
||||||
|
RAISE NOTICE 'Updated monitoring user permissions for SigNoz metrics collection';
|
||||||
|
END IF;
|
||||||
|
END $$
|
||||||
|
;
|
||||||
@@ -71,6 +71,10 @@ data:
|
|||||||
AI_INSIGHTS_DATABASE_URL: cG9zdGdyZXNxbCthc3luY3BnOi8vYWlfaW5zaWdodHNfdXNlcjphaV9pbnNpZ2h0c19wYXNzMTIzQGFpLWluc2lnaHRzLWRiLXNlcnZpY2U6NTQzMi9haV9pbnNpZ2h0c19kYg== # postgresql+asyncpg://ai_insights_user:ai_insights_pass123@ai-insights-db-service:5432/ai_insights_db
|
AI_INSIGHTS_DATABASE_URL: cG9zdGdyZXNxbCthc3luY3BnOi8vYWlfaW5zaWdodHNfdXNlcjphaV9pbnNpZ2h0c19wYXNzMTIzQGFpLWluc2lnaHRzLWRiLXNlcnZpY2U6NTQzMi9haV9pbnNpZ2h0c19kYg== # postgresql+asyncpg://ai_insights_user:ai_insights_pass123@ai-insights-db-service:5432/ai_insights_db
|
||||||
DISTRIBUTION_DATABASE_URL: cG9zdGdyZXNxbCthc3luY3BnOi8vZGlzdHJpYnV0aW9uX3VzZXI6ZGlzdHJpYnV0aW9uX3Bhc3MxMjNAZGlzdHJpYnV0aW9uLWRiLXNlcnZpY2U6NTQzMi9kaXN0cmlidXRpb25fZGI= # postgresql+asyncpg://distribution_user:distribution_pass123@distribution-db-service:5432/distribution_db
|
DISTRIBUTION_DATABASE_URL: cG9zdGdyZXNxbCthc3luY3BnOi8vZGlzdHJpYnV0aW9uX3VzZXI6ZGlzdHJpYnV0aW9uX3Bhc3MxMjNAZGlzdHJpYnV0aW9uLWRiLXNlcnZpY2U6NTQzMi9kaXN0cmlidXRpb25fZGI= # postgresql+asyncpg://distribution_user:distribution_pass123@distribution-db-service:5432/distribution_db
|
||||||
|
|
||||||
|
# PostgreSQL Monitoring User (for SigNoz metrics collection)
|
||||||
|
POSTGRES_MONITOR_USER: bW9uaXRvcmluZw== # monitoring
|
||||||
|
POSTGRES_MONITOR_PASSWORD: bW9uaXRvcmluZ18zNjlmOWMwMDFmMjQyYjA3ZWY5ZTI4MjZlMTcxNjljYQ== # monitoring_369f9c001f242b07ef9e2826e17169ca
|
||||||
|
|
||||||
# Redis URL
|
# Redis URL
|
||||||
REDIS_URL: cmVkaXM6Ly86T3hkbWRKamRWTlhwMzdNTkMySUZvTW5UcGZHR0Z2MWtAcmVkaXMtc2VydmljZTo2Mzc5LzA= # redis://:OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k@redis-service:6379/0
|
REDIS_URL: cmVkaXM6Ly86T3hkbWRKamRWTlhwMzdNTkMySUZvTW5UcGZHR0Z2MWtAcmVkaXMtc2VydmljZTo2Mzc5LzA= # redis://:OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k@redis-service:6379/0
|
||||||
|
|
||||||
|
|||||||
@@ -1,145 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Create monitoring users in all PostgreSQL databases for SigNoz metrics collection
|
|
||||||
#
|
|
||||||
# This script creates a 'monitoring' user with pg_monitor role in each PostgreSQL database
|
|
||||||
# Based on: https://signoz.io/docs/integrations/postgresql/
|
|
||||||
#
|
|
||||||
# Usage: ./create-pg-monitoring-users.sh
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
NAMESPACE="bakery-ia"
|
|
||||||
MONITORING_USER="monitoring"
|
|
||||||
MONITORING_PASSWORD="monitoring_$(openssl rand -hex 16)"
|
|
||||||
|
|
||||||
# List of all PostgreSQL database deployments
|
|
||||||
DATABASES=(
|
|
||||||
"auth-db"
|
|
||||||
"inventory-db"
|
|
||||||
"orders-db"
|
|
||||||
"ai-insights-db"
|
|
||||||
"alert-processor-db"
|
|
||||||
"demo-session-db"
|
|
||||||
"distribution-db"
|
|
||||||
"external-db"
|
|
||||||
"forecasting-db"
|
|
||||||
"notification-db"
|
|
||||||
"orchestrator-db"
|
|
||||||
"pos-db"
|
|
||||||
"procurement-db"
|
|
||||||
"production-db"
|
|
||||||
"recipes-db"
|
|
||||||
"sales-db"
|
|
||||||
"suppliers-db"
|
|
||||||
"tenant-db"
|
|
||||||
"training-db"
|
|
||||||
)
|
|
||||||
|
|
||||||
echo "=================================================="
|
|
||||||
echo "PostgreSQL Monitoring User Setup for SigNoz"
|
|
||||||
echo "=================================================="
|
|
||||||
echo ""
|
|
||||||
echo "This script will create a monitoring user in all PostgreSQL databases"
|
|
||||||
echo "User: $MONITORING_USER"
|
|
||||||
echo "Password: $MONITORING_PASSWORD"
|
|
||||||
echo ""
|
|
||||||
echo "IMPORTANT: Save this password! You'll need it for SigNoz configuration."
|
|
||||||
echo ""
|
|
||||||
read -p "Press Enter to continue or Ctrl+C to cancel..."
|
|
||||||
|
|
||||||
SUCCESS_COUNT=0
|
|
||||||
FAILED_COUNT=0
|
|
||||||
FAILED_DBS=()
|
|
||||||
|
|
||||||
for db in "${DATABASES[@]}"; do
|
|
||||||
echo ""
|
|
||||||
echo "Processing: $db"
|
|
||||||
echo "---"
|
|
||||||
|
|
||||||
# Create monitoring user with pg_monitor role (PostgreSQL 10+)
|
|
||||||
if kubectl exec -n $NAMESPACE deployment/$db -- psql -U postgres -c "
|
|
||||||
DO \$\$
|
|
||||||
BEGIN
|
|
||||||
-- Try to create the user
|
|
||||||
CREATE USER $MONITORING_USER WITH PASSWORD '$MONITORING_PASSWORD';
|
|
||||||
RAISE NOTICE 'User created successfully';
|
|
||||||
EXCEPTION
|
|
||||||
WHEN duplicate_object THEN
|
|
||||||
-- User already exists, update password
|
|
||||||
ALTER USER $MONITORING_USER WITH PASSWORD '$MONITORING_PASSWORD';
|
|
||||||
RAISE NOTICE 'User already exists, password updated';
|
|
||||||
END
|
|
||||||
\$\$;
|
|
||||||
|
|
||||||
-- Grant pg_monitor role (PostgreSQL 10+)
|
|
||||||
GRANT pg_monitor TO $MONITORING_USER;
|
|
||||||
|
|
||||||
-- Grant SELECT on pg_stat_database
|
|
||||||
GRANT SELECT ON pg_stat_database TO $MONITORING_USER;
|
|
||||||
|
|
||||||
-- Verify permissions
|
|
||||||
SELECT
|
|
||||||
r.rolname as role_name,
|
|
||||||
ARRAY_AGG(b.rolname) as granted_roles
|
|
||||||
FROM pg_auth_members m
|
|
||||||
JOIN pg_roles r ON (m.member = r.oid)
|
|
||||||
JOIN pg_roles b ON (m.roleid = b.oid)
|
|
||||||
WHERE r.rolname = '$MONITORING_USER'
|
|
||||||
GROUP BY r.rolname;
|
|
||||||
" 2>&1; then
|
|
||||||
echo "✅ SUCCESS: $db"
|
|
||||||
((SUCCESS_COUNT++))
|
|
||||||
else
|
|
||||||
echo "❌ FAILED: $db"
|
|
||||||
((FAILED_COUNT++))
|
|
||||||
FAILED_DBS+=("$db")
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "=================================================="
|
|
||||||
echo "Summary"
|
|
||||||
echo "=================================================="
|
|
||||||
echo "Successful: $SUCCESS_COUNT databases"
|
|
||||||
echo "Failed: $FAILED_COUNT databases"
|
|
||||||
|
|
||||||
if [ $FAILED_COUNT -gt 0 ]; then
|
|
||||||
echo ""
|
|
||||||
echo "Failed databases:"
|
|
||||||
for db in "${FAILED_DBS[@]}"; do
|
|
||||||
echo " - $db"
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo ""
|
|
||||||
echo "=================================================="
|
|
||||||
echo "Next Steps"
|
|
||||||
echo "=================================================="
|
|
||||||
echo ""
|
|
||||||
echo "1. Create Kubernetes secret with monitoring credentials:"
|
|
||||||
echo ""
|
|
||||||
echo "kubectl create secret generic -n $NAMESPACE postgres-monitoring-secrets \\"
|
|
||||||
echo " --from-literal=POSTGRES_MONITOR_USER=$MONITORING_USER \\"
|
|
||||||
echo " --from-literal=POSTGRES_MONITOR_PASSWORD='$MONITORING_PASSWORD'"
|
|
||||||
echo ""
|
|
||||||
echo "2. Update infrastructure/helm/signoz-values-dev.yaml with PostgreSQL receivers"
|
|
||||||
echo ""
|
|
||||||
echo "3. Add environment variables to otelCollector configuration"
|
|
||||||
echo ""
|
|
||||||
echo "4. Run: helm upgrade signoz signoz/signoz -n $NAMESPACE -f infrastructure/helm/signoz-values-dev.yaml"
|
|
||||||
echo ""
|
|
||||||
echo "5. Apply OpAMP patch:"
|
|
||||||
echo ""
|
|
||||||
echo "kubectl patch deployment -n $NAMESPACE signoz-otel-collector --type=json -p='["
|
|
||||||
echo " {\"op\":\"replace\",\"path\":\"/spec/template/spec/containers/0/args\",\"value\":["
|
|
||||||
echo " \"--config=/conf/otel-collector-config.yaml\","
|
|
||||||
echo " \"--feature-gates=-pkg.translator.prometheus.NormalizeName\""
|
|
||||||
echo " ]}"
|
|
||||||
echo "]'"
|
|
||||||
echo ""
|
|
||||||
echo "=================================================="
|
|
||||||
echo "SAVE THIS INFORMATION!"
|
|
||||||
echo "=================================================="
|
|
||||||
echo "Username: $MONITORING_USER"
|
|
||||||
echo "Password: $MONITORING_PASSWORD"
|
|
||||||
echo "=================================================="
|
|
||||||
190
infrastructure/signoz/dashboards/README.md
Normal file
190
infrastructure/signoz/dashboards/README.md
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
# SigNoz Dashboards for Bakery IA
|
||||||
|
|
||||||
|
This directory contains comprehensive SigNoz dashboard configurations for monitoring the Bakery IA system.
|
||||||
|
|
||||||
|
## Available Dashboards
|
||||||
|
|
||||||
|
### 1. Infrastructure Monitoring
|
||||||
|
- **File**: `infrastructure-monitoring.json`
|
||||||
|
- **Purpose**: Monitor Kubernetes infrastructure, pod health, and resource utilization
|
||||||
|
- **Key Metrics**: CPU usage, memory usage, network traffic, pod status, container health
|
||||||
|
|
||||||
|
### 2. Application Performance
|
||||||
|
- **File**: `application-performance.json`
|
||||||
|
- **Purpose**: Monitor microservice performance and API metrics
|
||||||
|
- **Key Metrics**: Request rate, error rate, latency percentiles, endpoint performance
|
||||||
|
|
||||||
|
### 3. Database Performance
|
||||||
|
- **File**: `database-performance.json`
|
||||||
|
- **Purpose**: Monitor PostgreSQL and Redis database performance
|
||||||
|
- **Key Metrics**: Connections, query execution time, cache hit ratio, locks, replication status
|
||||||
|
|
||||||
|
### 4. API Performance
|
||||||
|
- **File**: `api-performance.json`
|
||||||
|
- **Purpose**: Monitor REST and GraphQL API performance
|
||||||
|
- **Key Metrics**: Request volume, response times, status codes, endpoint analysis
|
||||||
|
|
||||||
|
### 5. Error Tracking
|
||||||
|
- **File**: `error-tracking.json`
|
||||||
|
- **Purpose**: Track and analyze system errors
|
||||||
|
- **Key Metrics**: Error rates, error distribution, recent errors, HTTP errors, database errors
|
||||||
|
|
||||||
|
### 6. User Activity
|
||||||
|
- **File**: `user-activity.json`
|
||||||
|
- **Purpose**: Monitor user behavior and activity patterns
|
||||||
|
- **Key Metrics**: Active users, sessions, API calls per user, session duration
|
||||||
|
|
||||||
|
### 7. System Health
|
||||||
|
- **File**: `system-health.json`
|
||||||
|
- **Purpose**: Overall system health monitoring
|
||||||
|
- **Key Metrics**: Availability, health scores, resource utilization, service status
|
||||||
|
|
||||||
|
### 8. Alert Management
|
||||||
|
- **File**: `alert-management.json`
|
||||||
|
- **Purpose**: Monitor and manage system alerts
|
||||||
|
- **Key Metrics**: Active alerts, alert rates, alert distribution, firing alerts
|
||||||
|
|
||||||
|
### 9. Log Analysis
|
||||||
|
- **File**: `log-analysis.json`
|
||||||
|
- **Purpose**: Search and analyze system logs
|
||||||
|
- **Key Metrics**: Log volume, error logs, log distribution, log search
|
||||||
|
|
||||||
|
## How to Import Dashboards
|
||||||
|
|
||||||
|
### Method 1: Using SigNoz UI
|
||||||
|
|
||||||
|
1. **Access SigNoz UI**: Open your SigNoz instance in a web browser
|
||||||
|
2. **Navigate to Dashboards**: Go to the "Dashboards" section
|
||||||
|
3. **Import Dashboard**: Click on "Import Dashboard" button
|
||||||
|
4. **Upload JSON**: Select the JSON file from this directory
|
||||||
|
5. **Configure**: Adjust any variables or settings as needed
|
||||||
|
6. **Save**: Save the imported dashboard
|
||||||
|
|
||||||
|
**Note**: The dashboards now use the correct SigNoz JSON schema with proper filter arrays.
|
||||||
|
|
||||||
|
### Method 2: Using SigNoz API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Import a single dashboard
|
||||||
|
curl -X POST "http://<SIGNOZ_HOST>:3301/api/v1/dashboards/import" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer <API_KEY>" \
|
||||||
|
-d @infrastructure-monitoring.json
|
||||||
|
|
||||||
|
# Import all dashboards
|
||||||
|
for file in *.json; do
|
||||||
|
curl -X POST "http://<SIGNOZ_HOST>:3301/api/v1/dashboards/import" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "Authorization: Bearer <API_KEY>" \
|
||||||
|
-d @"$file"
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
### Method 3: Using Kubernetes ConfigMap
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Create a ConfigMap with all dashboards
|
||||||
|
kubectl create configmap signoz-dashboards \
|
||||||
|
--from-file=infrastructure-monitoring.json \
|
||||||
|
--from-file=application-performance.json \
|
||||||
|
--from-file=database-performance.json \
|
||||||
|
--from-file=api-performance.json \
|
||||||
|
--from-file=error-tracking.json \
|
||||||
|
--from-file=user-activity.json \
|
||||||
|
--from-file=system-health.json \
|
||||||
|
--from-file=alert-management.json \
|
||||||
|
--from-file=log-analysis.json \
|
||||||
|
-n signoz
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dashboard Variables
|
||||||
|
|
||||||
|
Most dashboards include variables that allow you to filter and customize the view:
|
||||||
|
|
||||||
|
- **Namespace**: Filter by Kubernetes namespace (e.g., `bakery-ia`, `default`)
|
||||||
|
- **Service**: Filter by specific microservice
|
||||||
|
- **Severity**: Filter by error/alert severity
|
||||||
|
- **Environment**: Filter by deployment environment
|
||||||
|
- **Time Range**: Adjust the time window for analysis
|
||||||
|
|
||||||
|
## Metrics Reference
|
||||||
|
|
||||||
|
The dashboards use standard OpenTelemetry metrics. If you need to add custom metrics, ensure they are properly instrumented in your services.
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Dashboard Import Errors
|
||||||
|
|
||||||
|
If you encounter errors when importing dashboards:
|
||||||
|
|
||||||
|
1. **Validate JSON**: Ensure the JSON files are valid
|
||||||
|
```bash
|
||||||
|
jq . infrastructure-monitoring.json
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Check Metrics**: Verify that the metrics exist in your SigNoz instance
|
||||||
|
|
||||||
|
3. **Adjust Time Range**: Try different time ranges if no data appears
|
||||||
|
|
||||||
|
4. **Check Filters**: Ensure filters match your actual service names and tags
|
||||||
|
|
||||||
|
### "e.filter is not a function" Error
|
||||||
|
|
||||||
|
This error occurs when the dashboard JSON uses an incorrect filter format. The fix has been applied:
|
||||||
|
|
||||||
|
**Before (incorrect)**:
|
||||||
|
```json
|
||||||
|
"filters": {
|
||||||
|
"namespace": "${namespace}"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**After (correct)**:
|
||||||
|
```json
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "namespace",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${namespace}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
All dashboards in this directory now use the correct array format for filters.
|
||||||
|
|
||||||
|
### Missing Data
|
||||||
|
|
||||||
|
If dashboards show no data:
|
||||||
|
|
||||||
|
1. **Verify Instrumentation**: Ensure your services are properly instrumented with OpenTelemetry
|
||||||
|
2. **Check Time Range**: Adjust the time range to include recent data
|
||||||
|
3. **Validate Metrics**: Confirm the metrics are being collected and stored
|
||||||
|
4. **Review Filters**: Check that filters match your actual deployment
|
||||||
|
|
||||||
|
## Customization
|
||||||
|
|
||||||
|
You can customize these dashboards by:
|
||||||
|
|
||||||
|
1. **Editing JSON**: Modify the JSON files to add/remove panels or adjust queries
|
||||||
|
2. **Cloning in UI**: Clone existing dashboards and modify them in the SigNoz UI
|
||||||
|
3. **Adding Variables**: Add new variables for additional filtering options
|
||||||
|
4. **Adjusting Layout**: Change the grid layout and panel sizes
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Regular Reviews**: Review dashboards regularly to ensure they meet your monitoring needs
|
||||||
|
2. **Alert Integration**: Set up alerts based on key metrics shown in these dashboards
|
||||||
|
3. **Team Access**: Share relevant dashboards with appropriate team members
|
||||||
|
4. **Documentation**: Document any custom metrics or specific monitoring requirements
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
For issues with these dashboards:
|
||||||
|
|
||||||
|
1. Check the [SigNoz documentation](https://signoz.io/docs/)
|
||||||
|
2. Review the [Bakery IA monitoring guide](../SIGNOZ_COMPLETE_CONFIGURATION_GUIDE.md)
|
||||||
|
3. Consult the OpenTelemetry metrics specification
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
These dashboard configurations are provided under the same license as the Bakery IA project.
|
||||||
104
infrastructure/signoz/dashboards/alert-management.json
Normal file
104
infrastructure/signoz/dashboards/alert-management.json
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"title": "Bakery IA - Alert Management",
|
||||||
|
"description": "Alert monitoring and management dashboard",
|
||||||
|
"tags": ["alerts", "monitoring", "management"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Active Alerts",
|
||||||
|
"type": "stat",
|
||||||
|
"query": {
|
||||||
|
"metric": "alerts_active",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "severity",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${severity}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "status",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "firing"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Alert Rate",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "alerts_total",
|
||||||
|
"aggregate": "rate",
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "severity",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${severity}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "alerts/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Alerts by Severity",
|
||||||
|
"type": "pie",
|
||||||
|
"query": {
|
||||||
|
"metric": "alerts_total",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["severity"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "severity",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${severity}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Alerts by Status",
|
||||||
|
"type": "pie",
|
||||||
|
"query": {
|
||||||
|
"metric": "alerts_total",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["status"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "status",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${status}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "severity",
|
||||||
|
"label": "Severity",
|
||||||
|
"type": "dropdown",
|
||||||
|
"default": "*",
|
||||||
|
"values": ["*", "critical", "high", "medium", "low"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "status",
|
||||||
|
"label": "Status",
|
||||||
|
"type": "dropdown",
|
||||||
|
"default": "*",
|
||||||
|
"values": ["*", "firing", "resolved", "acknowledged"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"layout": {
|
||||||
|
"type": "grid",
|
||||||
|
"columns": 12,
|
||||||
|
"gap": [16, 16]
|
||||||
|
},
|
||||||
|
"refresh": "15s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
102
infrastructure/signoz/dashboards/api-performance.json
Normal file
102
infrastructure/signoz/dashboards/api-performance.json
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"title": "Bakery IA - API Performance",
|
||||||
|
"description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints",
|
||||||
|
"tags": ["api", "performance", "rest", "graphql"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Request Volume",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["api"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "api",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${api}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "req/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Error Rate",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["api", "status"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "api",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${api}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "status",
|
||||||
|
"operator": "=~",
|
||||||
|
"value": "5.."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "req/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Average Response Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_sum",
|
||||||
|
"aggregate": "avg",
|
||||||
|
"groupBy": ["api", "endpoint"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "api",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${api}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "seconds"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "P95 Latency",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_bucket",
|
||||||
|
"aggregate": "histogram_quantile",
|
||||||
|
"quantile": 0.95,
|
||||||
|
"groupBy": ["api", "endpoint"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "api",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${api}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "seconds"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "api",
|
||||||
|
"label": "API Service",
|
||||||
|
"type": "dropdown",
|
||||||
|
"default": "*",
|
||||||
|
"values": ["*", "gateway-api", "auth-api", "inventory-api", "production-api", "forecasting-api", "procurement-api"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"layout": {
|
||||||
|
"type": "grid",
|
||||||
|
"columns": 12,
|
||||||
|
"gap": [16, 16]
|
||||||
|
},
|
||||||
|
"refresh": "15s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
101
infrastructure/signoz/dashboards/application-performance.json
Normal file
101
infrastructure/signoz/dashboards/application-performance.json
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"title": "Bakery IA - Application Performance",
|
||||||
|
"description": "Application performance monitoring dashboard for Bakery IA microservices",
|
||||||
|
"tags": ["application", "performance", "apm"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Request Rate",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["service"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "req/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Error Rate",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["service", "status"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "status",
|
||||||
|
"operator": "=~",
|
||||||
|
"value": "5.."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "req/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Average Response Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_sum",
|
||||||
|
"aggregate": "avg",
|
||||||
|
"groupBy": ["service"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "seconds"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Throughput",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "rate",
|
||||||
|
"groupBy": ["service"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "req/s"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "service",
|
||||||
|
"label": "Service",
|
||||||
|
"type": "dropdown",
|
||||||
|
"default": "*",
|
||||||
|
"values": ["*", "auth-service", "gateway-service", "forecasting-service", "inventory-service", "production-service", "procurement-service"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"layout": {
|
||||||
|
"type": "grid",
|
||||||
|
"columns": 12,
|
||||||
|
"gap": [16, 16]
|
||||||
|
},
|
||||||
|
"refresh": "15s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-30m",
|
||||||
|
"to": "now"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
101
infrastructure/signoz/dashboards/database-performance.json
Normal file
101
infrastructure/signoz/dashboards/database-performance.json
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"title": "Bakery IA - Database Performance",
|
||||||
|
"description": "Comprehensive database performance monitoring for PostgreSQL and Redis",
|
||||||
|
"tags": ["database", "postgresql", "redis", "performance"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Database Connections",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "pg_stat_activity_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["datname"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "datname",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${database}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Active Queries",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "pg_stat_activity_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["datname"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "datname",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${database}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "state",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "active"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Database Size",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "pg_database_size_bytes",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["datname"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "datname",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${database}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "bytes"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Query Execution Time",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "pg_stat_statements_total_time",
|
||||||
|
"aggregate": "avg",
|
||||||
|
"groupBy": ["datname"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "datname",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${database}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "seconds"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "database",
|
||||||
|
"label": "Database",
|
||||||
|
"type": "dropdown",
|
||||||
|
"default": "*",
|
||||||
|
"values": ["*", "postgresql", "redis"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"layout": {
|
||||||
|
"type": "grid",
|
||||||
|
"columns": 12,
|
||||||
|
"gap": [16, 16]
|
||||||
|
},
|
||||||
|
"refresh": "30s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
105
infrastructure/signoz/dashboards/error-tracking.json
Normal file
105
infrastructure/signoz/dashboards/error-tracking.json
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
{
|
||||||
|
"dashboard": {
|
||||||
|
"title": "Bakery IA - Error Tracking",
|
||||||
|
"description": "Comprehensive error tracking and analysis dashboard",
|
||||||
|
"tags": ["errors", "exceptions", "tracking"],
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"title": "Total Errors",
|
||||||
|
"type": "stat",
|
||||||
|
"query": {
|
||||||
|
"metric": "error_total",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "Error Rate",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "error_total",
|
||||||
|
"aggregate": "rate",
|
||||||
|
"groupBy": ["service"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "errors/s"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "HTTP 5xx Errors",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["service", "status"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "status",
|
||||||
|
"operator": "=~",
|
||||||
|
"value": "5.."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "number"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"title": "HTTP 4xx Errors",
|
||||||
|
"type": "timeseries",
|
||||||
|
"query": {
|
||||||
|
"metric": "http_server_requests_seconds_count",
|
||||||
|
"aggregate": "sum",
|
||||||
|
"groupBy": ["service", "status"],
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"key": "service",
|
||||||
|
"operator": "=",
|
||||||
|
"value": "${service}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"key": "status",
|
||||||
|
"operator": "=~",
|
||||||
|
"value": "4.."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"unit": "number"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"variables": [
|
||||||
|
{
|
||||||
|
"name": "service",
|
||||||
|
"label": "Service",
|
||||||
|
"type": "dropdown",
|
||||||
|
"default": "*",
|
||||||
|
"values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"layout": {
|
||||||
|
"type": "grid",
|
||||||
|
"columns": 12,
|
||||||
|
"gap": [16, 16]
|
||||||
|
},
|
||||||
|
"refresh": "15s",
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
213
infrastructure/signoz/dashboards/index.json
Normal file
213
infrastructure/signoz/dashboards/index.json
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
{
|
||||||
|
"name": "Bakery IA Dashboard Collection",
|
||||||
|
"description": "Complete set of SigNoz dashboards for Bakery IA monitoring",
|
||||||
|
"version": "1.0.0",
|
||||||
|
"author": "Bakery IA Team",
|
||||||
|
"license": "MIT",
|
||||||
|
"dashboards": [
|
||||||
|
{
|
||||||
|
"id": "infrastructure-monitoring",
|
||||||
|
"name": "Infrastructure Monitoring",
|
||||||
|
"description": "Kubernetes infrastructure and resource monitoring",
|
||||||
|
"file": "infrastructure-monitoring.json",
|
||||||
|
"tags": ["infrastructure", "kubernetes", "system"],
|
||||||
|
"category": "infrastructure"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "application-performance",
|
||||||
|
"name": "Application Performance",
|
||||||
|
"description": "Microservice performance and API metrics",
|
||||||
|
"file": "application-performance.json",
|
||||||
|
"tags": ["application", "performance", "apm"],
|
||||||
|
"category": "performance"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "database-performance",
|
||||||
|
"name": "Database Performance",
|
||||||
|
"description": "PostgreSQL and Redis database monitoring",
|
||||||
|
"file": "database-performance.json",
|
||||||
|
"tags": ["database", "postgresql", "redis"],
|
||||||
|
"category": "database"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "api-performance",
|
||||||
|
"name": "API Performance",
|
||||||
|
"description": "REST and GraphQL API performance monitoring",
|
||||||
|
"file": "api-performance.json",
|
||||||
|
"tags": ["api", "rest", "graphql"],
|
||||||
|
"category": "api"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "error-tracking",
|
||||||
|
"name": "Error Tracking",
|
||||||
|
"description": "System error tracking and analysis",
|
||||||
|
"file": "error-tracking.json",
|
||||||
|
"tags": ["errors", "exceptions", "tracking"],
|
||||||
|
"category": "monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "user-activity",
|
||||||
|
"name": "User Activity",
|
||||||
|
"description": "User behavior and activity monitoring",
|
||||||
|
"file": "user-activity.json",
|
||||||
|
"tags": ["user", "activity", "behavior"],
|
||||||
|
"category": "user"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "system-health",
|
||||||
|
"name": "System Health",
|
||||||
|
"description": "Overall system health monitoring",
|
||||||
|
"file": "system-health.json",
|
||||||
|
"tags": ["system", "health", "overview"],
|
||||||
|
"category": "overview"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "alert-management",
|
||||||
|
"name": "Alert Management",
|
||||||
|
"description": "Alert monitoring and management",
|
||||||
|
"file": "alert-management.json",
|
||||||
|
"tags": ["alerts", "notifications", "management"],
|
||||||
|
"category": "alerts"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "log-analysis",
|
||||||
|
"name": "Log Analysis",
|
||||||
|
"description": "Log search and analysis",
|
||||||
|
"file": "log-analysis.json",
|
||||||
|
"tags": ["logs", "search", "analysis"],
|
||||||
|
"category": "logs"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"categories": [
|
||||||
|
{
|
||||||
|
"id": "infrastructure",
|
||||||
|
"name": "Infrastructure",
|
||||||
|
"description": "Kubernetes and system infrastructure monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "performance",
|
||||||
|
"name": "Performance",
|
||||||
|
"description": "Application and service performance monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "database",
|
||||||
|
"name": "Database",
|
||||||
|
"description": "Database performance and health monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "api",
|
||||||
|
"name": "API",
|
||||||
|
"description": "API performance and usage monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "monitoring",
|
||||||
|
"name": "Monitoring",
|
||||||
|
"description": "Error tracking and system monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "user",
|
||||||
|
"name": "User",
|
||||||
|
"description": "User activity and behavior monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "overview",
|
||||||
|
"name": "Overview",
|
||||||
|
"description": "System-wide overview and health dashboards"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "alerts",
|
||||||
|
"name": "Alerts",
|
||||||
|
"description": "Alert management and monitoring"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "logs",
|
||||||
|
"name": "Logs",
|
||||||
|
"description": "Log analysis and search"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"usage": {
|
||||||
|
"import_methods": [
|
||||||
|
"ui_import",
|
||||||
|
"api_import",
|
||||||
|
"kubernetes_configmap"
|
||||||
|
],
|
||||||
|
"recommended_import_order": [
|
||||||
|
"infrastructure-monitoring",
|
||||||
|
"system-health",
|
||||||
|
"application-performance",
|
||||||
|
"api-performance",
|
||||||
|
"database-performance",
|
||||||
|
"error-tracking",
|
||||||
|
"alert-management",
|
||||||
|
"log-analysis",
|
||||||
|
"user-activity"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"requirements": {
|
||||||
|
"signoz_version": ">= 0.10.0",
|
||||||
|
"opentelemetry_collector": ">= 0.45.0",
|
||||||
|
"metrics": [
|
||||||
|
"container_cpu_usage_seconds_total",
|
||||||
|
"container_memory_working_set_bytes",
|
||||||
|
"http_server_requests_seconds_count",
|
||||||
|
"http_server_requests_seconds_sum",
|
||||||
|
"pg_stat_activity_count",
|
||||||
|
"pg_stat_statements_total_time",
|
||||||
|
"error_total",
|
||||||
|
"alerts_total",
|
||||||
|
"kube_pod_status_phase",
|
||||||
|
"container_network_receive_bytes_total",
|
||||||
|
"kube_pod_container_status_restarts_total",
|
||||||
|
"kube_pod_container_status_ready",
|
||||||
|
"container_fs_reads_total",
|
||||||
|
"kube_pod_status_phase",
|
||||||
|
"kube_pod_container_status_restarts_total",
|
||||||
|
"kube_pod_container_status_ready",
|
||||||
|
"container_fs_reads_total",
|
||||||
|
"kubernetes_events",
|
||||||
|
"http_server_requests_seconds_bucket",
|
||||||
|
"http_server_active_requests",
|
||||||
|
"http_server_up",
|
||||||
|
"db_query_duration_seconds_sum",
|
||||||
|
"db_connections_active",
|
||||||
|
"http_client_request_duration_seconds_count",
|
||||||
|
"http_client_request_duration_seconds_sum",
|
||||||
|
"graphql_execution_time_seconds",
|
||||||
|
"graphql_errors_total",
|
||||||
|
"pg_stat_database_blks_hit",
|
||||||
|
"pg_stat_database_xact_commit",
|
||||||
|
"pg_locks_count",
|
||||||
|
"pg_table_size_bytes",
|
||||||
|
"pg_stat_user_tables_seq_scan",
|
||||||
|
"redis_memory_used_bytes",
|
||||||
|
"redis_commands_processed_total",
|
||||||
|
"redis_keyspace_hits",
|
||||||
|
"pg_stat_database_deadlocks",
|
||||||
|
"pg_stat_database_conn_errors",
|
||||||
|
"pg_replication_lag_bytes",
|
||||||
|
"pg_replication_is_replica",
|
||||||
|
"active_users",
|
||||||
|
"user_sessions_total",
|
||||||
|
"api_calls_per_user",
|
||||||
|
"session_duration_seconds",
|
||||||
|
"system_availability",
|
||||||
|
"service_health_score",
|
||||||
|
"system_cpu_usage",
|
||||||
|
"system_memory_usage",
|
||||||
|
"service_availability",
|
||||||
|
"alerts_active",
|
||||||
|
"alerts_total",
|
||||||
|
"log_lines_total"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"support": {
|
||||||
|
"documentation": "https://signoz.io/docs/",
|
||||||
|
"bakery_ia_docs": "../SIGNOZ_COMPLETE_CONFIGURATION_GUIDE.md",
|
||||||
|
"issues": "https://github.com/your-repo/issues"
|
||||||
|
},
|
||||||
|
"notes": {
|
||||||
|
"format_fix": "All dashboards have been updated to use the correct SigNoz JSON schema with proper filter arrays to resolve the 'e.filter is not a function' error.",
|
||||||
|
"compatibility": "Tested with SigNoz v0.10.0+ and OpenTelemetry Collector v0.45.0+",
|
||||||
|
"customization": "You can customize these dashboards by editing the JSON files or cloning them in the SigNoz UI"
|
||||||
|
}
|
||||||
|
}
|
||||||
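The format_fix note refers to panel queries needing a proper filters array. Before importing, a quick validation pass can confirm every panel carries one. This is a hedged sketch against the JSON layout used by the dashboard files in this commit (the script itself is illustrative, not part of the commit):

import json
import sys
from pathlib import Path

def check_filters(path: Path) -> list[str]:
    """Return titles of panels whose query lacks a 'filters' array."""
    doc = json.loads(path.read_text())
    panels = doc.get("dashboard", {}).get("panels", [])
    return [
        p.get("title", "<untitled>")
        for p in panels
        if not isinstance(p.get("query", {}).get("filters"), list)
    ]

if __name__ == "__main__":
    bad = False
    for f in sorted(Path("infrastructure/signoz/dashboards").glob("*.json")):
        missing = check_filters(f)
        if missing:
            bad = True
            print(f"{f.name}: panels without filter arrays: {missing}")
    sys.exit(1 if bad else 0)
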
105  infrastructure/signoz/dashboards/infrastructure-monitoring.json  Normal file
@@ -0,0 +1,105 @@
{
  "dashboard": {
    "title": "Bakery IA - Infrastructure Monitoring",
    "description": "Comprehensive infrastructure monitoring dashboard for Bakery IA system",
    "tags": ["infrastructure", "system", "kubernetes"],
    "panels": [
      {
        "title": "CPU Usage",
        "type": "timeseries",
        "query": {
          "metric": "container_cpu_usage_seconds_total",
          "aggregate": "sum",
          "groupBy": ["namespace"],
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "bakery-ia"
            }
          ]
        },
        "unit": "percent",
        "yAxis": {
          "min": 0,
          "max": 100
        }
      },
      {
        "title": "Memory Usage",
        "type": "timeseries",
        "query": {
          "metric": "container_memory_working_set_bytes",
          "aggregate": "sum",
          "groupBy": ["namespace"],
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "bakery-ia"
            }
          ]
        },
        "unit": "bytes"
      },
      {
        "title": "Network Traffic",
        "type": "timeseries",
        "query": {
          "metric": "container_network_receive_bytes_total",
          "aggregate": "sum",
          "groupBy": ["namespace"],
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "bakery-ia"
            }
          ]
        },
        "unit": "bytes"
      },
      {
        "title": "Pod Status",
        "type": "stat",
        "query": {
          "metric": "kube_pod_status_phase",
          "aggregate": "count",
          "groupBy": ["phase"],
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "bakery-ia"
            },
            {
              "key": "phase",
              "operator": "=",
              "value": "Running"
            }
          ]
        },
        "unit": "number"
      }
    ],
    "variables": [
      {
        "name": "namespace",
        "label": "Namespace",
        "type": "dropdown",
        "default": "bakery-ia",
        "values": ["bakery-ia", "default", "kube-system"]
      }
    ],
    "layout": {
      "type": "grid",
      "columns": 12,
      "gap": [16, 16]
    },
    "refresh": "30s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
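The same panel schema extends naturally to the other metrics listed in the requirements block. As an illustration only (the extra panel is hypothetical, not part of the committed file), a disk-reads panel built from container_fs_reads_total could be appended like this:

import json

# Hypothetical example: a "Disk Reads" panel following the same
# query schema (metric / aggregate / groupBy / filters) as the panels above.
disk_reads_panel = {
    "title": "Disk Reads",
    "type": "timeseries",
    "query": {
        "metric": "container_fs_reads_total",  # listed in the requirements metrics
        "aggregate": "sum",
        "groupBy": ["namespace"],
        "filters": [
            {"key": "namespace", "operator": "=", "value": "bakery-ia"},
        ],
    },
    "unit": "number",
}

path = "infrastructure/signoz/dashboards/infrastructure-monitoring.json"
with open(path) as fh:
    doc = json.load(fh)
doc["dashboard"]["panels"].append(disk_reads_panel)
with open(path, "w") as fh:
    json.dump(doc, fh, indent=2)
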
99  infrastructure/signoz/dashboards/log-analysis.json  Normal file
@@ -0,0 +1,99 @@
{
  "dashboard": {
    "title": "Bakery IA - Log Analysis",
    "description": "Comprehensive log analysis and search dashboard",
    "tags": ["logs", "analysis", "search"],
    "panels": [
      {
        "title": "Log Volume",
        "type": "timeseries",
        "query": {
          "metric": "log_lines_total",
          "aggregate": "sum",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        },
        "unit": "logs/s"
      },
      {
        "title": "Error Logs",
        "type": "timeseries",
        "query": {
          "metric": "log_lines_total",
          "aggregate": "sum",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            },
            {
              "key": "level",
              "operator": "=",
              "value": "error"
            }
          ]
        },
        "unit": "logs/s"
      },
      {
        "title": "Logs by Level",
        "type": "pie",
        "query": {
          "metric": "log_lines_total",
          "aggregate": "sum",
          "groupBy": ["level"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        }
      },
      {
        "title": "Logs by Service",
        "type": "pie",
        "query": {
          "metric": "log_lines_total",
          "aggregate": "sum",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        }
      }
    ],
    "variables": [
      {
        "name": "service",
        "label": "Service",
        "type": "dropdown",
        "default": "*",
        "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"]
      }
    ],
    "layout": {
      "type": "grid",
      "columns": 12,
      "gap": [16, 16]
    },
    "refresh": "30s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
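These panels group log records by service and level, so each service needs to emit logs carrying those attributes. A minimal sketch of what that can look like with structlog (the logger library already used by the services further down); the exact attribute names attached on the collector side are an assumption here:

import structlog

# Sketch: emit structured log events so the "Logs by Level" and
# "Logs by Service" panels have `level` and `service` attributes to group on.
structlog.configure(
    processors=[
        structlog.processors.add_log_level,           # adds "level"
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)

logger = structlog.get_logger().bind(service="inventory-service")
logger.info("stock_level_checked", sku="baguette-classic", quantity=42)
logger.error("stock_sync_failed", sku="croissant-butter", reason="timeout")
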
92  infrastructure/signoz/dashboards/system-health.json  Normal file
@@ -0,0 +1,92 @@
{
  "dashboard": {
    "title": "Bakery IA - System Health",
    "description": "Comprehensive system health monitoring dashboard",
    "tags": ["system", "health", "monitoring"],
    "panels": [
      {
        "title": "System Availability",
        "type": "stat",
        "query": {
          "metric": "system_availability",
          "aggregate": "avg",
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "${namespace}"
            }
          ]
        },
        "unit": "percent"
      },
      {
        "title": "Service Health Score",
        "type": "stat",
        "query": {
          "metric": "service_health_score",
          "aggregate": "avg",
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "${namespace}"
            }
          ]
        },
        "unit": "number"
      },
      {
        "title": "CPU Usage",
        "type": "timeseries",
        "query": {
          "metric": "system_cpu_usage",
          "aggregate": "avg",
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "${namespace}"
            }
          ]
        },
        "unit": "percent"
      },
      {
        "title": "Memory Usage",
        "type": "timeseries",
        "query": {
          "metric": "system_memory_usage",
          "aggregate": "avg",
          "filters": [
            {
              "key": "namespace",
              "operator": "=",
              "value": "${namespace}"
            }
          ]
        },
        "unit": "percent"
      }
    ],
    "variables": [
      {
        "name": "namespace",
        "label": "Namespace",
        "type": "dropdown",
        "default": "bakery-ia",
        "values": ["bakery-ia", "default"]
      }
    ],
    "layout": {
      "type": "grid",
      "columns": 12,
      "gap": [16, 16]
    },
    "refresh": "30s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
96  infrastructure/signoz/dashboards/user-activity.json  Normal file
@@ -0,0 +1,96 @@
{
  "dashboard": {
    "title": "Bakery IA - User Activity",
    "description": "User activity and behavior monitoring dashboard",
    "tags": ["user", "activity", "behavior"],
    "panels": [
      {
        "title": "Active Users",
        "type": "timeseries",
        "query": {
          "metric": "active_users",
          "aggregate": "sum",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        },
        "unit": "number"
      },
      {
        "title": "User Sessions",
        "type": "timeseries",
        "query": {
          "metric": "user_sessions_total",
          "aggregate": "sum",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        },
        "unit": "number"
      },
      {
        "title": "API Calls per User",
        "type": "timeseries",
        "query": {
          "metric": "api_calls_per_user",
          "aggregate": "avg",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        },
        "unit": "number"
      },
      {
        "title": "Session Duration",
        "type": "timeseries",
        "query": {
          "metric": "session_duration_seconds",
          "aggregate": "avg",
          "groupBy": ["service"],
          "filters": [
            {
              "key": "service",
              "operator": "=",
              "value": "${service}"
            }
          ]
        },
        "unit": "seconds"
      }
    ],
    "variables": [
      {
        "name": "service",
        "label": "Service",
        "type": "dropdown",
        "default": "*",
        "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service"]
      }
    ],
    "layout": {
      "type": "grid",
      "columns": 12,
      "gap": [16, 16]
    },
    "refresh": "30s",
    "time": {
      "from": "now-1h",
      "to": "now"
    }
  }
}
175  infrastructure/signoz/import-dashboards.sh  Executable file
@@ -0,0 +1,175 @@
#!/bin/bash

# SigNoz Dashboard Importer for Bakery IA
# This script imports all SigNoz dashboards into your SigNoz instance

# Configuration
SIGNOZ_HOST="localhost"
SIGNOZ_PORT="3301"
SIGNOZ_API_KEY=""  # Add your API key if authentication is required
DASHBOARDS_DIR="infrastructure/signoz/dashboards"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Function to display help
show_help() {
    echo "Usage: $0 [options]"
    echo ""
    echo "Options:
  -h, --host        SigNoz host (default: localhost)
  -p, --port        SigNoz port (default: 3301)
  -k, --api-key     SigNoz API key (if required)
  -d, --dir         Dashboards directory (default: infrastructure/signoz/dashboards)
  --help            Show this help message (-h is reserved for --host)"
    echo ""
    echo "Example:
  $0 --host signoz.example.com --port 3301 --api-key your-api-key"
}

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--host)
            SIGNOZ_HOST="$2"
            shift 2
            ;;
        -p|--port)
            SIGNOZ_PORT="$2"
            shift 2
            ;;
        -k|--api-key)
            SIGNOZ_API_KEY="$2"
            shift 2
            ;;
        -d|--dir)
            DASHBOARDS_DIR="$2"
            shift 2
            ;;
        --help)
            show_help
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            show_help
            exit 1
            ;;
    esac
done

# Check if dashboards directory exists
if [ ! -d "$DASHBOARDS_DIR" ]; then
    echo -e "${RED}Error: Dashboards directory not found: $DASHBOARDS_DIR${NC}"
    exit 1
fi

# Check if jq is installed for JSON validation
if ! command -v jq &> /dev/null; then
    echo -e "${YELLOW}Warning: jq not found. Skipping JSON validation.${NC}"
    VALIDATE_JSON=false
else
    VALIDATE_JSON=true
fi

# Function to validate JSON
validate_json() {
    local file="$1"
    if [ "$VALIDATE_JSON" = true ]; then
        if ! jq empty "$file" &> /dev/null; then
            echo -e "${RED}Error: Invalid JSON in file: $file${NC}"
            return 1
        fi
    fi
    return 0
}

# Function to import a single dashboard
import_dashboard() {
    local file="$1"
    local filename=$(basename "$file")
    # Dashboard files in this repo use .dashboard.title; fall back to .name, then "Unknown"
    local dashboard_name=$(jq -r '.dashboard.title // .name // "Unknown"' "$file" 2>/dev/null || echo "Unknown")

    echo -e "${BLUE}Importing dashboard: $dashboard_name ($filename)${NC}"

    # Prepare curl command
    local curl_cmd="curl -s -X POST http://$SIGNOZ_HOST:$SIGNOZ_PORT/api/v1/dashboards/import"

    if [ -n "$SIGNOZ_API_KEY" ]; then
        curl_cmd="$curl_cmd -H \"Authorization: Bearer $SIGNOZ_API_KEY\""
    fi

    curl_cmd="$curl_cmd -H \"Content-Type: application/json\" -d @\"$file\""

    # Execute import
    local response=$(eval "$curl_cmd")

    # Check response
    if echo "$response" | grep -q "success"; then
        echo -e "${GREEN}✓ Successfully imported: $dashboard_name${NC}"
        return 0
    else
        echo -e "${RED}✗ Failed to import: $dashboard_name${NC}"
        echo "Response: $response"
        return 1
    fi
}

# Main import process
echo -e "${YELLOW}=== SigNoz Dashboard Importer for Bakery IA ===${NC}"
echo -e "${BLUE}Configuration:${NC}"
echo "  Host: $SIGNOZ_HOST"
echo "  Port: $SIGNOZ_PORT"
echo "  Dashboards Directory: $DASHBOARDS_DIR"
if [ -n "$SIGNOZ_API_KEY" ]; then
    echo "  API Key: ******** (set)"
else
    echo "  API Key: Not configured"
fi
echo ""

# Count dashboards
DASHBOARD_COUNT=$(find "$DASHBOARDS_DIR" -name "*.json" | wc -l)
echo -e "${BLUE}Found $DASHBOARD_COUNT dashboards to import${NC}"
echo ""

# Import each dashboard
SUCCESS_COUNT=0
FAILURE_COUNT=0

for file in "$DASHBOARDS_DIR"/*.json; do
    if [ -f "$file" ]; then
        # Validate JSON
        if validate_json "$file"; then
            if import_dashboard "$file"; then
                ((SUCCESS_COUNT++))
            else
                ((FAILURE_COUNT++))
            fi
        else
            ((FAILURE_COUNT++))
        fi
        echo ""
    fi
done

# Summary
echo -e "${YELLOW}=== Import Summary ===${NC}"
echo -e "${GREEN}Successfully imported: $SUCCESS_COUNT dashboards${NC}"
if [ $FAILURE_COUNT -gt 0 ]; then
    echo -e "${RED}Failed to import: $FAILURE_COUNT dashboards${NC}"
fi
echo ""

if [ $FAILURE_COUNT -eq 0 ]; then
    echo -e "${GREEN}All dashboards imported successfully!${NC}"
    echo "You can now access them in your SigNoz UI at:"
    echo "http://$SIGNOZ_HOST:$SIGNOZ_PORT/dashboards"
else
    echo -e "${YELLOW}Some dashboards failed to import. Check the errors above.${NC}"
    exit 1
fi
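Where bash and curl are not convenient, the same import can be driven from Python. This is an illustrative sketch only: it reuses the /api/v1/dashboards/import endpoint and Bearer-token header from the script above, and the host, port, and API key values are placeholders to be replaced.

import json
import sys
from pathlib import Path

import requests  # assumed available (pip install requests)

SIGNOZ_URL = "http://localhost:3301"      # same default as the script above
API_KEY = ""                               # optional, as in the script
DASHBOARDS_DIR = Path("infrastructure/signoz/dashboards")

def import_dashboard(path: Path) -> bool:
    """POST one dashboard JSON to the import endpoint used by import-dashboards.sh."""
    headers = {"Content-Type": "application/json"}
    if API_KEY:
        headers["Authorization"] = f"Bearer {API_KEY}"
    payload = json.loads(path.read_text())
    resp = requests.post(f"{SIGNOZ_URL}/api/v1/dashboards/import",
                         headers=headers, json=payload, timeout=30)
    print(f"{path.name}: {resp.status_code}")
    return resp.ok

if __name__ == "__main__":
    results = [import_dashboard(p) for p in sorted(DASHBOARDS_DIR.glob("*.json"))]
    sys.exit(0 if all(results) else 1)
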
@@ -48,6 +48,22 @@ tracer_provider = setup_tracing("ai-insights")
setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO'))
logger = structlog.get_logger()

# Setup OpenTelemetry logging export if enabled
logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}")
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
    try:
        logger.info("Attempting to setup OpenTelemetry logging")
        from shared.monitoring.logs_exporter import setup_otel_logging
        result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION)
        if result:
            logger.info("OpenTelemetry logs export enabled for ai-insights")
        else:
            logger.warning("OpenTelemetry logs export setup returned None")
    except Exception as e:
        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
    logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")


@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -51,7 +51,24 @@ tracer_provider = setup_tracing("alert-processor")

# Setup logging
setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 'INFO'))

# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
    try:
        from shared.monitoring.logs_exporter import setup_otel_logging
        result = setup_otel_logging("alert-processor", settings.VERSION)
        if result:
            logger = structlog.get_logger()
            logger.info("OpenTelemetry logs export enabled for alert-processor")
        else:
            logger = structlog.get_logger()
            logger.warning("OpenTelemetry logs export setup returned None")
    except Exception as e:
        logger = structlog.get_logger()
        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
    logger = structlog.get_logger()
    logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")

# Global consumer instance
consumer: EventConsumer = None
@@ -49,7 +49,24 @@ tracer_provider = setup_tracing("demo-session")

# Setup logging
setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO'))

# Setup OpenTelemetry logging export if enabled
if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp":
    try:
        from shared.monitoring.logs_exporter import setup_otel_logging
        result = setup_otel_logging("demo-session", settings.VERSION)
        if result:
            logger = structlog.get_logger()
            logger.info("OpenTelemetry logs export enabled for demo-session")
        else:
            logger = structlog.get_logger()
            logger.warning("OpenTelemetry logs export setup returned None")
    except Exception as e:
        logger = structlog.get_logger()
        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
else:
    logger = structlog.get_logger()
    logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")

# Initialize database
db_manager = DatabaseManager()
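The enable-OTel-logs block above is repeated almost verbatim across ai-insights, alert-processor, and demo-session. A hedged sketch of how it could be factored into the shared monitoring package (the helper name maybe_enable_otel_logging is hypothetical, not part of this commit):

import os
import structlog

def maybe_enable_otel_logging(service_name: str, version: str) -> bool:
    """Enable OTLP log export when OTEL_LOGS_EXPORTER=otlp; mirrors the per-service blocks above."""
    logger = structlog.get_logger()
    if os.getenv("OTEL_LOGS_EXPORTER", "").lower() != "otlp":
        logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp")
        return False
    try:
        from shared.monitoring.logs_exporter import setup_otel_logging
        if setup_otel_logging(service_name, version):
            logger.info(f"OpenTelemetry logs export enabled for {service_name}")
            return True
        logger.warning("OpenTelemetry logs export setup returned None")
    except Exception as e:
        logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True)
    return False

# Usage in a service's main.py (hypothetical):
# maybe_enable_otel_logging("alert-processor", settings.VERSION)
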
@@ -65,12 +65,28 @@ def setup_otel_logging(
        return None

    # Get OTLP endpoint from environment or parameter
    # For logs, we need to use the HTTP endpoint (port 4318), not the gRPC endpoint (port 4317)
    if otel_endpoint is None:
        # Try logs-specific endpoint first, then fall back to general OTLP endpoint
        otel_endpoint = os.getenv(
            "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT",
            os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318")
        )

    logger.info(f"Original OTLP endpoint for logs: {otel_endpoint}")

    # If we got the tracing endpoint (4317), switch to logs endpoint (4318)
    if otel_endpoint.endswith(":4317"):
        logger.info("Converting tracing endpoint (4317) to logs endpoint (4318)")
        otel_endpoint = otel_endpoint.replace(":4317", ":4318")

    logger.info(f"Final OTLP endpoint for logs: {otel_endpoint}")

    # Ensure endpoint has proper protocol prefix
    if not otel_endpoint.startswith(("http://", "https://")):
        # Default to HTTP for insecure connections
        otel_endpoint = f"http://{otel_endpoint}"

    # Ensure endpoint has /v1/logs path for HTTP
    if not otel_endpoint.endswith("/v1/logs"):
        otel_endpoint = f"{otel_endpoint}/v1/logs"
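The endpoint handling above boils down to a small normalization step. A standalone sketch of that logic for reference (the function name normalize_logs_endpoint is illustrative, not part of the module):

def normalize_logs_endpoint(endpoint: str) -> str:
    """Normalize an OTLP endpoint for HTTP log export, mirroring the hunk above:
    gRPC port 4317 becomes HTTP port 4318, a scheme is added if missing,
    and the /v1/logs path is appended."""
    if endpoint.endswith(":4317"):
        endpoint = endpoint.replace(":4317", ":4318")
    if not endpoint.startswith(("http://", "https://")):
        endpoint = f"http://{endpoint}"
    if not endpoint.endswith("/v1/logs"):
        endpoint = f"{endpoint}/v1/logs"
    return endpoint

# Expected results under the rules above:
assert normalize_logs_endpoint("signoz-otel-collector.bakery-ia:4317") == \
    "http://signoz-otel-collector.bakery-ia:4318/v1/logs"
assert normalize_logs_endpoint("http://signoz-otel-collector.bakery-ia:4318") == \
    "http://signoz-otel-collector.bakery-ia:4318/v1/logs"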