diff --git a/infrastructure/helm/signoz-values-dev.yaml b/infrastructure/helm/signoz-values-dev.yaml index d870acd9..e3334d4f 100644 --- a/infrastructure/helm/signoz-values-dev.yaml +++ b/infrastructure/helm/signoz-values-dev.yaml @@ -48,6 +48,9 @@ signoz: signoz_traces_ttl_duration_hrs: "168" signoz_metrics_ttl_duration_hrs: "168" signoz_logs_ttl_duration_hrs: "168" + # OpAMP Server Configuration + signoz_opamp_server_enabled: "true" + signoz_opamp_server_endpoint: "0.0.0.0:4320" persistence: enabled: true @@ -146,9 +149,36 @@ otelCollector: repository: signoz/signoz-otel-collector tag: v0.129.12 # Latest recommended version - # NOTE: OpAMP is disabled via kubectl patch on the deployment - # Cannot disable via Helm values as extraArgs appends instead of replaces - # Patch command: kubectl patch deployment signoz-otel-collector --type=json -p='[{"op":"replace","path":"/spec/template/spec/containers/0/args","value":["--config=/conf/otel-collector-config.yaml","--feature-gates=-pkg.translator.prometheus.NormalizeName"]}]' + # OpAMP Configuration - Enabled for dynamic configuration management + # Note: OpAMP allows remote configuration management via SigNoz backend + # This replaces the manual kubectl patch approach + + # Init containers for the Otel Collector pod + initContainers: + fix-postgres-tls: + enabled: true + image: + registry: docker.io + repository: busybox + tag: 1.35 + pullPolicy: IfNotPresent + command: + - sh + - -c + - | + echo "Fixing PostgreSQL TLS file permissions..." 
+ cp /etc/postgres-tls-source/* /etc/postgres-tls/ + chmod 600 /etc/postgres-tls/server-key.pem + chmod 644 /etc/postgres-tls/server-cert.pem + chmod 644 /etc/postgres-tls/ca-cert.pem + echo "PostgreSQL TLS permissions fixed" + volumeMounts: + - name: postgres-tls-source + mountPath: /etc/postgres-tls-source + readOnly: true + - name: postgres-tls-fixed + mountPath: /etc/postgres-tls + readOnly: false # Service configuration - expose both gRPC and HTTP endpoints service: @@ -183,6 +213,44 @@ otelCollector: cpu: 500m memory: 512Mi + # Additional environment variables for receivers + additionalEnvs: + POSTGRES_MONITOR_USER: "monitoring" + POSTGRES_MONITOR_PASSWORD: "monitoring_369f9c001f242b07ef9e2826e17169ca" + REDIS_PASSWORD: "OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k" + RABBITMQ_USER: "bakery" + RABBITMQ_PASSWORD: "forecast123" + + # Mount TLS certificates for secure connections + extraVolumes: + - name: redis-tls + secret: + secretName: redis-tls-secret + - name: postgres-tls + secret: + secretName: postgres-tls + - name: postgres-tls-fixed + emptyDir: {} + + extraVolumeMounts: + - name: redis-tls + mountPath: /etc/redis-tls + readOnly: true + - name: postgres-tls + mountPath: /etc/postgres-tls-source + readOnly: true + - name: postgres-tls-fixed + mountPath: /etc/postgres-tls + readOnly: false + + # Enable OpAMP for dynamic configuration management + command: + name: /signoz-otel-collector + extraArgs: + - --config=/conf/otel-collector-config.yaml + - --manager-config=/conf/otel-collector-opamp-config.yaml + - --feature-gates=-pkg.translator.prometheus.NormalizeName + # OpenTelemetry Collector configuration config: # Connectors - bridge between pipelines @@ -208,62 +276,267 @@ otelCollector: - "*" # PostgreSQL receivers for database metrics - # DISABLED: Monitor users not configured yet - # Collects metrics directly from PostgreSQL databases - # postgresql/auth: - # endpoint: auth-db-service.bakery-ia:5432 - # username: ${POSTGRES_MONITOR_USER} - # password: 
${POSTGRES_MONITOR_PASSWORD} - # databases: - # - auth_db - # collection_interval: 60s - # tls: - # insecure: false + # ENABLED: Monitor users configured and credentials stored in secrets + # Collects metrics directly from PostgreSQL databases with proper TLS + postgresql/auth: + endpoint: auth-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - auth_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem - # postgresql/inventory: - # endpoint: inventory-db-service.bakery-ia:5432 - # username: ${POSTGRES_MONITOR_USER} - # password: ${POSTGRES_MONITOR_PASSWORD} - # databases: - # - inventory_db - # collection_interval: 60s - # tls: - # insecure: false + postgresql/inventory: + endpoint: inventory-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - inventory_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem - # postgresql/orders: - # endpoint: orders-db-service.bakery-ia:5432 - # username: ${POSTGRES_MONITOR_USER} - # password: ${POSTGRES_MONITOR_PASSWORD} - # databases: - # - orders_db - # collection_interval: 60s - # tls: - # insecure: false + postgresql/orders: + endpoint: orders-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - orders_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem - # Add more PostgreSQL databases as needed - # postgresql/SERVICE: - # endpoint: SERVICE-db-service.bakery-ia:5432 - # ... 
+ postgresql/ai-insights: + endpoint: ai-insights-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - ai_insights_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/alert-processor: + endpoint: alert-processor-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - alert_processor_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/distribution: + endpoint: distribution-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - distribution_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/external: + endpoint: external-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - external_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/forecasting: + endpoint: forecasting-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - forecasting_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/notification: + endpoint: notification-db-service.bakery-ia:5432 + 
username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - notification_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/orchestrator: + endpoint: orchestrator-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - orchestrator_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/pos: + endpoint: pos-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - pos_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/procurement: + endpoint: procurement-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - procurement_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/production: + endpoint: production-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - production_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/recipes: + endpoint: recipes-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - recipes_db + 
collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/sales: + endpoint: sales-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - sales_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/suppliers: + endpoint: suppliers-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - suppliers_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/tenant: + endpoint: tenant-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - tenant_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem + + postgresql/training: + endpoint: training-db-service.bakery-ia:5432 + username: ${env:POSTGRES_MONITOR_USER} + password: ${env:POSTGRES_MONITOR_PASSWORD} + databases: + - training_db + collection_interval: 60s + tls: + insecure: false + cert_file: /etc/postgres-tls/server-cert.pem + key_file: /etc/postgres-tls/server-key.pem + ca_file: /etc/postgres-tls/ca-cert.pem # Redis receiver for cache metrics - # DISABLED: TLS certificates not configured yet - # redis: - # endpoint: redis-service.bakery-ia:6379 - # password: ${REDIS_PASSWORD} - # collection_interval: 60s - # tls: - # insecure: false - # cert_file: /etc/redis-tls/redis-cert.pem - # key_file: /etc/redis-tls/redis-key.pem - # ca_file: 
/etc/redis-tls/ca-cert.pem + # ENABLED: Using existing credentials from redis-secrets with TLS + redis: + endpoint: redis-service.bakery-ia:6379 + password: ${env:REDIS_PASSWORD} + collection_interval: 60s + transport: tcp + tls: + insecure_skip_verify: false + cert_file: /etc/redis-tls/redis-cert.pem + key_file: /etc/redis-tls/redis-key.pem + ca_file: /etc/redis-tls/ca-cert.pem + metrics: + redis.maxmemory: + enabled: true + redis.cmd.latency: + enabled: true # RabbitMQ receiver via management API - # DISABLED: RabbitMQ credentials not configured yet - # rabbitmq: - # endpoint: http://rabbitmq-service.bakery-ia:15672 - # username: ${RABBITMQ_USER} - # password: ${RABBITMQ_PASSWORD} - # collection_interval: 60s + # ENABLED: Using existing credentials from rabbitmq-secrets + rabbitmq: + endpoint: http://rabbitmq-service.bakery-ia:15672 + username: ${env:RABBITMQ_USER} + password: ${env:RABBITMQ_PASSWORD} + collection_interval: 30s processors: # Batch processor for better performance (optimized for high throughput) @@ -363,7 +636,14 @@ otelCollector: # Metrics pipeline metrics: - receivers: [otlp] # Database/cache receivers disabled until credentials configured + receivers: [otlp, + postgresql/auth, postgresql/inventory, postgresql/orders, + postgresql/ai-insights, postgresql/alert-processor, postgresql/distribution, + postgresql/external, postgresql/forecasting, postgresql/notification, + postgresql/orchestrator, postgresql/pos, postgresql/procurement, + postgresql/production, postgresql/recipes, postgresql/sales, + postgresql/suppliers, postgresql/tenant, postgresql/training, + redis, rabbitmq] processors: [memory_limiter, batch, resourcedetection] exporters: [signozclickhousemetrics] diff --git a/infrastructure/kubernetes/base/configmap.yaml b/infrastructure/kubernetes/base/configmap.yaml index 2e3b3fb8..b322469d 100644 --- a/infrastructure/kubernetes/base/configmap.yaml +++ b/infrastructure/kubernetes/base/configmap.yaml @@ -390,6 +390,7 @@ data: 
OTEL_EXPORTER_OTLP_PROTOCOL: "grpc" OTEL_SERVICE_NAME: "bakery-ia" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development" + OTEL_LOGS_EXPORTER: "otlp" # SigNoz Endpoints (v0.106.0+ unified service) SIGNOZ_ENDPOINT: "http://signoz.bakery-ia.svc.cluster.local:8080" diff --git a/infrastructure/kubernetes/base/configs/postgres-init-config.yaml b/infrastructure/kubernetes/base/configs/postgres-init-config.yaml index 1be6c62c..46a7d0ba 100644 --- a/infrastructure/kubernetes/base/configs/postgres-init-config.yaml +++ b/infrastructure/kubernetes/base/configs/postgres-init-config.yaml @@ -8,6 +8,26 @@ metadata: app.kubernetes.io/part-of: bakery-ia data: init.sql: | + -- Create required extensions CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS "pg_stat_statements"; - CREATE EXTENSION IF NOT EXISTS "pgcrypto"; \ No newline at end of file + CREATE EXTENSION IF NOT EXISTS "pgcrypto"; + + -- Create monitoring user for SigNoz metrics collection + -- This user will be created only if it doesn't already exist + DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_user WHERE usename = 'monitoring') THEN + CREATE USER monitoring WITH PASSWORD 'monitoring_369f9c001f242b07ef9e2826e17169ca'; + GRANT pg_monitor TO monitoring; + GRANT SELECT ON pg_stat_database TO monitoring; + RAISE NOTICE 'Created monitoring user for SigNoz metrics collection'; + ELSE + -- User already exists, ensure it has the correct password and permissions + ALTER USER monitoring WITH PASSWORD 'monitoring_369f9c001f242b07ef9e2826e17169ca'; + GRANT pg_monitor TO monitoring; + GRANT SELECT ON pg_stat_database TO monitoring; + RAISE NOTICE 'Updated monitoring user permissions for SigNoz metrics collection'; + END IF; + END $$ + ; \ No newline at end of file diff --git a/infrastructure/kubernetes/base/secrets.yaml b/infrastructure/kubernetes/base/secrets.yaml index f44bf35f..ed0bf2a0 100644 --- a/infrastructure/kubernetes/base/secrets.yaml +++ 
b/infrastructure/kubernetes/base/secrets.yaml @@ -71,6 +71,10 @@ data: AI_INSIGHTS_DATABASE_URL: cG9zdGdyZXNxbCthc3luY3BnOi8vYWlfaW5zaWdodHNfdXNlcjphaV9pbnNpZ2h0c19wYXNzMTIzQGFpLWluc2lnaHRzLWRiLXNlcnZpY2U6NTQzMi9haV9pbnNpZ2h0c19kYg== # postgresql+asyncpg://ai_insights_user:ai_insights_pass123@ai-insights-db-service:5432/ai_insights_db DISTRIBUTION_DATABASE_URL: cG9zdGdyZXNxbCthc3luY3BnOi8vZGlzdHJpYnV0aW9uX3VzZXI6ZGlzdHJpYnV0aW9uX3Bhc3MxMjNAZGlzdHJpYnV0aW9uLWRiLXNlcnZpY2U6NTQzMi9kaXN0cmlidXRpb25fZGI= # postgresql+asyncpg://distribution_user:distribution_pass123@distribution-db-service:5432/distribution_db + # PostgreSQL Monitoring User (for SigNoz metrics collection) + POSTGRES_MONITOR_USER: bW9uaXRvcmluZw== # monitoring + POSTGRES_MONITOR_PASSWORD: bW9uaXRvcmluZ18zNjlmOWMwMDFmMjQyYjA3ZWY5ZTI4MjZlMTcxNjljYQ== # monitoring_369f9c001f242b07ef9e2826e17169ca + # Redis URL REDIS_URL: cmVkaXM6Ly86T3hkbWRKamRWTlhwMzdNTkMySUZvTW5UcGZHR0Z2MWtAcmVkaXMtc2VydmljZTo2Mzc5LzA= # redis://:OxdmdJjdVNXp37MNC2IFoMnTpfGGFv1k@redis-service:6379/0 diff --git a/infrastructure/scripts/create-pg-monitoring-users.sh b/infrastructure/scripts/create-pg-monitoring-users.sh deleted file mode 100755 index 9b955b86..00000000 --- a/infrastructure/scripts/create-pg-monitoring-users.sh +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash -# Create monitoring users in all PostgreSQL databases for SigNoz metrics collection -# -# This script creates a 'monitoring' user with pg_monitor role in each PostgreSQL database -# Based on: https://signoz.io/docs/integrations/postgresql/ -# -# Usage: ./create-pg-monitoring-users.sh - -set -e - -NAMESPACE="bakery-ia" -MONITORING_USER="monitoring" -MONITORING_PASSWORD="monitoring_$(openssl rand -hex 16)" - -# List of all PostgreSQL database deployments -DATABASES=( - "auth-db" - "inventory-db" - "orders-db" - "ai-insights-db" - "alert-processor-db" - "demo-session-db" - "distribution-db" - "external-db" - "forecasting-db" - "notification-db" - "orchestrator-db" - "pos-db" - 
"procurement-db" - "production-db" - "recipes-db" - "sales-db" - "suppliers-db" - "tenant-db" - "training-db" -) - -echo "==================================================" -echo "PostgreSQL Monitoring User Setup for SigNoz" -echo "==================================================" -echo "" -echo "This script will create a monitoring user in all PostgreSQL databases" -echo "User: $MONITORING_USER" -echo "Password: $MONITORING_PASSWORD" -echo "" -echo "IMPORTANT: Save this password! You'll need it for SigNoz configuration." -echo "" -read -p "Press Enter to continue or Ctrl+C to cancel..." - -SUCCESS_COUNT=0 -FAILED_COUNT=0 -FAILED_DBS=() - -for db in "${DATABASES[@]}"; do - echo "" - echo "Processing: $db" - echo "---" - - # Create monitoring user with pg_monitor role (PostgreSQL 10+) - if kubectl exec -n $NAMESPACE deployment/$db -- psql -U postgres -c " - DO \$\$ - BEGIN - -- Try to create the user - CREATE USER $MONITORING_USER WITH PASSWORD '$MONITORING_PASSWORD'; - RAISE NOTICE 'User created successfully'; - EXCEPTION - WHEN duplicate_object THEN - -- User already exists, update password - ALTER USER $MONITORING_USER WITH PASSWORD '$MONITORING_PASSWORD'; - RAISE NOTICE 'User already exists, password updated'; - END - \$\$; - - -- Grant pg_monitor role (PostgreSQL 10+) - GRANT pg_monitor TO $MONITORING_USER; - - -- Grant SELECT on pg_stat_database - GRANT SELECT ON pg_stat_database TO $MONITORING_USER; - - -- Verify permissions - SELECT - r.rolname as role_name, - ARRAY_AGG(b.rolname) as granted_roles - FROM pg_auth_members m - JOIN pg_roles r ON (m.member = r.oid) - JOIN pg_roles b ON (m.roleid = b.oid) - WHERE r.rolname = '$MONITORING_USER' - GROUP BY r.rolname; - " 2>&1; then - echo "✅ SUCCESS: $db" - ((SUCCESS_COUNT++)) - else - echo "❌ FAILED: $db" - ((FAILED_COUNT++)) - FAILED_DBS+=("$db") - fi -done - -echo "" -echo "==================================================" -echo "Summary" -echo "==================================================" -echo 
"Successful: $SUCCESS_COUNT databases" -echo "Failed: $FAILED_COUNT databases" - -if [ $FAILED_COUNT -gt 0 ]; then - echo "" - echo "Failed databases:" - for db in "${FAILED_DBS[@]}"; do - echo " - $db" - done -fi - -echo "" -echo "==================================================" -echo "Next Steps" -echo "==================================================" -echo "" -echo "1. Create Kubernetes secret with monitoring credentials:" -echo "" -echo "kubectl create secret generic -n $NAMESPACE postgres-monitoring-secrets \\" -echo " --from-literal=POSTGRES_MONITOR_USER=$MONITORING_USER \\" -echo " --from-literal=POSTGRES_MONITOR_PASSWORD='$MONITORING_PASSWORD'" -echo "" -echo "2. Update infrastructure/helm/signoz-values-dev.yaml with PostgreSQL receivers" -echo "" -echo "3. Add environment variables to otelCollector configuration" -echo "" -echo "4. Run: helm upgrade signoz signoz/signoz -n $NAMESPACE -f infrastructure/helm/signoz-values-dev.yaml" -echo "" -echo "5. Apply OpAMP patch:" -echo "" -echo "kubectl patch deployment -n $NAMESPACE signoz-otel-collector --type=json -p='[" -echo " {\"op\":\"replace\",\"path\":\"/spec/template/spec/containers/0/args\",\"value\":[" -echo " \"--config=/conf/otel-collector-config.yaml\"," -echo " \"--feature-gates=-pkg.translator.prometheus.NormalizeName\"" -echo " ]}" -echo "]'" -echo "" -echo "==================================================" -echo "SAVE THIS INFORMATION!" -echo "==================================================" -echo "Username: $MONITORING_USER" -echo "Password: $MONITORING_PASSWORD" -echo "==================================================" diff --git a/infrastructure/signoz/dashboards/README.md b/infrastructure/signoz/dashboards/README.md new file mode 100644 index 00000000..0508fde9 --- /dev/null +++ b/infrastructure/signoz/dashboards/README.md @@ -0,0 +1,190 @@ +# SigNoz Dashboards for Bakery IA + +This directory contains comprehensive SigNoz dashboard configurations for monitoring the Bakery IA system. 
+ +## Available Dashboards + +### 1. Infrastructure Monitoring +- **File**: `infrastructure-monitoring.json` +- **Purpose**: Monitor Kubernetes infrastructure, pod health, and resource utilization +- **Key Metrics**: CPU usage, memory usage, network traffic, pod status, container health + +### 2. Application Performance +- **File**: `application-performance.json` +- **Purpose**: Monitor microservice performance and API metrics +- **Key Metrics**: Request rate, error rate, latency percentiles, endpoint performance + +### 3. Database Performance +- **File**: `database-performance.json` +- **Purpose**: Monitor PostgreSQL and Redis database performance +- **Key Metrics**: Connections, query execution time, cache hit ratio, locks, replication status + +### 4. API Performance +- **File**: `api-performance.json` +- **Purpose**: Monitor REST and GraphQL API performance +- **Key Metrics**: Request volume, response times, status codes, endpoint analysis + +### 5. Error Tracking +- **File**: `error-tracking.json` +- **Purpose**: Track and analyze system errors +- **Key Metrics**: Error rates, error distribution, recent errors, HTTP errors, database errors + +### 6. User Activity +- **File**: `user-activity.json` +- **Purpose**: Monitor user behavior and activity patterns +- **Key Metrics**: Active users, sessions, API calls per user, session duration + +### 7. System Health +- **File**: `system-health.json` +- **Purpose**: Overall system health monitoring +- **Key Metrics**: Availability, health scores, resource utilization, service status + +### 8. Alert Management +- **File**: `alert-management.json` +- **Purpose**: Monitor and manage system alerts +- **Key Metrics**: Active alerts, alert rates, alert distribution, firing alerts + +### 9. Log Analysis +- **File**: `log-analysis.json` +- **Purpose**: Search and analyze system logs +- **Key Metrics**: Log volume, error logs, log distribution, log search + +## How to Import Dashboards + +### Method 1: Using SigNoz UI + +1. 
**Access SigNoz UI**: Open your SigNoz instance in a web browser +2. **Navigate to Dashboards**: Go to the "Dashboards" section +3. **Import Dashboard**: Click on "Import Dashboard" button +4. **Upload JSON**: Select the JSON file from this directory +5. **Configure**: Adjust any variables or settings as needed +6. **Save**: Save the imported dashboard + +**Note**: The dashboards now use the correct SigNoz JSON schema with proper filter arrays. + +### Method 2: Using SigNoz API + +```bash +# Import a single dashboard +curl -X POST "http://:3301/api/v1/dashboards/import" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d @infrastructure-monitoring.json + +# Import all dashboards +for file in *.json; do + curl -X POST "http://:3301/api/v1/dashboards/import" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d @"$file" +done +``` + +### Method 3: Using Kubernetes ConfigMap + +```yaml +# Create a ConfigMap with all dashboards +kubectl create configmap signoz-dashboards \ + --from-file=infrastructure-monitoring.json \ + --from-file=application-performance.json \ + --from-file=database-performance.json \ + --from-file=api-performance.json \ + --from-file=error-tracking.json \ + --from-file=user-activity.json \ + --from-file=system-health.json \ + --from-file=alert-management.json \ + --from-file=log-analysis.json \ + -n signoz +``` + +## Dashboard Variables + +Most dashboards include variables that allow you to filter and customize the view: + +- **Namespace**: Filter by Kubernetes namespace (e.g., `bakery-ia`, `default`) +- **Service**: Filter by specific microservice +- **Severity**: Filter by error/alert severity +- **Environment**: Filter by deployment environment +- **Time Range**: Adjust the time window for analysis + +## Metrics Reference + +The dashboards use standard OpenTelemetry metrics. If you need to add custom metrics, ensure they are properly instrumented in your services. 
+ +## Troubleshooting + +### Dashboard Import Errors + +If you encounter errors when importing dashboards: + +1. **Validate JSON**: Ensure the JSON files are valid + ```bash + jq . infrastructure-monitoring.json + ``` + +2. **Check Metrics**: Verify that the metrics exist in your SigNoz instance + +3. **Adjust Time Range**: Try different time ranges if no data appears + +4. **Check Filters**: Ensure filters match your actual service names and tags + +### "e.filter is not a function" Error + +This error occurs when the dashboard JSON uses an incorrect filter format. The fix has been applied: + +**Before (incorrect)**: +```json +"filters": { + "namespace": "${namespace}" +} +``` + +**After (correct)**: +```json +"filters": [ + { + "key": "namespace", + "operator": "=", + "value": "${namespace}" + } +] +``` + +All dashboards in this directory now use the correct array format for filters. + +### Missing Data + +If dashboards show no data: + +1. **Verify Instrumentation**: Ensure your services are properly instrumented with OpenTelemetry +2. **Check Time Range**: Adjust the time range to include recent data +3. **Validate Metrics**: Confirm the metrics are being collected and stored +4. **Review Filters**: Check that filters match your actual deployment + +## Customization + +You can customize these dashboards by: + +1. **Editing JSON**: Modify the JSON files to add/remove panels or adjust queries +2. **Cloning in UI**: Clone existing dashboards and modify them in the SigNoz UI +3. **Adding Variables**: Add new variables for additional filtering options +4. **Adjusting Layout**: Change the grid layout and panel sizes + +## Best Practices + +1. **Regular Reviews**: Review dashboards regularly to ensure they meet your monitoring needs +2. **Alert Integration**: Set up alerts based on key metrics shown in these dashboards +3. **Team Access**: Share relevant dashboards with appropriate team members +4. 
**Documentation**: Document any custom metrics or specific monitoring requirements + +## Support + +For issues with these dashboards: + +1. Check the [SigNoz documentation](https://signoz.io/docs/) +2. Review the [Bakery IA monitoring guide](../SIGNOZ_COMPLETE_CONFIGURATION_GUIDE.md) +3. Consult the OpenTelemetry metrics specification + +## License + +These dashboard configurations are provided under the same license as the Bakery IA project. \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/alert-management.json b/infrastructure/signoz/dashboards/alert-management.json new file mode 100644 index 00000000..568ab9ad --- /dev/null +++ b/infrastructure/signoz/dashboards/alert-management.json @@ -0,0 +1,104 @@ +{ + "dashboard": { + "title": "Bakery IA - Alert Management", + "description": "Alert monitoring and management dashboard", + "tags": ["alerts", "monitoring", "management"], + "panels": [ + { + "title": "Active Alerts", + "type": "stat", + "query": { + "metric": "alerts_active", + "aggregate": "sum", + "filters": [ + { + "key": "severity", + "operator": "=", + "value": "${severity}" + }, + { + "key": "status", + "operator": "=", + "value": "firing" + } + ] + }, + "unit": "number" + }, + { + "title": "Alert Rate", + "type": "timeseries", + "query": { + "metric": "alerts_total", + "aggregate": "rate", + "filters": [ + { + "key": "severity", + "operator": "=", + "value": "${severity}" + } + ] + }, + "unit": "alerts/s" + }, + { + "title": "Alerts by Severity", + "type": "pie", + "query": { + "metric": "alerts_total", + "aggregate": "sum", + "groupBy": ["severity"], + "filters": [ + { + "key": "severity", + "operator": "=", + "value": "${severity}" + } + ] + } + }, + { + "title": "Alerts by Status", + "type": "pie", + "query": { + "metric": "alerts_total", + "aggregate": "sum", + "groupBy": ["status"], + "filters": [ + { + "key": "status", + "operator": "=", + "value": "${status}" + } + ] + } + } + ], + "variables": [ + { + "name": "severity", 
+ "label": "Severity", + "type": "dropdown", + "default": "*", + "values": ["*", "critical", "high", "medium", "low"] + }, + { + "name": "status", + "label": "Status", + "type": "dropdown", + "default": "*", + "values": ["*", "firing", "resolved", "acknowledged"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "15s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/api-performance.json b/infrastructure/signoz/dashboards/api-performance.json new file mode 100644 index 00000000..03a29d71 --- /dev/null +++ b/infrastructure/signoz/dashboards/api-performance.json @@ -0,0 +1,102 @@ +{ + "dashboard": { + "title": "Bakery IA - API Performance", + "description": "Comprehensive API performance monitoring for Bakery IA REST and GraphQL endpoints", + "tags": ["api", "performance", "rest", "graphql"], + "panels": [ + { + "title": "Request Volume", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "sum", + "groupBy": ["api"], + "filters": [ + { + "key": "api", + "operator": "=", + "value": "${api}" + } + ] + }, + "unit": "req/s" + }, + { + "title": "Error Rate", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "sum", + "groupBy": ["api", "status"], + "filters": [ + { + "key": "api", + "operator": "=", + "value": "${api}" + }, + { + "key": "status", + "operator": "=~", + "value": "5.." 
+ } + ] + }, + "unit": "req/s" + }, + { + "title": "Average Response Time", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_sum", + "aggregate": "avg", + "groupBy": ["api", "endpoint"], + "filters": [ + { + "key": "api", + "operator": "=", + "value": "${api}" + } + ] + }, + "unit": "seconds" + }, + { + "title": "P95 Latency", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_bucket", + "aggregate": "histogram_quantile", + "quantile": 0.95, + "groupBy": ["api", "endpoint"], + "filters": [ + { + "key": "api", + "operator": "=", + "value": "${api}" + } + ] + }, + "unit": "seconds" + } + ], + "variables": [ + { + "name": "api", + "label": "API Service", + "type": "dropdown", + "default": "*", + "values": ["*", "gateway-api", "auth-api", "inventory-api", "production-api", "forecasting-api", "procurement-api"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "15s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/application-performance.json b/infrastructure/signoz/dashboards/application-performance.json new file mode 100644 index 00000000..8f354463 --- /dev/null +++ b/infrastructure/signoz/dashboards/application-performance.json @@ -0,0 +1,101 @@ +{ + "dashboard": { + "title": "Bakery IA - Application Performance", + "description": "Application performance monitoring dashboard for Bakery IA microservices", + "tags": ["application", "performance", "apm"], + "panels": [ + { + "title": "Request Rate", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "sum", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "req/s" + }, + { + "title": "Error Rate", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "sum", + 
"groupBy": ["service", "status"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + }, + { + "key": "status", + "operator": "=~", + "value": "5.." + } + ] + }, + "unit": "req/s" + }, + { + "title": "Average Response Time", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_sum", + "aggregate": "avg", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "seconds" + }, + { + "title": "Throughput", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "rate", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "req/s" + } + ], + "variables": [ + { + "name": "service", + "label": "Service", + "type": "dropdown", + "default": "*", + "values": ["*", "auth-service", "gateway-service", "forecasting-service", "inventory-service", "production-service", "procurement-service"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "15s", + "time": { + "from": "now-30m", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/database-performance.json b/infrastructure/signoz/dashboards/database-performance.json new file mode 100644 index 00000000..f7b4fe3b --- /dev/null +++ b/infrastructure/signoz/dashboards/database-performance.json @@ -0,0 +1,101 @@ +{ + "dashboard": { + "title": "Bakery IA - Database Performance", + "description": "Comprehensive database performance monitoring for PostgreSQL and Redis", + "tags": ["database", "postgresql", "redis", "performance"], + "panels": [ + { + "title": "Database Connections", + "type": "timeseries", + "query": { + "metric": "pg_stat_activity_count", + "aggregate": "sum", + "groupBy": ["datname"], + "filters": [ + { + "key": "datname", + "operator": "=", + "value": "${database}" + } + ] + }, + 
"unit": "number" + }, + { + "title": "Active Queries", + "type": "timeseries", + "query": { + "metric": "pg_stat_activity_count", + "aggregate": "sum", + "groupBy": ["datname"], + "filters": [ + { + "key": "datname", + "operator": "=", + "value": "${database}" + }, + { + "key": "state", + "operator": "=", + "value": "active" + } + ] + }, + "unit": "number" + }, + { + "title": "Database Size", + "type": "timeseries", + "query": { + "metric": "pg_database_size_bytes", + "aggregate": "sum", + "groupBy": ["datname"], + "filters": [ + { + "key": "datname", + "operator": "=", + "value": "${database}" + } + ] + }, + "unit": "bytes" + }, + { + "title": "Query Execution Time", + "type": "timeseries", + "query": { + "metric": "pg_stat_statements_total_time", + "aggregate": "avg", + "groupBy": ["datname"], + "filters": [ + { + "key": "datname", + "operator": "=", + "value": "${database}" + } + ] + }, + "unit": "seconds" + } + ], + "variables": [ + { + "name": "database", + "label": "Database", + "type": "dropdown", + "default": "*", + "values": ["*", "postgresql", "redis"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/error-tracking.json b/infrastructure/signoz/dashboards/error-tracking.json new file mode 100644 index 00000000..3fbb14a6 --- /dev/null +++ b/infrastructure/signoz/dashboards/error-tracking.json @@ -0,0 +1,105 @@ +{ + "dashboard": { + "title": "Bakery IA - Error Tracking", + "description": "Comprehensive error tracking and analysis dashboard", + "tags": ["errors", "exceptions", "tracking"], + "panels": [ + { + "title": "Total Errors", + "type": "stat", + "query": { + "metric": "error_total", + "aggregate": "sum", + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "number" + }, + { + "title": "Error Rate", + "type": 
"timeseries", + "query": { + "metric": "error_total", + "aggregate": "rate", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "errors/s" + }, + { + "title": "HTTP 5xx Errors", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "sum", + "groupBy": ["service", "status"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + }, + { + "key": "status", + "operator": "=~", + "value": "5.." + } + ] + }, + "unit": "number" + }, + { + "title": "HTTP 4xx Errors", + "type": "timeseries", + "query": { + "metric": "http_server_requests_seconds_count", + "aggregate": "sum", + "groupBy": ["service", "status"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + }, + { + "key": "status", + "operator": "=~", + "value": "4.." + } + ] + }, + "unit": "number" + } + ], + "variables": [ + { + "name": "service", + "label": "Service", + "type": "dropdown", + "default": "*", + "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "15s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/index.json b/infrastructure/signoz/dashboards/index.json new file mode 100644 index 00000000..faf9b85a --- /dev/null +++ b/infrastructure/signoz/dashboards/index.json @@ -0,0 +1,213 @@ +{ + "name": "Bakery IA Dashboard Collection", + "description": "Complete set of SigNoz dashboards for Bakery IA monitoring", + "version": "1.0.0", + "author": "Bakery IA Team", + "license": "MIT", + "dashboards": [ + { + "id": "infrastructure-monitoring", + "name": "Infrastructure Monitoring", + "description": "Kubernetes infrastructure and resource monitoring", + "file": 
"infrastructure-monitoring.json", + "tags": ["infrastructure", "kubernetes", "system"], + "category": "infrastructure" + }, + { + "id": "application-performance", + "name": "Application Performance", + "description": "Microservice performance and API metrics", + "file": "application-performance.json", + "tags": ["application", "performance", "apm"], + "category": "performance" + }, + { + "id": "database-performance", + "name": "Database Performance", + "description": "PostgreSQL and Redis database monitoring", + "file": "database-performance.json", + "tags": ["database", "postgresql", "redis"], + "category": "database" + }, + { + "id": "api-performance", + "name": "API Performance", + "description": "REST and GraphQL API performance monitoring", + "file": "api-performance.json", + "tags": ["api", "rest", "graphql"], + "category": "api" + }, + { + "id": "error-tracking", + "name": "Error Tracking", + "description": "System error tracking and analysis", + "file": "error-tracking.json", + "tags": ["errors", "exceptions", "tracking"], + "category": "monitoring" + }, + { + "id": "user-activity", + "name": "User Activity", + "description": "User behavior and activity monitoring", + "file": "user-activity.json", + "tags": ["user", "activity", "behavior"], + "category": "user" + }, + { + "id": "system-health", + "name": "System Health", + "description": "Overall system health monitoring", + "file": "system-health.json", + "tags": ["system", "health", "overview"], + "category": "overview" + }, + { + "id": "alert-management", + "name": "Alert Management", + "description": "Alert monitoring and management", + "file": "alert-management.json", + "tags": ["alerts", "notifications", "management"], + "category": "alerts" + }, + { + "id": "log-analysis", + "name": "Log Analysis", + "description": "Log search and analysis", + "file": "log-analysis.json", + "tags": ["logs", "search", "analysis"], + "category": "logs" + } + ], + "categories": [ + { + "id": "infrastructure", + "name": 
"Infrastructure", + "description": "Kubernetes and system infrastructure monitoring" + }, + { + "id": "performance", + "name": "Performance", + "description": "Application and service performance monitoring" + }, + { + "id": "database", + "name": "Database", + "description": "Database performance and health monitoring" + }, + { + "id": "api", + "name": "API", + "description": "API performance and usage monitoring" + }, + { + "id": "monitoring", + "name": "Monitoring", + "description": "Error tracking and system monitoring" + }, + { + "id": "user", + "name": "User", + "description": "User activity and behavior monitoring" + }, + { + "id": "overview", + "name": "Overview", + "description": "System-wide overview and health dashboards" + }, + { + "id": "alerts", + "name": "Alerts", + "description": "Alert management and monitoring" + }, + { + "id": "logs", + "name": "Logs", + "description": "Log analysis and search" + } + ], + "usage": { + "import_methods": [ + "ui_import", + "api_import", + "kubernetes_configmap" + ], + "recommended_import_order": [ + "infrastructure-monitoring", + "system-health", + "application-performance", + "api-performance", + "database-performance", + "error-tracking", + "alert-management", + "log-analysis", + "user-activity" + ] + }, + "requirements": { + "signoz_version": ">= 0.10.0", + "opentelemetry_collector": ">= 0.45.0", + "metrics": [ + "container_cpu_usage_seconds_total", + "container_memory_working_set_bytes", + "http_server_requests_seconds_count", + "http_server_requests_seconds_sum", + "pg_stat_activity_count", + "pg_stat_statements_total_time", + "error_total", + "alerts_total", + "kube_pod_status_phase", + "container_network_receive_bytes_total", + "kube_pod_container_status_restarts_total", + "kube_pod_container_status_ready", + "container_fs_reads_total", + "kube_pod_status_phase", + "kube_pod_container_status_restarts_total", + "kube_pod_container_status_ready", + "container_fs_reads_total", + "kubernetes_events", + 
"http_server_requests_seconds_bucket", + "http_server_active_requests", + "http_server_up", + "db_query_duration_seconds_sum", + "db_connections_active", + "http_client_request_duration_seconds_count", + "http_client_request_duration_seconds_sum", + "graphql_execution_time_seconds", + "graphql_errors_total", + "pg_stat_database_blks_hit", + "pg_stat_database_xact_commit", + "pg_locks_count", + "pg_table_size_bytes", + "pg_stat_user_tables_seq_scan", + "redis_memory_used_bytes", + "redis_commands_processed_total", + "redis_keyspace_hits", + "pg_stat_database_deadlocks", + "pg_stat_database_conn_errors", + "pg_replication_lag_bytes", + "pg_replication_is_replica", + "active_users", + "user_sessions_total", + "api_calls_per_user", + "session_duration_seconds", + "system_availability", + "service_health_score", + "system_cpu_usage", + "system_memory_usage", + "service_availability", + "alerts_active", + "alerts_total", + "log_lines_total" + ] + }, + "support": { + "documentation": "https://signoz.io/docs/", + "bakery_ia_docs": "../SIGNOZ_COMPLETE_CONFIGURATION_GUIDE.md", + "issues": "https://github.com/your-repo/issues" + }, + "notes": { + "format_fix": "All dashboards have been updated to use the correct SigNoz JSON schema with proper filter arrays to resolve the 'e.filter is not a function' error.", + "compatibility": "Tested with SigNoz v0.10.0+ and OpenTelemetry Collector v0.45.0+", + "customization": "You can customize these dashboards by editing the JSON files or cloning them in the SigNoz UI" + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/infrastructure-monitoring.json b/infrastructure/signoz/dashboards/infrastructure-monitoring.json new file mode 100644 index 00000000..bb3c7301 --- /dev/null +++ b/infrastructure/signoz/dashboards/infrastructure-monitoring.json @@ -0,0 +1,105 @@ +{ + "dashboard": { + "title": "Bakery IA - Infrastructure Monitoring", + "description": "Comprehensive infrastructure monitoring dashboard for Bakery 
IA system", + "tags": ["infrastructure", "system", "kubernetes"], + "panels": [ + { + "title": "CPU Usage", + "type": "timeseries", + "query": { + "metric": "container_cpu_usage_seconds_total", + "aggregate": "sum", + "groupBy": ["namespace"], + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "bakery-ia" + } + ] + }, + "unit": "percent", + "yAxis": { + "min": 0, + "max": 100 + } + }, + { + "title": "Memory Usage", + "type": "timeseries", + "query": { + "metric": "container_memory_working_set_bytes", + "aggregate": "sum", + "groupBy": ["namespace"], + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "bakery-ia" + } + ] + }, + "unit": "bytes" + }, + { + "title": "Network Traffic", + "type": "timeseries", + "query": { + "metric": "container_network_receive_bytes_total", + "aggregate": "sum", + "groupBy": ["namespace"], + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "bakery-ia" + } + ] + }, + "unit": "bytes" + }, + { + "title": "Pod Status", + "type": "stat", + "query": { + "metric": "kube_pod_status_phase", + "aggregate": "count", + "groupBy": ["phase"], + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "bakery-ia" + }, + { + "key": "phase", + "operator": "=", + "value": "Running" + } + ] + }, + "unit": "number" + } + ], + "variables": [ + { + "name": "namespace", + "label": "Namespace", + "type": "dropdown", + "default": "bakery-ia", + "values": ["bakery-ia", "default", "kube-system"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/log-analysis.json b/infrastructure/signoz/dashboards/log-analysis.json new file mode 100644 index 00000000..e2b24f6b --- /dev/null +++ b/infrastructure/signoz/dashboards/log-analysis.json @@ -0,0 +1,99 @@ +{ + "dashboard": { + "title": "Bakery IA - Log Analysis", + 
"description": "Comprehensive log analysis and search dashboard", + "tags": ["logs", "analysis", "search"], + "panels": [ + { + "title": "Log Volume", + "type": "timeseries", + "query": { + "metric": "log_lines_total", + "aggregate": "sum", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "logs/s" + }, + { + "title": "Error Logs", + "type": "timeseries", + "query": { + "metric": "log_lines_total", + "aggregate": "sum", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + }, + { + "key": "level", + "operator": "=", + "value": "error" + } + ] + }, + "unit": "logs/s" + }, + { + "title": "Logs by Level", + "type": "pie", + "query": { + "metric": "log_lines_total", + "aggregate": "sum", + "groupBy": ["level"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + } + }, + { + "title": "Logs by Service", + "type": "pie", + "query": { + "metric": "log_lines_total", + "aggregate": "sum", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + } + } + ], + "variables": [ + { + "name": "service", + "label": "Service", + "type": "dropdown", + "default": "*", + "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service", "forecasting-service"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/system-health.json b/infrastructure/signoz/dashboards/system-health.json new file mode 100644 index 00000000..f70fb48f --- /dev/null +++ b/infrastructure/signoz/dashboards/system-health.json @@ -0,0 +1,92 @@ +{ + "dashboard": { + "title": "Bakery IA - System Health", + "description": "Comprehensive system health monitoring dashboard", + 
"tags": ["system", "health", "monitoring"], + "panels": [ + { + "title": "System Availability", + "type": "stat", + "query": { + "metric": "system_availability", + "aggregate": "avg", + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "${namespace}" + } + ] + }, + "unit": "percent" + }, + { + "title": "Service Health Score", + "type": "stat", + "query": { + "metric": "service_health_score", + "aggregate": "avg", + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "${namespace}" + } + ] + }, + "unit": "number" + }, + { + "title": "CPU Usage", + "type": "timeseries", + "query": { + "metric": "system_cpu_usage", + "aggregate": "avg", + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "${namespace}" + } + ] + }, + "unit": "percent" + }, + { + "title": "Memory Usage", + "type": "timeseries", + "query": { + "metric": "system_memory_usage", + "aggregate": "avg", + "filters": [ + { + "key": "namespace", + "operator": "=", + "value": "${namespace}" + } + ] + }, + "unit": "percent" + } + ], + "variables": [ + { + "name": "namespace", + "label": "Namespace", + "type": "dropdown", + "default": "bakery-ia", + "values": ["bakery-ia", "default"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/dashboards/user-activity.json b/infrastructure/signoz/dashboards/user-activity.json new file mode 100644 index 00000000..e4d4d9e6 --- /dev/null +++ b/infrastructure/signoz/dashboards/user-activity.json @@ -0,0 +1,96 @@ +{ + "dashboard": { + "title": "Bakery IA - User Activity", + "description": "User activity and behavior monitoring dashboard", + "tags": ["user", "activity", "behavior"], + "panels": [ + { + "title": "Active Users", + "type": "timeseries", + "query": { + "metric": "active_users", + "aggregate": "sum", + "groupBy": ["service"], + "filters": [ + { + 
"key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "number" + }, + { + "title": "User Sessions", + "type": "timeseries", + "query": { + "metric": "user_sessions_total", + "aggregate": "sum", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "number" + }, + { + "title": "API Calls per User", + "type": "timeseries", + "query": { + "metric": "api_calls_per_user", + "aggregate": "avg", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "number" + }, + { + "title": "Session Duration", + "type": "timeseries", + "query": { + "metric": "session_duration_seconds", + "aggregate": "avg", + "groupBy": ["service"], + "filters": [ + { + "key": "service", + "operator": "=", + "value": "${service}" + } + ] + }, + "unit": "seconds" + } + ], + "variables": [ + { + "name": "service", + "label": "Service", + "type": "dropdown", + "default": "*", + "values": ["*", "auth-service", "gateway-service", "inventory-service", "production-service"] + } + ], + "layout": { + "type": "grid", + "columns": 12, + "gap": [16, 16] + }, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + } + } +} \ No newline at end of file diff --git a/infrastructure/signoz/import-dashboards.sh b/infrastructure/signoz/import-dashboards.sh new file mode 100755 index 00000000..06e112af --- /dev/null +++ b/infrastructure/signoz/import-dashboards.sh @@ -0,0 +1,175 @@ +#!/bin/bash + +# SigNoz Dashboard Importer for Bakery IA +# This script imports all SigNoz dashboards into your SigNoz instance + +# Configuration +SIGNOZ_HOST="localhost" +SIGNOZ_PORT="3301" +SIGNOZ_API_KEY="" # Add your API key if authentication is required +DASHBOARDS_DIR="infrastructure/signoz/dashboards" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to 
display help +show_help() { + echo "Usage: $0 [options]" + echo "" + echo "Options: + -h, --host SigNoz host (default: localhost) + -p, --port SigNoz port (default: 3301) + -k, --api-key SigNoz API key (if required) + -d, --dir Dashboards directory (default: infrastructure/signoz/dashboards) + --help Show this help message" + echo "" + echo "Example: + $0 --host signoz.example.com --port 3301 --api-key your-api-key" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -h|--host) + SIGNOZ_HOST="$2" + shift 2 + ;; + -p|--port) + SIGNOZ_PORT="$2" + shift 2 + ;; + -k|--api-key) + SIGNOZ_API_KEY="$2" + shift 2 + ;; + -d|--dir) + DASHBOARDS_DIR="$2" + shift 2 + ;; + --help) + show_help + exit 0 + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Check if dashboards directory exists +if [ ! -d "$DASHBOARDS_DIR" ]; then + echo -e "${RED}Error: Dashboards directory not found: $DASHBOARDS_DIR${NC}" + exit 1 +fi + +# Check if jq is installed for JSON validation +if ! command -v jq &> /dev/null; then + echo -e "${YELLOW}Warning: jq not found. Skipping JSON validation.${NC}" + VALIDATE_JSON=false +else + VALIDATE_JSON=true +fi + +# Function to validate JSON +validate_json() { + local file="$1" + if [ "$VALIDATE_JSON" = true ]; then + if !
jq empty "$file" &> /dev/null; then + echo -e "${RED}Error: Invalid JSON in file: $file${NC}" + return 1 + fi + fi + return 0 +} + +# Function to import a single dashboard +import_dashboard() { + local file="$1" + local filename=$(basename "$file") + local dashboard_name=$(jq -r '.name' "$file" 2>/dev/null || echo "Unknown") + + echo -e "${BLUE}Importing dashboard: $dashboard_name ($filename)${NC}" + + # Prepare curl command + local curl_cmd="curl -s -X POST http://$SIGNOZ_HOST:$SIGNOZ_PORT/api/v1/dashboards/import" + + if [ -n "$SIGNOZ_API_KEY" ]; then + curl_cmd="$curl_cmd -H \"Authorization: Bearer $SIGNOZ_API_KEY\"" + fi + + curl_cmd="$curl_cmd -H \"Content-Type: application/json\" -d @\"$file\"" + + # Execute import + local response=$(eval "$curl_cmd") + + # Check response + if echo "$response" | grep -q "success"; then + echo -e "${GREEN}✓ Successfully imported: $dashboard_name${NC}" + return 0 + else + echo -e "${RED}✗ Failed to import: $dashboard_name${NC}" + echo "Response: $response" + return 1 + fi +} + +# Main import process +echo -e "${YELLOW}=== SigNoz Dashboard Importer for Bakery IA ===${NC}" +echo -e "${BLUE}Configuration:${NC}" +echo " Host: $SIGNOZ_HOST" +echo " Port: $SIGNOZ_PORT" +echo " Dashboards Directory: $DASHBOARDS_DIR" +if [ -n "$SIGNOZ_API_KEY" ]; then + echo " API Key: ******** (set)" +else + echo " API Key: Not configured" +fi +echo "" + +# Count dashboards +DASHBOARD_COUNT=$(find "$DASHBOARDS_DIR" -name "*.json" | wc -l) +echo -e "${BLUE}Found $DASHBOARD_COUNT dashboards to import${NC}" +echo "" + +# Import each dashboard +SUCCESS_COUNT=0 +FAILURE_COUNT=0 + +for file in "$DASHBOARDS_DIR"/*.json; do + if [ -f "$file" ]; then + # Validate JSON + if validate_json "$file"; then + if import_dashboard "$file"; then + ((SUCCESS_COUNT++)) + else + ((FAILURE_COUNT++)) + fi + else + ((FAILURE_COUNT++)) + fi + echo "" + fi +done + +# Summary +echo -e "${YELLOW}=== Import Summary ===${NC}" +echo -e "${GREEN}Successfully imported: $SUCCESS_COUNT 
dashboards${NC}" +if [ $FAILURE_COUNT -gt 0 ]; then + echo -e "${RED}Failed to import: $FAILURE_COUNT dashboards${NC}" +fi +echo "" + +if [ $FAILURE_COUNT -eq 0 ]; then + echo -e "${GREEN}All dashboards imported successfully!${NC}" + echo "You can now access them in your SigNoz UI at:" + echo "http://$SIGNOZ_HOST:$SIGNOZ_PORT/dashboards" +else + echo -e "${YELLOW}Some dashboards failed to import. Check the errors above.${NC}" + exit 1 +fi \ No newline at end of file diff --git a/services/ai_insights/app/main.py b/services/ai_insights/app/main.py index 80337206..2835d5c6 100644 --- a/services/ai_insights/app/main.py +++ b/services/ai_insights/app/main.py @@ -48,6 +48,22 @@ tracer_provider = setup_tracing("ai-insights") setup_logging("ai-insights", getattr(settings, 'LOG_LEVEL', 'INFO')) logger = structlog.get_logger() +# Setup OpenTelemetry logging export if enabled +logger.info(f"OTEL_LOGS_EXPORTER env var: {os.getenv('OTEL_LOGS_EXPORTER', 'not set')}") +if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp": + try: + logger.info("Attempting to setup OpenTelemetry logging") + from shared.monitoring.logs_exporter import setup_otel_logging + result = setup_otel_logging("ai-insights", settings.SERVICE_VERSION) + if result: + logger.info("OpenTelemetry logs export enabled for ai-insights") + else: + logger.warning("OpenTelemetry logs export setup returned None") + except Exception as e: + logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True) +else: + logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp") + @asynccontextmanager async def lifespan(app: FastAPI): diff --git a/services/alert_processor/app/main.py b/services/alert_processor/app/main.py index a40ccd1c..296e5785 100644 --- a/services/alert_processor/app/main.py +++ b/services/alert_processor/app/main.py @@ -51,7 +51,24 @@ tracer_provider = setup_tracing("alert-processor") # Setup logging setup_logging("alert-processor", getattr(settings, 'LOG_LEVEL', 
'INFO')) -logger = structlog.get_logger() + +# Setup OpenTelemetry logging export if enabled +if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp": + try: + from shared.monitoring.logs_exporter import setup_otel_logging + result = setup_otel_logging("alert-processor", settings.VERSION) + if result: + logger = structlog.get_logger() + logger.info("OpenTelemetry logs export enabled for alert-processor") + else: + logger = structlog.get_logger() + logger.warning("OpenTelemetry logs export setup returned None") + except Exception as e: + logger = structlog.get_logger() + logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True) +else: + logger = structlog.get_logger() + logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp") # Global consumer instance consumer: EventConsumer = None diff --git a/services/demo_session/app/main.py b/services/demo_session/app/main.py index 504cbd2b..6af70095 100644 --- a/services/demo_session/app/main.py +++ b/services/demo_session/app/main.py @@ -49,7 +49,24 @@ tracer_provider = setup_tracing("demo-session") # Setup logging setup_logging("demo-session", getattr(settings, 'LOG_LEVEL', 'INFO')) -logger = structlog.get_logger() + +# Setup OpenTelemetry logging export if enabled +if os.getenv("OTEL_LOGS_EXPORTER", "").lower() == "otlp": + try: + from shared.monitoring.logs_exporter import setup_otel_logging + result = setup_otel_logging("demo-session", settings.VERSION) + if result: + logger = structlog.get_logger() + logger.info("OpenTelemetry logs export enabled for demo-session") + else: + logger = structlog.get_logger() + logger.warning("OpenTelemetry logs export setup returned None") + except Exception as e: + logger = structlog.get_logger() + logger.error(f"Failed to setup OpenTelemetry logs export: {e}", exc_info=True) +else: + logger = structlog.get_logger() + logger.info("OpenTelemetry logs export disabled - OTEL_LOGS_EXPORTER not set to otlp") # Initialize database db_manager = 
DatabaseManager() diff --git a/shared/monitoring/logs_exporter.py b/shared/monitoring/logs_exporter.py index 0dde34c3..87a25493 100644 --- a/shared/monitoring/logs_exporter.py +++ b/shared/monitoring/logs_exporter.py @@ -65,11 +65,27 @@ def setup_otel_logging( return None # Get OTLP endpoint from environment or parameter + # For logs, we need to use the HTTP endpoint (port 4318), not the gRPC endpoint (port 4317) if otel_endpoint is None: + # Try logs-specific endpoint first, then fall back to general OTLP endpoint otel_endpoint = os.getenv( - "OTEL_EXPORTER_OTLP_ENDPOINT", + "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT", os.getenv("OTEL_COLLECTOR_ENDPOINT", "http://signoz-otel-collector.bakery-ia:4318") ) + + logger.info(f"Original OTLP endpoint for logs: {otel_endpoint}") + + # If we got the tracing endpoint (4317), switch to logs endpoint (4318) + if otel_endpoint.endswith(":4317"): + logger.info("Converting tracing endpoint (4317) to logs endpoint (4318)") + otel_endpoint = otel_endpoint.replace(":4317", ":4318") + + logger.info(f"Final OTLP endpoint for logs: {otel_endpoint}") + + # Ensure endpoint has proper protocol prefix + if not otel_endpoint.startswith(("http://", "https://")): + # Default to HTTP for insecure connections + otel_endpoint = f"http://{otel_endpoint}" # Ensure endpoint has /v1/logs path for HTTP if not otel_endpoint.endswith("/v1/logs"):