Fix some issues

2026-01-25 20:07:37 +01:00
parent e0be1b22f9
commit 6c6a9fc58c
32 changed files with 1719 additions and 226 deletions
--- a/infrastructure/environments/common/configs/configmap.yaml
+++ b/infrastructure/environments/common/configs/configmap.yaml
@@ -387,6 +387,7 @@ data:
  VITE_PILOT_COUPON_CODE: "PILOT2025"
  VITE_PILOT_TRIAL_MONTHS: "3"
  VITE_STRIPE_PUBLISHABLE_KEY: "pk_test_51QuxKyIzCdnBmAVTGM8fvXYkItrBUILz6lHYwhAva6ZAH1HRi0e8zDRgZ4X3faN0zEABp5RHjCVBmMJL3aKXbaC200fFrSNnPl"
+  VITE_STRIPE_ACCOUNT_ID: "acct_1QuxKsIucMC6K1cg"

  # ================================================================
  # LOCATION SETTINGS (Nominatim Geocoding)
--- a/infrastructure/environments/prod/k8s-manifests/kustomization.yaml
+++ b/infrastructure/environments/prod/k8s-manifests/kustomization.yaml
@@ -107,6 +107,12 @@ patches:
      - op: add
        path: /data/VITE_ENVIRONMENT
        value: "production"
+      - op: replace
+        path: /data/VITE_STRIPE_PUBLISHABLE_KEY  # NOTE(review): value below carries the pk_test_ prefix and equals the common configmap value - confirm prod should ship a test key
+        value: "pk_test_51QuxKyIzCdnBmAVTGM8fvXYkItrBUILz6lHYwhAva6ZAH1HRi0e8zDRgZ4X3faN0zEABp5RHjCVBmMJL3aKXbaC200fFrSNnPl"
+      - op: add
+        path: /data/VITE_STRIPE_ACCOUNT_ID  # NOTE(review): same value the common configmap sets - confirm this is the production account
+        value: "acct_1QuxKsIucMC6K1cg"
  # Add imagePullSecrets to all Deployments for gitea registry authentication
  - target:
      kind: Deployment
--- a/infrastructure/monitoring/signoz/k8s-infra-values-dev.yaml
+++ b/infrastructure/monitoring/signoz/k8s-infra-values-dev.yaml
@@ -0,0 +1,53 @@
+# SigNoz k8s-infra Helm Chart Values - Development Environment
+# Collects Kubernetes infrastructure metrics and sends to SigNoz
+#
+# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
+# Install Command: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-dev.yaml
+
+# ============================================================================
+# OTEL COLLECTOR ENDPOINT
+# ============================================================================
+otelCollectorEndpoint: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
+otelInsecure: true
+clusterName: "bakery-ia-dev"
+
+# ============================================================================
+# PRESETS - Minimal configuration for development
+# ============================================================================
+presets:
+  hostMetrics:
+    enabled: true
+    collectionInterval: 60s  # Less frequent in dev
+
+  kubeletMetrics:
+    enabled: true
+    collectionInterval: 60s
+
+  kubernetesAttributes:
+    enabled: true
+
+  k8sEvents:
+    enabled: false  # Disabled in dev to reduce noise (chart key is k8sEvents)
+
+  logsCollection:
+    enabled: false
+
+# ============================================================================
+# OTEL AGENT - Minimal resources for dev
+# ============================================================================
+otelAgent:
+  enabled: true
+  resources:
+    requests:
+      memory: "128Mi"
+      cpu: "50m"
+    limits:
+      memory: "256Mi"
+      cpu: "250m"
+
+otelDeployment:
+  enabled: false  # Cluster-level collection (cluster metrics, events) stays off in dev
+
+commonLabels:
+  app.kubernetes.io/part-of: "signoz"
+  environment: "development"
--- a/infrastructure/monitoring/signoz/k8s-infra-values-prod.yaml
+++ b/infrastructure/monitoring/signoz/k8s-infra-values-prod.yaml
@@ -0,0 +1,76 @@
+# SigNoz k8s-infra Helm Chart Values - Production Environment
+# Collects ALL Kubernetes infrastructure metrics and sends to SigNoz
+#
+# This chart REPLACES the need for:
+# - kube-state-metrics (delete after deploying this)
+# - node-exporter (delete after deploying this)
+#
+# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
+#
+# Install Command:
+#   helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
+#
+# After install, remove redundant exporters:
+#   helm uninstall kube-state-metrics -n bakery-ia
+#   helm uninstall node-exporter-prometheus-node-exporter -n bakery-ia
+#   (or: helm uninstall prometheus -n bakery-ia if installed via prometheus stack)
+
+# ============================================================================
+# CONNECTION TO SIGNOZ
+# ============================================================================
+otelCollectorEndpoint: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
+otelInsecure: true
+clusterName: "bakery-ia-prod"
+
+# ============================================================================
+# PRESETS - What metrics to collect
+# ============================================================================
+presets:
+  # Host metrics: CPU, memory, disk, filesystem, network, load
+  # Replaces node-exporter
+  hostMetrics:
+    enabled: true
+    collectionInterval: 30s
+
+  # Kubelet metrics: Pod/container CPU, memory usage
+  # Essential for seeing resource usage per pod in SigNoz
+  kubeletMetrics:
+    enabled: true
+    collectionInterval: 30s
+
+  # Kubernetes cluster metrics: deployments, pods, nodes status
+  # Replaces kube-state-metrics
+  clusterMetrics:
+    enabled: true
+    collectionInterval: 30s
+
+  # Enriches all telemetry with k8s metadata (pod name, namespace, etc.)
+  kubernetesAttributes:
+    enabled: true
+
+  # Kubernetes events (pod scheduled, failed, etc.)
+  k8sEvents:
+    enabled: true
+
+  # Container logs - disabled (apps send logs via OTLP directly)
+  logsCollection:
+    enabled: false
+
+# ============================================================================
+# OTEL AGENT (DaemonSet) - Runs on each node
+# ============================================================================
+otelAgent:
+  enabled: true
+  resources:
+    requests:
+      memory: "256Mi"
+      cpu: "100m"
+    limits:
+      memory: "512Mi"
+      cpu: "500m"
+
+# ============================================================================
+# OTEL DEPLOYMENT - Required by the clusterMetrics and k8sEvents presets
+# ============================================================================
+otelDeployment:
+  enabled: true
--- a/infrastructure/monitoring/signoz/signoz-values-prod.yaml
+++ b/infrastructure/monitoring/signoz/signoz-values-prod.yaml
@@ -3,18 +3,21 @@
 # DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by SigNoz Helm chart
 #
 # Official Chart: https://github.com/SigNoz/charts
-# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
+# Install Command: helm upgrade --install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
+#
+# IMPORTANT: This chart works together with k8s-infra chart for infrastructure monitoring
+# Deploy k8s-infra after this: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
+#
+# MEMORY OPTIMIZATION NOTES:
+# - ClickHouse memory increased to 8Gi to prevent OOM errors
+# - Retention reduced to 3 days for traces, 7 days for metrics/logs

 global:
-  storageClass: "microk8s-hostpath"  # For MicroK8s, use "microk8s-hostpath" or custom storage class
+  storageClass: "microk8s-hostpath"
  clusterName: "bakery-ia-prod"
  domain: "monitoring.bakewise.ai"
-  # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)

 # Ingress configuration for SigNoz Frontend
-# Configured to use HTTPS with TLS termination at ingress controller
-# NOTE: SigNoz Helm chart expects ingress under "signoz.ingress", not "frontend.ingress"
-# Reference: https://github.com/SigNoz/charts/blob/main/charts/signoz/values.yaml
 signoz:
  ingress:
    enabled: true
@@ -39,56 +42,50 @@ signoz:
          - monitoring.bakewise.ai
        secretName: bakery-ia-prod-tls-cert

-# Resource configuration for production
-# Optimized for 8 CPU core VPS deployment
+# ============================================================================
+# CLICKHOUSE CONFIGURATION
+# Increased memory to 8Gi to prevent OOM errors (was 4Gi, causing code 241 errors)
+# ============================================================================
 clickhouse:
  persistence:
    size: 20Gi
  resources:
    requests:
-      memory: "2Gi"
-      cpu: "500m"
-    limits:
      memory: "4Gi"
      cpu: "1000m"
-
-otelCollector:
-  resources:
-    requests:
-      memory: "1Gi"
-      cpu: "500m"
    limits:
-      memory: "2Gi"
-      cpu: "1000m"
-  # Additional config for Kubernetes infrastructure metrics scraping
-  config:
-    receivers:
-      prometheus:
-        config:
-          scrape_configs:
-            # Kube-state-metrics - Kubernetes object metrics
-            - job_name: 'kube-state-metrics'
-              static_configs:
-                - targets: ['kube-state-metrics.bakery-ia.svc.cluster.local:8080']
-              scrape_interval: 30s
-              metric_relabel_configs:
-                - source_labels: [__name__]
-                  regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset|replicaset|job|cronjob|persistentvolume|persistentvolumeclaim|resourcequota|service|configmap|secret).*'
-                  action: keep
-            # Node-exporter - Host-level metrics
-            - job_name: 'node-exporter'
-              static_configs:
-                - targets: ['node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100']
-              scrape_interval: 30s
-              metric_relabel_configs:
-                - source_labels: [__name__]
-                  regex: 'node_(cpu|memory|disk|filesystem|network|load).*'
-                  action: keep
-    service:
-      pipelines:
-        metrics:
-          receivers: [otlp, prometheus]
+      memory: "8Gi"
+      cpu: "2000m"

+  # Server-level settings only (NOT user-level settings like max_threads)
+  # User-level settings must go in profiles section
+  settings:
+    # Max server memory usage: ~75% of the 8Gi container limit (6.4 GB)
+    max_server_memory_usage: "6400000000"
+    # Mark cache size (256MB)
+    mark_cache_size: "268435456"
+    # Uncompressed cache (256MB)
+    uncompressed_cache_size: "268435456"
+    # Max concurrent queries
+    max_concurrent_queries: "100"
+    # MergeTree setting (not a user profile setting): max total size of parts
+    # merged into one part when the merge pool has free resources (1 GiB)
+    merge_tree/max_bytes_to_merge_at_max_space_in_pool: "1073741824"
+
+  # User-level settings go in profiles, keyed as "<profile>/<setting>"
+  profiles:
+    # Max memory per query: 2GB
+    default/max_memory_usage: "2000000000"
+    # Max threads per query
+    default/max_threads: "4"
+
+  coldStorage:
+    enabled: false
+
+# ============================================================================
+# DATA RETENTION CONFIGURATION
+# Reduced retention to minimize storage and memory pressure
+# ============================================================================
 queryService:
  resources:
    requests:
@@ -97,7 +94,33 @@ queryService:
    limits:
      memory: "2Gi"
      cpu: "1000m"
+  # Retention configuration via environment variables (NOTE(review): confirm the query service reads these; otherwise set retention in SigNoz settings)
+  configVars:
+    # Trace retention: 3 days (72 hours)
+    SIGNOZ_TRACE_TTL_DURATION_HOURS: "72"
+    # Logs retention: 7 days (168 hours)
+    SIGNOZ_LOGS_TTL_DURATION_HOURS: "168"
+    # Metrics retention: 7 days (168 hours)
+    SIGNOZ_METRICS_TTL_DURATION_HOURS: "168"

+# ============================================================================
+# OTEL COLLECTOR CONFIGURATION (resource requests/limits only)
+# This collector receives data from:
+# - Application services (traces, logs, metrics via OTLP)
+# - k8s-infra chart (infrastructure metrics)
+# ============================================================================
+otelCollector:
+  resources:
+    requests:
+      memory: "1Gi"
+      cpu: "500m"
+    limits:
+      memory: "2Gi"
+      cpu: "1000m"
+
+# ============================================================================
+# ALERTMANAGER CONFIGURATION
+# ============================================================================
 alertmanager:
  resources:
    requests:
@@ -106,3 +129,17 @@ alertmanager:
    limits:
      memory: "1Gi"
      cpu: "500m"
+
+# ============================================================================
+# ZOOKEEPER CONFIGURATION - resource requests/limits and persistence size
+# ============================================================================
+zookeeper:
+  resources:
+    requests:
+      memory: "512Mi"
+      cpu: "250m"
+    limits:
+      memory: "1Gi"
+      cpu: "500m"
+  persistence:
+    size: 5Gi
--- a/infrastructure/testing/check_alert_flow.sh
+++ b/infrastructure/testing/check_alert_flow.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# Usage: ./check_alert_flow.sh <TENANT_ID>
+# Checks the complete alert flow for a demo session in production.
+#
+# Read-only diagnostic: each section is best-effort and prints a short note
+# instead of aborting when a component cannot be reached.
+#
+# Optional environment overrides:
+#   KUBECTL         kubectl command to run (default: "microk8s kubectl")
+#   REDIS_PASSWORD  Redis password; when unset it is read from cluster secrets
+
+# Use microk8s kubectl directly to avoid wrapper issues.
+# Expanded unquoted on purpose so the value can be a command plus arguments.
+KUBECTL="${KUBECTL:-microk8s kubectl}"
+
+TENANT_ID="${1:-your-default-tenant-id}"
+
+# TENANT_ID is interpolated into SQL text below, so reject any character that
+# could terminate the quoted literal (quotes, spaces, semicolons, ...).
+case "$TENANT_ID" in
+  *[!A-Za-z0-9_-]*)
+    echo "Invalid TENANT_ID '$TENANT_ID': only letters, digits, '-' and '_' are allowed" >&2
+    exit 1
+    ;;
+esac
+
+# Try different secret names for the Redis password (skipped when
+# REDIS_PASSWORD is already provided through the environment).
+for SECRET_NAME in redis-credentials redis-secret; do
+  [ -n "$REDIS_PASSWORD" ] && break
+  REDIS_PASSWORD=$($KUBECTL get secret "$SECRET_NAME" -n bakery-ia -o jsonpath='{.data.REDIS_PASSWORD}' 2>/dev/null | base64 -d 2>/dev/null)
+done
+if [ -z "$REDIS_PASSWORD" ]; then
+  # Last resort: built-in default. Warn so that an authentication failure in
+  # section 3 is not mistaken for "no active channels".
+  echo "WARNING: Redis password not found in secrets; using built-in default" >&2
+  REDIS_PASSWORD="redis_pass123"
+fi
+
+# Print the last COUNT lines of stdin matching PATTERN (case-insensitive ERE),
+# or FALLBACK when nothing matches. A bare "grep | tail || echo" never reaches
+# the echo because the pipeline's exit status is tail's, which is 0.
+filter_tail() {
+  local pattern="$1" count="$2" fallback="$3" matches
+  matches=$(grep -iE "$pattern" | tail -n "$count")
+  if [ -n "$matches" ]; then
+    printf '%s\n' "$matches"
+  else
+    echo "$fallback"
+  fi
+}
+
+echo "=========================================="
+echo "Alert Flow Health Check for Tenant: $TENANT_ID"
+echo "=========================================="
+
+echo -e "\n=== 1. RabbitMQ Queues ==="
+$KUBECTL exec -n bakery-ia deployment/rabbitmq -- \
+  rabbitmqctl list_queues name messages consumers 2>/dev/null | grep -E "alert|event" || echo "  No alert/event queues found or RabbitMQ not accessible"
+
+echo -e "\n=== 2. Alert Processor DB Events (last 10) ==="
+$KUBECTL exec -n bakery-ia deployment/alert-processor-db -- \
+  psql -U alert_processor_user -d alert_processor_db -c \
+  "SELECT event_type, priority_score, type_class, status, created_at
+   FROM events WHERE tenant_id = '$TENANT_ID'
+   ORDER BY created_at DESC LIMIT 10;" 2>/dev/null || echo "  Could not query alert-processor-db"
+
+echo -e "\n=== 3. Redis SSE Channels ==="
+# -F and --: match the tenant id literally, even if it starts with a dash.
+$KUBECTL exec -n bakery-ia deployment/redis -- \
+  redis-cli -a "$REDIS_PASSWORD" --no-auth-warning PUBSUB CHANNELS "*" 2>/dev/null | grep -iF -- "$TENANT_ID" || echo "  No active SSE channels for this tenant (channels only exist when frontend is connected)"
+
+echo -e "\n=== 4. Demo Session Status ==="
+$KUBECTL exec -n bakery-ia deployment/demo-session-db -- \
+  psql -U demo_session_user -d demo_session_db -c \
+  "SELECT id, status, virtual_tenant_id, demo_account_type, data_cloned, created_at, cloning_completed_at
+   FROM demo_sessions
+   WHERE virtual_tenant_id::text = '$TENANT_ID' OR id::text = '$TENANT_ID'
+   ORDER BY created_at DESC LIMIT 1;" 2>/dev/null || echo "  Could not query demo-session-db"
+
+echo -e "\n=== 5. Recent Alert Processor Logs ==="
+$KUBECTL logs -n bakery-ia deployment/alert-processor --tail=100 2>/dev/null | \
+  filter_tail "received|enriched|stored|error|consuming|rabbitmq|sse" 15 "  No relevant logs found"
+
+echo -e "\n=== 6. Gateway SSE Logs ==="
+$KUBECTL logs -n bakery-ia deployment/gateway --tail=100 2>/dev/null | \
+  filter_tail "sse|pubsub|events_stream" 10 "  No SSE logs found"
+
+echo -e "\n=== 7. Service Health ==="
+# Check deployment status directly
+for deploy in alert-processor gateway demo-session-service orchestrator-service inventory-service production-service procurement-service rabbitmq redis; do
+  READY=$($KUBECTL get "deployment/$deploy" -n bakery-ia -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
+  DESIRED=$($KUBECTL get "deployment/$deploy" -n bakery-ia -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "?")
+  if [ -z "$READY" ]; then READY="0"; fi
+  echo "  $deploy: $READY/$DESIRED ready"
+done
+
+echo -e "\n=== 8. RabbitMQ Connection Status ==="
+# Capture first: with "cmd | head || echo" the fallback could never fire.
+if CONNECTIONS=$($KUBECTL exec -n bakery-ia deployment/rabbitmq -- \
+  rabbitmqctl list_connections name state 2>/dev/null); then
+  printf '%s\n' "$CONNECTIONS" | head -10
+else
+  echo "  Could not list connections"
+fi
+
+echo -e "\n=== 9. Recent Demo Session Service Logs (enrichment) ==="
+$KUBECTL logs -n bakery-ia deployment/demo-session-service --tail=100 2>/dev/null | \
+  filter_tail "enrichment|alert|trigger|clone|post_clone" 10 "  No relevant logs found"
+
+echo -e "\n=== 10. Total Events in Alert Processor DB ==="
+$KUBECTL exec -n bakery-ia deployment/alert-processor-db -- \
+  psql -U alert_processor_user -d alert_processor_db -c \
+  "SELECT COUNT(*) as total_events,
+          COUNT(*) FILTER (WHERE tenant_id = '$TENANT_ID') as tenant_events
+   FROM events;" 2>/dev/null || echo "  Could not query"
+
+echo -e "\n=========================================="
+echo "Health Check Complete"
+echo "=========================================="