Fix some issues
This commit is contained in:
@@ -387,6 +387,7 @@ data:
|
||||
VITE_PILOT_COUPON_CODE: "PILOT2025"
|
||||
VITE_PILOT_TRIAL_MONTHS: "3"
|
||||
VITE_STRIPE_PUBLISHABLE_KEY: "pk_test_51QuxKyIzCdnBmAVTGM8fvXYkItrBUILz6lHYwhAva6ZAH1HRi0e8zDRgZ4X3faN0zEABp5RHjCVBmMJL3aKXbaC200fFrSNnPl"
|
||||
VITE_STRIPE_ACCOUNT_ID: "acct_1QuxKsIucMC6K1cg"
|
||||
|
||||
# ================================================================
|
||||
# LOCATION SETTINGS (Nominatim Geocoding)
|
||||
|
||||
@@ -107,6 +107,12 @@ patches:
|
||||
- op: add
|
||||
path: /data/VITE_ENVIRONMENT
|
||||
value: "production"
|
||||
- op: replace
|
||||
path: /data/VITE_STRIPE_PUBLISHABLE_KEY
|
||||
value: "pk_test_51QuxKyIzCdnBmAVTGM8fvXYkItrBUILz6lHYwhAva6ZAH1HRi0e8zDRgZ4X3faN0zEABp5RHjCVBmMJL3aKXbaC200fFrSNnPl"
|
||||
- op: add
|
||||
path: /data/VITE_STRIPE_ACCOUNT_ID
|
||||
value: "acct_1QuxKsIucMC6K1cg"
|
||||
# Add imagePullSecrets to all Deployments for gitea registry authentication
|
||||
- target:
|
||||
kind: Deployment
|
||||
|
||||
53
infrastructure/monitoring/signoz/k8s-infra-values-dev.yaml
Normal file
53
infrastructure/monitoring/signoz/k8s-infra-values-dev.yaml
Normal file
@@ -0,0 +1,53 @@
|
||||
# SigNoz k8s-infra Helm Chart Values - Development Environment
# Collects Kubernetes infrastructure metrics and sends them to SigNoz.
#
# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
# Install Command: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-dev.yaml

# ============================================================================
# OTEL COLLECTOR ENDPOINT
# ============================================================================
# In-cluster OTLP gRPC endpoint of the SigNoz collector (plaintext, no TLS).
otelCollectorEndpoint: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
otelInsecure: true
clusterName: "bakery-ia-dev"

# ============================================================================
# PRESETS - Minimal configuration for development
# ============================================================================
presets:
  hostMetrics:
    enabled: true
    collectionInterval: 60s  # Less frequent in dev

  kubeletMetrics:
    enabled: true
    collectionInterval: 60s

  kubernetesAttributes:
    enabled: true

  # Cluster-level metrics are served by the otelDeployment collector, which is
  # disabled below; keep the preset off explicitly so the intent is visible.
  # NOTE(review): confirm against the chart version in use.
  clusterMetrics:
    enabled: false

  # Disabled in dev to reduce noise.
  # The chart names this preset "k8sEvents"; an unknown key such as
  # "kubernetesEvents" is ignored by Helm and leaves the chart default in place.
  k8sEvents:
    enabled: false

  logsCollection:
    enabled: false

# ============================================================================
# OTEL AGENT - Minimal resources for dev
# ============================================================================
otelAgent:
  enabled: true
  resources:
    requests:
      memory: "128Mi"
      cpu: "50m"
    limits:
      memory: "256Mi"
      cpu: "250m"

otelDeployment:
  enabled: false

commonLabels:
  app.kubernetes.io/part-of: "signoz"
  environment: "development"
|
||||
76
infrastructure/monitoring/signoz/k8s-infra-values-prod.yaml
Normal file
76
infrastructure/monitoring/signoz/k8s-infra-values-prod.yaml
Normal file
@@ -0,0 +1,76 @@
|
||||
# SigNoz k8s-infra Helm Chart Values - Production Environment
# Collects ALL Kubernetes infrastructure metrics and sends them to SigNoz.
#
# This chart REPLACES the need for:
# - kube-state-metrics (delete after deploying this)
# - node-exporter (delete after deploying this)
#
# Official Chart: https://github.com/SigNoz/charts/tree/main/charts/k8s-infra
#
# Install Command:
#   helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
#
# After install, remove redundant exporters:
#   helm uninstall kube-state-metrics -n bakery-ia
#   helm uninstall node-exporter-prometheus-node-exporter -n bakery-ia
#   (or: helm uninstall prometheus -n bakery-ia if installed via prometheus stack)

# ============================================================================
# CONNECTION TO SIGNOZ
# ============================================================================
# In-cluster OTLP gRPC endpoint of the SigNoz collector (plaintext, no TLS).
otelCollectorEndpoint: "signoz-otel-collector.bakery-ia.svc.cluster.local:4317"
otelInsecure: true
clusterName: "bakery-ia-prod"

# ============================================================================
# PRESETS - What metrics to collect
# ============================================================================
presets:
  # Host metrics: CPU, memory, disk, filesystem, network, load
  # Replaces node-exporter
  hostMetrics:
    enabled: true
    collectionInterval: 30s

  # Kubelet metrics: Pod/container CPU, memory usage
  # Essential for seeing resource usage per pod in SigNoz
  kubeletMetrics:
    enabled: true
    collectionInterval: 30s

  # Kubernetes cluster metrics: deployments, pods, nodes status
  # Replaces kube-state-metrics
  # Collected by the otelDeployment collector (see below), not the DaemonSet.
  clusterMetrics:
    enabled: true
    collectionInterval: 30s

  # Enriches all telemetry with k8s metadata (pod name, namespace, etc.)
  kubernetesAttributes:
    enabled: true

  # Kubernetes events (pod scheduled, failed, etc.)
  # The chart names this preset "k8sEvents"; an unknown key such as
  # "kubernetesEvents" is silently ignored by Helm.
  # Collected by the otelDeployment collector (see below).
  k8sEvents:
    enabled: true

  # Container logs - disabled (apps send logs via OTLP directly)
  logsCollection:
    enabled: false

# ============================================================================
# OTEL AGENT (DaemonSet) - Runs on each node
# ============================================================================
otelAgent:
  enabled: true
  resources:
    requests:
      memory: "256Mi"
      cpu: "100m"
    limits:
      memory: "512Mi"
      cpu: "500m"

# ============================================================================
# OTEL DEPLOYMENT - Single collector for cluster-scoped data
# ============================================================================
# Must stay enabled: the clusterMetrics and k8sEvents presets above are served
# by this Deployment, so disabling it drops the kube-state-metrics replacement
# data and all Kubernetes events.
# NOTE(review): resource sizes are a starting point - tune after observing usage.
otelDeployment:
  enabled: true
  resources:
    requests:
      memory: "128Mi"
      cpu: "100m"
    limits:
      memory: "256Mi"
      cpu: "250m"
|
||||
@@ -3,18 +3,21 @@
|
||||
# DEPLOYED IN bakery-ia NAMESPACE - Ingress managed by SigNoz Helm chart
|
||||
#
|
||||
# Official Chart: https://github.com/SigNoz/charts
|
||||
# Install Command: helm install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
|
||||
# Install Command: helm upgrade --install signoz signoz/signoz -n bakery-ia -f signoz-values-prod.yaml
|
||||
#
|
||||
# IMPORTANT: This chart works together with k8s-infra chart for infrastructure monitoring
|
||||
# Deploy k8s-infra after this: helm upgrade --install k8s-infra signoz/k8s-infra -n bakery-ia -f k8s-infra-values-prod.yaml
|
||||
#
|
||||
# MEMORY OPTIMIZATION NOTES:
|
||||
# - ClickHouse memory increased to 8Gi to prevent OOM errors
|
||||
# - Retention reduced to 3 days for traces, 7 days for metrics/logs
|
||||
|
||||
global:
|
||||
storageClass: "microk8s-hostpath" # For MicroK8s, use "microk8s-hostpath" or custom storage class
|
||||
storageClass: "microk8s-hostpath"
|
||||
clusterName: "bakery-ia-prod"
|
||||
domain: "monitoring.bakewise.ai"
|
||||
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
|
||||
|
||||
# Ingress configuration for SigNoz Frontend
|
||||
# Configured to use HTTPS with TLS termination at ingress controller
|
||||
# NOTE: SigNoz Helm chart expects ingress under "signoz.ingress", not "frontend.ingress"
|
||||
# Reference: https://github.com/SigNoz/charts/blob/main/charts/signoz/values.yaml
|
||||
signoz:
|
||||
ingress:
|
||||
enabled: true
|
||||
@@ -39,56 +42,50 @@ signoz:
|
||||
- monitoring.bakewise.ai
|
||||
secretName: bakery-ia-prod-tls-cert
|
||||
|
||||
# Resource configuration for production
|
||||
# Optimized for 8 CPU core VPS deployment
|
||||
# ============================================================================
|
||||
# CLICKHOUSE CONFIGURATION
|
||||
# Increased memory to 8Gi to prevent OOM errors (was 4Gi, causing code 241 errors)
|
||||
# ============================================================================
|
||||
clickhouse:
|
||||
persistence:
|
||||
size: 20Gi
|
||||
resources:
|
||||
requests:
|
||||
memory: "2Gi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "4Gi"
|
||||
cpu: "1000m"
|
||||
|
||||
otelCollector:
|
||||
resources:
|
||||
requests:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "2Gi"
|
||||
cpu: "1000m"
|
||||
# Additional config for Kubernetes infrastructure metrics scraping
|
||||
config:
|
||||
receivers:
|
||||
prometheus:
|
||||
config:
|
||||
scrape_configs:
|
||||
# Kube-state-metrics - Kubernetes object metrics
|
||||
- job_name: 'kube-state-metrics'
|
||||
static_configs:
|
||||
- targets: ['kube-state-metrics.bakery-ia.svc.cluster.local:8080']
|
||||
scrape_interval: 30s
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset|replicaset|job|cronjob|persistentvolume|persistentvolumeclaim|resourcequota|service|configmap|secret).*'
|
||||
action: keep
|
||||
# Node-exporter - Host-level metrics
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets: ['node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100']
|
||||
scrape_interval: 30s
|
||||
metric_relabel_configs:
|
||||
- source_labels: [__name__]
|
||||
regex: 'node_(cpu|memory|disk|filesystem|network|load).*'
|
||||
action: keep
|
||||
service:
|
||||
pipelines:
|
||||
metrics:
|
||||
receivers: [otlp, prometheus]
|
||||
memory: "8Gi"
|
||||
cpu: "2000m"
|
||||
|
||||
# Server-level settings only (NOT user-level settings like max_threads)
|
||||
# User-level settings must go in profiles section
|
||||
settings:
|
||||
# Max server memory usage: 80% of container limit (6.4GB of 8GB)
|
||||
max_server_memory_usage: "6400000000"
|
||||
# Mark cache size (256MB)
|
||||
mark_cache_size: "268435456"
|
||||
# Uncompressed cache (256MB)
|
||||
uncompressed_cache_size: "268435456"
|
||||
# Max concurrent queries
|
||||
max_concurrent_queries: "100"
|
||||
|
||||
# User-level settings go in profiles
|
||||
profiles:
|
||||
default:
|
||||
# Max memory per query: 2GB
|
||||
max_memory_usage: "2000000000"
|
||||
# Max threads per query
|
||||
max_threads: "4"
|
||||
# Background merges memory limit
|
||||
max_bytes_to_merge_at_max_space_in_pool: "1073741824"
|
||||
|
||||
coldStorage:
|
||||
enabled: false
|
||||
|
||||
# ============================================================================
|
||||
# DATA RETENTION CONFIGURATION
|
||||
# Reduced retention to minimize storage and memory pressure
|
||||
# ============================================================================
|
||||
queryService:
|
||||
resources:
|
||||
requests:
|
||||
@@ -97,7 +94,33 @@ queryService:
|
||||
limits:
|
||||
memory: "2Gi"
|
||||
cpu: "1000m"
|
||||
# Retention configuration via environment variables
|
||||
configVars:
|
||||
# Trace retention: 3 days (72 hours)
|
||||
SIGNOZ_TRACE_TTL_DURATION_HOURS: "72"
|
||||
# Logs retention: 7 days (168 hours)
|
||||
SIGNOZ_LOGS_TTL_DURATION_HOURS: "168"
|
||||
# Metrics retention: 7 days (168 hours)
|
||||
SIGNOZ_METRICS_TTL_DURATION_HOURS: "168"
|
||||
|
||||
# ============================================================================
|
||||
# OTEL COLLECTOR CONFIGURATION
|
||||
# This collector receives data from:
|
||||
# - Application services (traces, logs, metrics via OTLP)
|
||||
# - k8s-infra chart (infrastructure metrics)
|
||||
# ============================================================================
|
||||
otelCollector:
|
||||
resources:
|
||||
requests:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
limits:
|
||||
memory: "2Gi"
|
||||
cpu: "1000m"
|
||||
|
||||
# ============================================================================
|
||||
# ALERTMANAGER CONFIGURATION
|
||||
# ============================================================================
|
||||
alertmanager:
|
||||
resources:
|
||||
requests:
|
||||
@@ -106,3 +129,17 @@ alertmanager:
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
|
||||
# ============================================================================
|
||||
# ZOOKEEPER CONFIGURATION
|
||||
# ============================================================================
|
||||
zookeeper:
|
||||
resources:
|
||||
requests:
|
||||
memory: "512Mi"
|
||||
cpu: "250m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
persistence:
|
||||
size: 5Gi
|
||||
|
||||
81
infrastructure/testing/check_alert_flow.sh
Normal file
81
infrastructure/testing/check_alert_flow.sh
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/bin/bash
# Usage: ./check_alert_flow.sh <TENANT_ID>
# Checks the complete alert flow for a demo session in production.
#
# Environment:
#   REDIS_PASSWORD  Optional. Overrides the password otherwise read from the
#                   redis-credentials / redis-secret Kubernetes secrets.
#
# The script is read-only and best-effort: every section prints a fallback
# message instead of aborting when a component cannot be reached.

# Use microk8s kubectl directly to avoid wrapper issues.
# Intentionally expanded unquoted below so it splits into "microk8s" "kubectl".
KUBECTL="microk8s kubectl"
NAMESPACE="bakery-ia"

TENANT_ID="${1:-your-default-tenant-id}"

if [ $# -lt 1 ]; then
    echo "WARNING: no TENANT_ID given; tenant-specific checks use the placeholder '$TENANT_ID'" >&2
fi

# TENANT_ID is interpolated into SQL text below, so reject anything that is
# not a plain identifier/UUID-like token to prevent SQL injection.
if [[ ! "$TENANT_ID" =~ ^[A-Za-z0-9_-]+$ ]]; then
    echo "ERROR: TENANT_ID may only contain letters, digits, '-' and '_'" >&2
    exit 1
fi

# Print stdin unchanged, or the given fallback message when stdin is empty.
# Needed because "cmd | tail -N || echo ..." never reaches the fallback:
# the pipeline's exit status is that of tail/head, which succeeds on no input.
print_or_fallback() {
    local fallback="$1" output
    output=$(cat)
    if [ -n "$output" ]; then
        printf '%s\n' "$output"
    else
        echo "  $fallback"
    fi
}

# Echo the Redis password from the first secret that provides one.
read_redis_password() {
    local secret_name password
    for secret_name in redis-credentials redis-secret; do
        password=$($KUBECTL get secret "$secret_name" -n "$NAMESPACE" \
            -o jsonpath='{.data.REDIS_PASSWORD}' 2>/dev/null | base64 -d 2>/dev/null)
        if [ -n "$password" ]; then
            printf '%s' "$password"
            return 0
        fi
    done
    return 1
}

# No hard-coded fallback password: credentials must not live in the repository.
if [ -z "${REDIS_PASSWORD:-}" ]; then
    REDIS_PASSWORD=$(read_redis_password)
fi

echo "=========================================="
echo "Alert Flow Health Check for Tenant: $TENANT_ID"
echo "=========================================="

echo -e "\n=== 1. RabbitMQ Queues ==="
$KUBECTL exec -n "$NAMESPACE" deployment/rabbitmq -- \
    rabbitmqctl list_queues name messages consumers 2>/dev/null \
    | grep -E "alert|event" \
    | print_or_fallback "No alert/event queues found or RabbitMQ not accessible"

echo -e "\n=== 2. Alert Processor DB Events (last 10) ==="
$KUBECTL exec -n "$NAMESPACE" deployment/alert-processor-db -- \
    psql -U alert_processor_user -d alert_processor_db -c \
    "SELECT event_type, priority_score, type_class, status, created_at
     FROM events WHERE tenant_id = '$TENANT_ID'
     ORDER BY created_at DESC LIMIT 10;" 2>/dev/null || echo "  Could not query alert-processor-db"

echo -e "\n=== 3. Redis SSE Channels ==="
if [ -n "$REDIS_PASSWORD" ]; then
    $KUBECTL exec -n "$NAMESPACE" deployment/redis -- \
        redis-cli -a "$REDIS_PASSWORD" --no-auth-warning PUBSUB CHANNELS "*" 2>/dev/null \
        | grep -iF -- "$TENANT_ID" \
        | print_or_fallback "No active SSE channels for this tenant (channels only exist when frontend is connected)"
else
    echo "  Skipped: Redis password not found in secrets (set REDIS_PASSWORD to override)"
fi

echo -e "\n=== 4. Demo Session Status ==="
$KUBECTL exec -n "$NAMESPACE" deployment/demo-session-db -- \
    psql -U demo_session_user -d demo_session_db -c \
    "SELECT id, status, virtual_tenant_id, demo_account_type, data_cloned, created_at, cloning_completed_at
     FROM demo_sessions
     WHERE virtual_tenant_id::text = '$TENANT_ID' OR id::text = '$TENANT_ID'
     ORDER BY created_at DESC LIMIT 1;" 2>/dev/null || echo "  Could not query demo-session-db"

echo -e "\n=== 5. Recent Alert Processor Logs ==="
$KUBECTL logs -n "$NAMESPACE" deployment/alert-processor --tail=100 2>/dev/null \
    | grep -iE "received|enriched|stored|error|consuming|rabbitmq|sse" \
    | tail -15 \
    | print_or_fallback "No relevant logs found"

echo -e "\n=== 6. Gateway SSE Logs ==="
$KUBECTL logs -n "$NAMESPACE" deployment/gateway --tail=100 2>/dev/null \
    | grep -iE "sse|pubsub|events_stream" \
    | tail -10 \
    | print_or_fallback "No SSE logs found"

echo -e "\n=== 7. Service Health ==="
# Check deployment status directly.
for deploy in alert-processor gateway demo-session-service orchestrator-service inventory-service production-service procurement-service rabbitmq redis; do
    READY=$($KUBECTL get "deployment/$deploy" -n "$NAMESPACE" -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0")
    DESIRED=$($KUBECTL get "deployment/$deploy" -n "$NAMESPACE" -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "?")
    # readyReplicas is absent from the status when no replica is ready.
    echo "  $deploy: ${READY:-0}/$DESIRED ready"
done

echo -e "\n=== 8. RabbitMQ Connection Status ==="
$KUBECTL exec -n "$NAMESPACE" deployment/rabbitmq -- \
    rabbitmqctl list_connections name state 2>/dev/null \
    | head -10 \
    | print_or_fallback "Could not list connections"

echo -e "\n=== 9. Recent Demo Session Service Logs (enrichment) ==="
$KUBECTL logs -n "$NAMESPACE" deployment/demo-session-service --tail=100 2>/dev/null \
    | grep -iE "enrichment|alert|trigger|clone|post_clone" \
    | tail -10 \
    | print_or_fallback "No relevant logs found"

echo -e "\n=== 10. Total Events in Alert Processor DB ==="
$KUBECTL exec -n "$NAMESPACE" deployment/alert-processor-db -- \
    psql -U alert_processor_user -d alert_processor_db -c \
    "SELECT COUNT(*) as total_events,
            COUNT(*) FILTER (WHERE tenant_id = '$TENANT_ID') as tenant_events
     FROM events;" 2>/dev/null || echo "  Could not query"

echo -e "\n=========================================="
echo "Health Check Complete"
echo "=========================================="
|
||||
Reference in New Issue
Block a user