Update monitoring packages to latest versions
- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1
- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
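For reference, the unified versions above can be pinned per service; a minimal sketch (each service's exact instrumentation set may differ):

  # Sketch: pin the unified monitoring package versions in a service
  pip install \
    "opentelemetry-api==1.39.1" \
    "opentelemetry-sdk==1.39.1" \
    "opentelemetry-exporter-otlp-proto-grpc==1.39.1" \
    "opentelemetry-exporter-otlp-proto-http==1.39.1" \
    "opentelemetry-instrumentation-fastapi==0.60b1" \
    "opentelemetry-instrumentation-httpx==0.60b1" \
    "opentelemetry-instrumentation-redis==0.60b1" \
    "opentelemetry-instrumentation-sqlalchemy==0.60b1"

  # prometheus-client is no longer used
  pip uninstall -y prometheus-client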
298  infrastructure/helm/deploy-signoz.sh  (new executable file)
@@ -0,0 +1,298 @@
#!/bin/bash

# ============================================================================
# SigNoz Deployment Script for Bakery IA
# ============================================================================
# This script deploys SigNoz monitoring stack using Helm
# Supports both development and production environments
# ============================================================================

set -e

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Function to display help
show_help() {
    echo "Usage: $0 [OPTIONS] ENVIRONMENT"
    echo ""
    echo "Deploy SigNoz monitoring stack for Bakery IA"
    echo ""
    echo "Arguments:
  ENVIRONMENT                Environment to deploy to (dev|prod)"
    echo ""
    echo "Options:
  -h, --help                 Show this help message
  -d, --dry-run              Dry run - show what would be done without actually deploying
  -u, --upgrade              Upgrade existing deployment
  -r, --remove               Remove/Uninstall SigNoz deployment
  -n, --namespace NAMESPACE  Specify namespace (default: signoz)"
    echo ""
    echo "Examples:
  $0 dev             # Deploy to development
  $0 prod            # Deploy to production
  $0 --upgrade prod  # Upgrade production deployment
  $0 --remove dev    # Remove development deployment"
}

# Parse command line arguments
DRY_RUN=false
UPGRADE=false
REMOVE=false
NAMESPACE="signoz"

while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help)
            show_help
            exit 0
            ;;
        -d|--dry-run)
            DRY_RUN=true
            shift
            ;;
        -u|--upgrade)
            UPGRADE=true
            shift
            ;;
        -r|--remove)
            REMOVE=true
            shift
            ;;
        -n|--namespace)
            NAMESPACE="$2"
            shift 2
            ;;
        dev|prod)
            ENVIRONMENT="$1"
            shift
            ;;
        *)
            echo "Unknown argument: $1"
            show_help
            exit 1
            ;;
    esac
done

# Validate environment
if [[ -z "$ENVIRONMENT" ]]; then
    echo "Error: Environment not specified. Use 'dev' or 'prod'."
    show_help
    exit 1
fi

if [[ "$ENVIRONMENT" != "dev" && "$ENVIRONMENT" != "prod" ]]; then
    echo "Error: Invalid environment. Use 'dev' or 'prod'."
    exit 1
fi

# Function to check if Helm is installed
check_helm() {
    if ! command -v helm &> /dev/null; then
        echo "${RED}Error: Helm is not installed. Please install Helm first.${NC}"
        echo "Installation instructions: https://helm.sh/docs/intro/install/"
        exit 1
    fi
}

# Function to check if kubectl is configured
check_kubectl() {
    if ! kubectl cluster-info &> /dev/null; then
        echo "${RED}Error: kubectl is not configured or cannot connect to cluster.${NC}"
        echo "Please ensure you have access to a Kubernetes cluster."
        exit 1
    fi
}

# Function to check if namespace exists, create if not
ensure_namespace() {
    if ! kubectl get namespace "$NAMESPACE" &> /dev/null; then
        echo "${BLUE}Creating namespace $NAMESPACE...${NC}"
        if [[ "$DRY_RUN" == true ]]; then
            echo "  (dry-run) Would create namespace $NAMESPACE"
        else
            kubectl create namespace "$NAMESPACE"
            echo "${GREEN}Namespace $NAMESPACE created.${NC}"
        fi
    else
        echo "${BLUE}Namespace $NAMESPACE already exists.${NC}"
    fi
}

# Function to deploy SigNoz
deploy_signoz() {
    local values_file="infrastructure/helm/signoz-values-$ENVIRONMENT.yaml"

    if [[ ! -f "$values_file" ]]; then
        echo "${RED}Error: Values file $values_file not found.${NC}"
        exit 1
    fi

    echo "${BLUE}Deploying SigNoz to $ENVIRONMENT environment...${NC}"
    echo "  Using values file: $values_file"
    echo "  Target namespace: $NAMESPACE"

    if [[ "$DRY_RUN" == true ]]; then
        echo "  (dry-run) Would deploy SigNoz with:"
        echo "    helm install signoz signoz/signoz -n $NAMESPACE -f $values_file"
        return
    fi

    # Use upgrade --install to handle both new installations and upgrades
    echo "${BLUE}Installing/Upgrading SigNoz...${NC}"
    helm upgrade --install signoz signoz/signoz -n "$NAMESPACE" -f "$values_file"

    echo "${GREEN}SigNoz deployment initiated.${NC}"
    echo "Waiting for pods to become ready..."

    # Wait for deployment to complete
    wait_for_deployment
}

# Function to remove SigNoz
remove_signoz() {
    echo "${BLUE}Removing SigNoz deployment from namespace $NAMESPACE...${NC}"

    if [[ "$DRY_RUN" == true ]]; then
        echo "  (dry-run) Would remove SigNoz deployment"
        return
    fi

    if helm list -n "$NAMESPACE" | grep -q signoz; then
        helm uninstall signoz -n "$NAMESPACE"
        echo "${GREEN}SigNoz deployment removed.${NC}"
    else
        echo "${YELLOW}No SigNoz deployment found in namespace $NAMESPACE.${NC}"
    fi
}

# Function to wait for deployment to complete
wait_for_deployment() {
    echo "${BLUE}Waiting for SigNoz pods to become ready...${NC}"

    # Wait for pods to be ready
    local timeout=600 # 10 minutes
    local start_time=$(date +%s)

    while true; do
        local current_time=$(date +%s)
        local elapsed=$((current_time - start_time))

        if [[ $elapsed -ge $timeout ]]; then
            echo "${RED}Timeout waiting for SigNoz pods to become ready.${NC}"
            break
        fi

        # Check pod status
        local ready_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz --field-selector=status.phase=Running 2>/dev/null | grep -c "Running" | tr -d '[:space:]' || echo "0")
        local total_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d '[:space:]' || echo "0")

        if [[ $ready_pods -eq 0 ]]; then
            echo "  Waiting for pods to start..."
        else
            echo "  $ready_pods/$total_pods pods are running"

            if [[ $ready_pods -eq $total_pods && $total_pods -gt 0 ]]; then
                echo "${GREEN}All SigNoz pods are running!${NC}"
                break
            fi
        fi

        sleep 10
    done

    # Show deployment status
    show_deployment_status
}

# Function to show deployment status
show_deployment_status() {
    echo ""
    echo "${BLUE}=== SigNoz Deployment Status ===${NC}"
    echo ""

    # Get pods
    echo "Pods:"
    kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    echo ""

    # Get services
    echo "Services:"
    kubectl get svc -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    echo ""

    # Get ingress
    echo "Ingress:"
    kubectl get ingress -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    echo ""

    # Show access information
    show_access_info
}

# Function to show access information
show_access_info() {
    echo "${BLUE}=== Access Information ===${NC}"

    if [[ "$ENVIRONMENT" == "dev" ]]; then
        echo "SigNoz UI: https://localhost/signoz"
        echo "SigNoz API: https://localhost/signoz-api"
        echo ""
        echo "OpenTelemetry Collector Endpoints:"
        echo "  gRPC: localhost:4317"
        echo "  HTTP: localhost:4318"
        echo "  Metrics: localhost:8888"
    else
        echo "SigNoz UI: https://monitoring.bakewise.ai/signoz"
        echo "SigNoz API: https://monitoring.bakewise.ai/signoz-api"
        echo "SigNoz Alerts: https://monitoring.bakewise.ai/signoz-alerts"
        echo ""
        echo "OpenTelemetry Collector Endpoints:"
        echo "  gRPC: monitoring.bakewise.ai:4317"
        echo "  HTTP: monitoring.bakewise.ai:4318"
    fi

    echo ""
    echo "Default credentials:"
    echo "  Username: admin"
    echo "  Password: admin"
    echo ""
}

# Main execution
main() {
    echo "${BLUE}"
    echo "=========================================="
    echo "🚀 SigNoz Deployment for Bakery IA"
    echo "=========================================="
    echo "${NC}"

    # Check prerequisites
    check_helm
    check_kubectl

    # Ensure namespace
    ensure_namespace

    if [[ "$REMOVE" == true ]]; then
        remove_signoz
        exit 0
    fi

    # Deploy SigNoz
    deploy_signoz

    echo "${GREEN}"
    echo "=========================================="
    echo "✅ SigNoz deployment completed!"
    echo "=========================================="
    echo "${NC}"
}

# Run main function
main
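Once the collector from this deployment is reachable, application services can be pointed at it with the standard OpenTelemetry environment variables; a minimal sketch (the in-cluster collector service name and the example service name are assumptions, check kubectl get svc -n signoz for the real name):

  # Sketch: wire a service's OTLP exporter to the deployed collector
  export OTEL_SERVICE_NAME="orders-service"                                      # hypothetical service name
  export OTEL_EXPORTER_OTLP_ENDPOINT="http://signoz-otel-collector.signoz:4317"  # assumed service name, gRPC port 4317
  export OTEL_EXPORTER_OTLP_PROTOCOL="grpc"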
@@ -6,7 +6,10 @@
global:
  storageClass: "standard"
  domain: "localhost"
  domain: "monitoring.bakery-ia.local"
  # Docker Hub credentials for pulling images
  imagePullSecrets:
    - name: dockerhub-creds

# Frontend Configuration
frontend:
@@ -27,7 +30,7 @@ frontend:
      nginx.ingress.kubernetes.io/rewrite-target: /$2
      nginx.ingress.kubernetes.io/use-regex: "true"
    hosts:
      - host: localhost
      - host: monitoring.bakery-ia.local
        paths:
          - path: /signoz(/|$)(.*)
            pathType: ImplementationSpecific
@@ -35,8 +38,8 @@ frontend:
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
      cpu: 25m # Reduced for local dev
      memory: 64Mi # Reduced for local dev
    limits:
      cpu: 200m
      memory: 256Mi
@@ -44,6 +47,8 @@ frontend:
  env:
    - name: FRONTEND_REFRESH_INTERVAL
      value: "30000"
    - name: BASE_URL
      value: "https://monitoring.bakery-ia.local/signoz"

# Query Service Configuration
queryService:
@@ -59,8 +64,8 @@ queryService:
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
      cpu: 50m # Reduced for local dev
      memory: 128Mi # Reduced for local dev
    limits:
      cpu: 500m
      memory: 512Mi
@@ -90,8 +95,8 @@ alertmanager:
  resources:
    requests:
      cpu: 50m
      memory: 128Mi
      cpu: 25m # Reduced for local dev
      memory: 64Mi # Reduced for local dev
    limits:
      cpu: 200m
      memory: 256Mi
@@ -115,76 +120,59 @@ alertmanager:
    # Add email, slack, webhook configs here

# ClickHouse Configuration - Time Series Database
# Minimal resources for local development on constrained Kind cluster
clickhouse:
  replicaCount: 1
  image:
    repository: clickhouse/clickhouse-server
    tag: 24.1.2-alpine
    pullPolicy: IfNotPresent
  enabled: true
  installCustomStorageClass: false

  service:
    type: ClusterIP
    httpPort: 8123
    tcpPort: 9000
# Reduce ClickHouse resource requests for local dev
clickhouse:
  resources:
    requests:
      cpu: 200m # Reduced from default 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

  persistence:
    enabled: true
    size: 10Gi
    storageClass: "standard"

  # ClickHouse configuration
  config:
    logger:
      level: information
    max_connections: 1024
    max_concurrent_queries: 100
  # Data retention (7 days for dev)
  merge_tree:
    parts_to_delay_insert: 150
    parts_to_throw_insert: 300

# OpenTelemetry Collector - Integrated with SigNoz
# OpenTelemetry Collector - Data ingestion endpoint for all telemetry
otelCollector:
  enabled: true
  replicaCount: 1
  image:
    repository: signoz/signoz-otel-collector
    tag: 0.102.8
    pullPolicy: IfNotPresent

  # Service configuration - expose both gRPC and HTTP endpoints
  service:
    type: ClusterIP
    ports:
      otlpGrpc: 4317
      otlpHttp: 4318
      metrics: 8888
      healthCheck: 13133
      # gRPC receivers
      - name: otlp-grpc
        port: 4317
        targetPort: 4317
        protocol: TCP
      # HTTP receivers
      - name: otlp-http
        port: 4318
        targetPort: 4318
        protocol: TCP
      # Prometheus remote write
      - name: prometheus
        port: 8889
        targetPort: 8889
        protocol: TCP

  resources:
    requests:
      cpu: 100m
      memory: 256Mi
      cpu: 50m # Reduced from 100m
      memory: 128Mi # Reduced from 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

  # Full OTEL Collector Configuration
  # OpenTelemetry Collector configuration
  config:
    extensions:
      health_check:
        endpoint: 0.0.0.0:13133
      zpages:
        endpoint: 0.0.0.0:55679

    receivers:
      # OTLP receivers for traces, metrics, and logs from applications
      # All application telemetry is pushed via OTLP protocol
      otlp:
        protocols:
          grpc:
@@ -193,105 +181,119 @@ otelCollector:
            endpoint: 0.0.0.0:4318
            cors:
              allowed_origins:
                - "http://localhost"
                - "https://localhost"
                - "*"

      # Prometheus receiver for scraping metrics
      prometheus:
        config:
          scrape_configs:
            - job_name: 'otel-collector'
              scrape_interval: 30s
              static_configs:
                - targets: ['localhost:8888']
      # PostgreSQL receivers for database metrics
      # Collects metrics directly from PostgreSQL databases
      postgresql/auth:
        endpoint: auth-db-service.bakery-ia:5432
        username: ${POSTGRES_MONITOR_USER}
        password: ${POSTGRES_MONITOR_PASSWORD}
        databases:
          - auth_db
        collection_interval: 60s
        tls:
          insecure: false

      postgresql/inventory:
        endpoint: inventory-db-service.bakery-ia:5432
        username: ${POSTGRES_MONITOR_USER}
        password: ${POSTGRES_MONITOR_PASSWORD}
        databases:
          - inventory_db
        collection_interval: 60s
        tls:
          insecure: false

      postgresql/orders:
        endpoint: orders-db-service.bakery-ia:5432
        username: ${POSTGRES_MONITOR_USER}
        password: ${POSTGRES_MONITOR_PASSWORD}
        databases:
          - orders_db
        collection_interval: 60s
        tls:
          insecure: false

      # Add more PostgreSQL databases as needed
      # postgresql/SERVICE:
      #   endpoint: SERVICE-db-service.bakery-ia:5432
      #   ...

      # Redis receiver for cache metrics
      redis:
        endpoint: redis-service.bakery-ia:6379
        password: ${REDIS_PASSWORD}
        collection_interval: 60s
        tls:
          insecure: false
          cert_file: /etc/redis-tls/redis-cert.pem
          key_file: /etc/redis-tls/redis-key.pem
          ca_file: /etc/redis-tls/ca-cert.pem

      # RabbitMQ receiver via management API
      rabbitmq:
        endpoint: http://rabbitmq-service.bakery-ia:15672
        username: ${RABBITMQ_USER}
        password: ${RABBITMQ_PASSWORD}
        collection_interval: 60s

    processors:
      # Batch processor for better performance
      batch:
        timeout: 10s
        send_batch_size: 1024

      # Memory limiter to prevent OOM
      memory_limiter:
        check_interval: 1s
        limit_mib: 400
        spike_limit_mib: 100

      # Resource detection for K8s
      # Resource detection
      resourcedetection:
        detectors: [env, system, docker]
        detectors: [env, system]
        timeout: 5s

      # Add resource attributes
      resource:
        attributes:
          - key: deployment.environment
            value: development
            action: upsert

    exporters:
      # Export to SigNoz ClickHouse
      # ClickHouse exporter for traces
      clickhousetraces:
        datasource: tcp://clickhouse:9000/?database=signoz_traces
        datasource: tcp://signoz-clickhouse:9000/?database=signoz_traces
        timeout: 10s

      # ClickHouse exporter for metrics
      clickhousemetricswrite:
        endpoint: tcp://clickhouse:9000/?database=signoz_metrics
        endpoint: tcp://signoz-clickhouse:9000/?database=signoz_metrics
        timeout: 10s

      # ClickHouse exporter for logs
      clickhouselogsexporter:
        dsn: tcp://clickhouse:9000/?database=signoz_logs
        dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs
        timeout: 10s

      # Debug logging
      # Logging exporter for debugging (optional)
      logging:
        loglevel: info
        sampling_initial: 5
        sampling_thereafter: 200

    service:
      extensions: [health_check, zpages]
      pipelines:
        # Traces pipeline
        traces:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhousetraces, logging]
          processors: [memory_limiter, batch, resourcedetection]
          exporters: [clickhousetraces]

        # Metrics pipeline
        metrics:
          receivers: [otlp, prometheus]
          processors: [memory_limiter, batch, resourcedetection, resource]
          receivers: [otlp, postgresql/auth, postgresql/inventory, postgresql/orders, redis, rabbitmq]
          processors: [memory_limiter, batch, resourcedetection]
          exporters: [clickhousemetricswrite]

        # Logs pipeline
        logs:
          receivers: [otlp]
          processors: [memory_limiter, batch, resourcedetection, resource]
          exporters: [clickhouselogsexporter, logging]

# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:
  enabled: true
  mode: deployment

# Node Exporter for infrastructure metrics (optional)
nodeExporter:
  enabled: true
  service:
    type: ClusterIP
    port: 9100

  resources:
    requests:
      cpu: 50m
      memory: 64Mi
    limits:
      cpu: 100m
      memory: 128Mi

# Schemamanager - Manages ClickHouse schema
schemamanager:
  enabled: true
  image:
    repository: signoz/signoz-schema-migrator
    tag: 0.52.3
    pullPolicy: IfNotPresent
          processors: [memory_limiter, batch, resourcedetection]
          exporters: [clickhouselogsexporter]

# Additional Configuration
serviceAccount:
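The receiver credentials above (${POSTGRES_MONITOR_USER}, ${POSTGRES_MONITOR_PASSWORD}, ${REDIS_PASSWORD}, ${RABBITMQ_USER}, ${RABBITMQ_PASSWORD}) are read from the collector's environment. One way to hold them is a Kubernetes secret in the signoz namespace; a sketch only, since the secret name and the chart-specific wiring that injects it into the collector pod are assumptions not shown in this values file:

  # Sketch: store monitoring credentials in a secret (names and values are placeholders)
  kubectl -n signoz create secret generic signoz-collector-credentials \
    --from-literal=POSTGRES_MONITOR_USER=monitor \
    --from-literal=POSTGRES_MONITOR_PASSWORD='<password>' \
    --from-literal=REDIS_PASSWORD='<password>' \
    --from-literal=RABBITMQ_USER=monitoring \
    --from-literal=RABBITMQ_PASSWORD='<password>'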
394  infrastructure/helm/verify-signoz.sh  (new executable file)
@@ -0,0 +1,394 @@
#!/bin/bash

# ============================================================================
# SigNoz Verification Script for Bakery IA
# ============================================================================
# This script verifies that SigNoz is properly deployed and functioning
# ============================================================================

set -e

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Function to display help
show_help() {
    echo "Usage: $0 [OPTIONS] ENVIRONMENT"
    echo ""
    echo "Verify SigNoz deployment for Bakery IA"
    echo ""
    echo "Arguments:
  ENVIRONMENT                Environment to verify (dev|prod)"
    echo ""
    echo "Options:
  -h, --help                 Show this help message
  -n, --namespace NAMESPACE  Specify namespace (default: signoz)"
    echo ""
    echo "Examples:
  $0 dev                         # Verify development deployment
  $0 prod                        # Verify production deployment
  $0 --namespace monitoring dev  # Verify with custom namespace"
}

# Parse command line arguments
NAMESPACE="signoz"

while [[ $# -gt 0 ]]; do
    case $1 in
        -h|--help)
            show_help
            exit 0
            ;;
        -n|--namespace)
            NAMESPACE="$2"
            shift 2
            ;;
        dev|prod)
            ENVIRONMENT="$1"
            shift
            ;;
        *)
            echo "Unknown argument: $1"
            show_help
            exit 1
            ;;
    esac
done

# Validate environment
if [[ -z "$ENVIRONMENT" ]]; then
    echo "Error: Environment not specified. Use 'dev' or 'prod'."
    show_help
    exit 1
fi

if [[ "$ENVIRONMENT" != "dev" && "$ENVIRONMENT" != "prod" ]]; then
    echo "Error: Invalid environment. Use 'dev' or 'prod'."
    exit 1
fi

# Function to check if kubectl is configured
check_kubectl() {
    if ! kubectl cluster-info &> /dev/null; then
        echo "${RED}Error: kubectl is not configured or cannot connect to cluster.${NC}"
        echo "Please ensure you have access to a Kubernetes cluster."
        exit 1
    fi
}

# Function to check namespace exists
check_namespace() {
    if ! kubectl get namespace "$NAMESPACE" &> /dev/null; then
        echo "${RED}Error: Namespace $NAMESPACE does not exist.${NC}"
        echo "Please deploy SigNoz first using: ./deploy-signoz.sh $ENVIRONMENT"
        exit 1
    fi
}

# Function to verify SigNoz deployment
verify_deployment() {
    echo "${BLUE}"
    echo "=========================================="
    echo "🔍 Verifying SigNoz Deployment"
    echo "=========================================="
    echo "Environment: $ENVIRONMENT"
    echo "Namespace: $NAMESPACE"
    echo "${NC}"
    echo ""

    # Check if SigNoz helm release exists
    echo "${BLUE}1. Checking Helm release...${NC}"
    if helm list -n "$NAMESPACE" | grep -q signoz; then
        echo "${GREEN}✅ SigNoz Helm release found${NC}"
    else
        echo "${RED}❌ SigNoz Helm release not found${NC}"
        echo "Please deploy SigNoz first using: ./deploy-signoz.sh $ENVIRONMENT"
        exit 1
    fi
    echo ""

    # Check pod status
    echo "${BLUE}2. Checking pod status...${NC}"
    local total_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d ' ' || echo "0")
    local running_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz --field-selector=status.phase=Running 2>/dev/null | grep -c "Running" || echo "0")
    local ready_pods=$(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep "Running" | grep "1/1" | wc -l | tr -d ' ' || echo "0")

    echo "Total pods: $total_pods"
    echo "Running pods: $running_pods"
    echo "Ready pods: $ready_pods"

    if [[ $total_pods -eq 0 ]]; then
        echo "${RED}❌ No SigNoz pods found${NC}"
        exit 1
    fi

    if [[ $running_pods -eq $total_pods ]]; then
        echo "${GREEN}✅ All pods are running${NC}"
    else
        echo "${YELLOW}⚠️ Some pods are not running${NC}"
    fi

    if [[ $ready_pods -eq $total_pods ]]; then
        echo "${GREEN}✅ All pods are ready${NC}"
    else
        echo "${YELLOW}⚠️ Some pods are not ready${NC}"
    fi
    echo ""

    # Show pod details
    echo "${BLUE}Pod Details:${NC}"
    kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    echo ""

    # Check services
    echo "${BLUE}3. Checking services...${NC}"
    local service_count=$(kubectl get svc -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d ' ' || echo "0")

    if [[ $service_count -gt 0 ]]; then
        echo "${GREEN}✅ Services found ($service_count services)${NC}"
        kubectl get svc -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    else
        echo "${RED}❌ No services found${NC}"
    fi
    echo ""

    # Check ingress
    echo "${BLUE}4. Checking ingress...${NC}"
    local ingress_count=$(kubectl get ingress -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d ' ' || echo "0")

    if [[ $ingress_count -gt 0 ]]; then
        echo "${GREEN}✅ Ingress found ($ingress_count ingress resources)${NC}"
        kubectl get ingress -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    else
        echo "${YELLOW}⚠️ No ingress found (may be configured in main namespace)${NC}"
    fi
    echo ""

    # Check PVCs
    echo "${BLUE}5. Checking persistent volume claims...${NC}"
    local pvc_count=$(kubectl get pvc -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz 2>/dev/null | grep -v "NAME" | wc -l | tr -d ' ' || echo "0")

    if [[ $pvc_count -gt 0 ]]; then
        echo "${GREEN}✅ PVCs found ($pvc_count PVCs)${NC}"
        kubectl get pvc -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    else
        echo "${YELLOW}⚠️ No PVCs found (may not be required for all components)${NC}"
    fi
    echo ""

    # Check resource usage
    echo "${BLUE}6. Checking resource usage...${NC}"
    if command -v kubectl &> /dev/null && kubectl top pods -n "$NAMESPACE" &> /dev/null; then
        echo "${GREEN}✅ Resource usage:${NC}"
        kubectl top pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz
    else
        echo "${YELLOW}⚠️ Metrics server not available or no resource usage data${NC}"
    fi
    echo ""

    # Check logs for errors
    echo "${BLUE}7. Checking for errors in logs...${NC}"
    local error_found=false

    # Check each pod for errors
    while IFS= read -r pod; do
        if [[ -n "$pod" ]]; then
            local pod_errors=$(kubectl logs -n "$NAMESPACE" "$pod" 2>/dev/null | grep -i "error\|exception\|fail\|crash" | wc -l || echo "0")
            if [[ $pod_errors -gt 0 ]]; then
                echo "${RED}❌ Errors found in pod $pod ($pod_errors errors)${NC}"
                error_found=true
            fi
        fi
    done < <(kubectl get pods -n "$NAMESPACE" -l app.kubernetes.io/instance=signoz -o name | sed 's|pod/||')

    if [[ "$error_found" == false ]]; then
        echo "${GREEN}✅ No errors found in logs${NC}"
    fi
    echo ""

    # Environment-specific checks
    if [[ "$ENVIRONMENT" == "dev" ]]; then
        verify_dev_specific
    else
        verify_prod_specific
    fi

    # Show access information
    show_access_info
}

# Function for development-specific verification
verify_dev_specific() {
    echo "${BLUE}8. Development-specific checks...${NC}"

    # Check if localhost ingress is configured
    if kubectl get ingress -n "$NAMESPACE" | grep -q "localhost"; then
        echo "${GREEN}✅ Localhost ingress configured${NC}"
    else
        echo "${YELLOW}⚠️ Localhost ingress not found${NC}"
    fi

    # Check resource limits (should be lower for dev)
    local query_service=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "")
    if [[ -n "$query_service" && "$query_service" == "512Mi" ]]; then
        echo "${GREEN}✅ Development resource limits applied${NC}"
    else
        echo "${YELLOW}⚠️ Resource limits may not be optimized for development${NC}"
    fi
    echo ""
}

# Function for production-specific verification
verify_prod_specific() {
    echo "${BLUE}8. Production-specific checks...${NC}"

    # Check if TLS is configured
    if kubectl get ingress -n "$NAMESPACE" | grep -q "signoz-tls-cert"; then
        echo "${GREEN}✅ TLS certificate configured${NC}"
    else
        echo "${YELLOW}⚠️ TLS certificate not found${NC}"
    fi

    # Check if multiple replicas are running
    local query_replicas=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
    if [[ $query_replicas -gt 1 ]]; then
        echo "${GREEN}✅ High availability configured ($query_replicas replicas)${NC}"
    else
        echo "${YELLOW}⚠️ Single replica detected (not highly available)${NC}"
    fi

    # Check resource limits (should be higher for prod)
    local query_service=$(kubectl get deployment -n "$NAMESPACE" signoz-query-service -o jsonpath='{.spec.template.spec.containers[0].resources.limits.memory}' 2>/dev/null || echo "")
    if [[ -n "$query_service" && "$query_service" == "2Gi" ]]; then
        echo "${GREEN}✅ Production resource limits applied${NC}"
    else
        echo "${YELLOW}⚠️ Resource limits may not be optimized for production${NC}"
    fi
    echo ""
}

# Function to show access information
show_access_info() {
    echo "${BLUE}"
    echo "=========================================="
    echo "📋 Access Information"
    echo "=========================================="
    echo "${NC}"

    if [[ "$ENVIRONMENT" == "dev" ]]; then
        echo "SigNoz UI: https://localhost/signoz"
        echo "SigNoz API: https://localhost/signoz-api"
        echo ""
        echo "OpenTelemetry Collector:"
        echo "  gRPC: localhost:4317"
        echo "  HTTP: localhost:4318"
        echo "  Metrics: localhost:8888"
    else
        echo "SigNoz UI: https://monitoring.bakewise.ai/signoz"
        echo "SigNoz API: https://monitoring.bakewise.ai/signoz-api"
        echo "SigNoz Alerts: https://monitoring.bakewise.ai/signoz-alerts"
        echo ""
        echo "OpenTelemetry Collector:"
        echo "  gRPC: monitoring.bakewise.ai:4317"
        echo "  HTTP: monitoring.bakewise.ai:4318"
    fi

    echo ""
    echo "Default Credentials:"
    echo "  Username: admin"
    echo "  Password: admin"
    echo ""

    # Show connection test commands
    echo "Connection Test Commands:"
    if [[ "$ENVIRONMENT" == "dev" ]]; then
        echo "  curl -k https://localhost/signoz"
        echo "  curl -k https://localhost/signoz-api/health"
    else
        echo "  curl https://monitoring.bakewise.ai/signoz"
        echo "  curl https://monitoring.bakewise.ai/signoz-api/health"
    fi
    echo ""
}

# Function to run connectivity tests
run_connectivity_tests() {
    echo "${BLUE}"
    echo "=========================================="
    echo "🔗 Running Connectivity Tests"
    echo "=========================================="
    echo "${NC}"

    if [[ "$ENVIRONMENT" == "dev" ]]; then
        # Test frontend
        echo "Testing SigNoz frontend..."
        if curl -k -s -o /dev/null -w "%{http_code}" https://localhost/signoz | grep -q "200\|302"; then
            echo "${GREEN}✅ Frontend accessible${NC}"
        else
            echo "${RED}❌ Frontend not accessible${NC}"
        fi

        # Test API
        echo "Testing SigNoz API..."
        if curl -k -s -o /dev/null -w "%{http_code}" https://localhost/signoz-api/health | grep -q "200"; then
            echo "${GREEN}✅ API accessible${NC}"
        else
            echo "${RED}❌ API not accessible${NC}"
        fi

        # Test OTEL collector
        echo "Testing OpenTelemetry collector..."
        if curl -s -o /dev/null -w "%{http_code}" http://localhost:8888/metrics | grep -q "200"; then
            echo "${GREEN}✅ OTEL collector accessible${NC}"
        else
            echo "${YELLOW}⚠️ OTEL collector not accessible (may not be exposed)${NC}"
        fi
    else
        echo "${YELLOW}⚠️ Production connectivity tests require valid DNS and TLS${NC}"
        echo "  Please ensure monitoring.bakewise.ai resolves to your cluster"
    fi
    echo ""
}

# Main execution
main() {
    echo "${BLUE}"
    echo "=========================================="
    echo "🔍 SigNoz Verification for Bakery IA"
    echo "=========================================="
    echo "${NC}"

    # Check prerequisites
    check_kubectl
    check_namespace

    # Verify deployment
    verify_deployment

    # Run connectivity tests
    run_connectivity_tests

    echo "${GREEN}"
    echo "=========================================="
    echo "✅ Verification Complete"
    echo "=========================================="
    echo "${NC}"

    echo "Summary:"
    echo "  Environment: $ENVIRONMENT"
    echo "  Namespace: $NAMESPACE"
    echo ""
    echo "Next Steps:"
    echo "  1. Access SigNoz UI and verify dashboards"
    echo "  2. Configure alert rules for your services"
    echo "  3. Instrument your applications with OpenTelemetry"
    echo "  4. Set up custom dashboards for key metrics"
    echo ""
}

# Run main function
main
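Beyond the checks above, a quick smoke test can confirm that the OTLP HTTP receiver accepts data in the dev environment; a sketch, assuming port 4318 is reachable on localhost as listed in the access information (an empty resourceSpans payload should be acknowledged with HTTP 200 when the receiver is healthy):

  # Sketch: smoke-test the OTLP HTTP traces endpoint
  curl -s -o /dev/null -w "%{http_code}\n" \
    -X POST http://localhost:4318/v1/traces \
    -H "Content-Type: application/json" \
    -d '{"resourceSpans":[]}'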