From fc26876eb015d7c8eb9afd3ea5ab801d21f37a8e Mon Sep 17 00:00:00 2001 From: Bakery Admin Date: Sat, 24 Jan 2026 20:14:19 +0100 Subject: [PATCH] Fix redis ssl issues 3 --- PRODUCTION_DEPLOYMENT_GUIDE.md | 72 ++++ Tiltfile | 87 ++--- infrastructure/monitoring/k8s-infra/README.md | 121 ++++++ .../k8s-infra/deploy-k8s-infra-monitoring.sh | 347 ++++++++++++++++++ .../k8s-infra/kube-state-metrics-values.yaml | 109 ++++++ .../k8s-infra/node-exporter-values.yaml | 97 +++++ .../monitoring/signoz/signoz-values-prod.yaml | 28 ++ shared/config/base.py | 7 - shared/redis_utils/client.py | 80 +++- 9 files changed, 884 insertions(+), 64 deletions(-) create mode 100644 infrastructure/monitoring/k8s-infra/README.md create mode 100755 infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh create mode 100644 infrastructure/monitoring/k8s-infra/kube-state-metrics-values.yaml create mode 100644 infrastructure/monitoring/k8s-infra/node-exporter-values.yaml diff --git a/PRODUCTION_DEPLOYMENT_GUIDE.md b/PRODUCTION_DEPLOYMENT_GUIDE.md index a970d5fc..08eb9d93 100644 --- a/PRODUCTION_DEPLOYMENT_GUIDE.md +++ b/PRODUCTION_DEPLOYMENT_GUIDE.md @@ -31,6 +31,7 @@ - [Step 5.6: Verify Service Images](#step-56-verify-all-service-images-are-available) 9. [Phase 6: Deploy Application Services](#phase-6-deploy-application-services) 10. [Phase 7: Deploy Optional Services](#phase-7-deploy-optional-services) + - [Step 7.5: Deploy Kubernetes Infrastructure Monitoring](#step-75-deploy-kubernetes-infrastructure-monitoring-required-for-signoz-infrastructure-view) 11. [Phase 8: Verification & Validation](#phase-8-verification--validation) 12. [Post-Deployment Operations](#post-deployment-operations) 13. [Troubleshooting Guide](#troubleshooting-guide) @@ -1385,6 +1386,77 @@ kubectl wait --for=condition=available --timeout=600s deployment/signoz-frontend kubectl get pods -n bakery-ia -l app.kubernetes.io/instance=signoz ``` +### Step 7.5: Deploy Kubernetes Infrastructure Monitoring (Required for SigNoz Infrastructure View) + +> **Purpose:** Deploy kube-state-metrics and node-exporter to enable Kubernetes infrastructure metrics in SigNoz. Without these components, the SigNoz Infrastructure section will be empty. + +**Components Deployed:** + +| Component | Purpose | Metrics | +|-----------|---------|---------| +| **kube-state-metrics** | Kubernetes object metrics | Pods, Deployments, Nodes, PVCs, etc. 
| +| **node-exporter** | Host-level metrics | CPU, Memory, Disk, Network | + +**Deploy using the automated script:** + +```bash +# Navigate to the k8s-infra monitoring directory +cd /root/bakery-ia + +# Make the script executable (if not already) +chmod +x infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh + +# Deploy kube-state-metrics and node-exporter +./infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh --microk8s install +``` + +**Upgrade SigNoz to scrape the new metrics:** + +```bash +# The signoz-values-prod.yaml already includes the Prometheus receiver configuration +# Upgrade SigNoz to apply the scraping configuration +microk8s helm3 upgrade signoz signoz/signoz \ + -n bakery-ia \ + -f infrastructure/monitoring/signoz/signoz-values-prod.yaml +``` + +**Verify deployment:** + +```bash +# Check pods are running +microk8s kubectl get pods -n bakery-ia | grep -E "(kube-state|node-exporter)" + +# Expected output: +# kube-state-metrics-xxxxxxxxxx-xxxxx 1/1 Running 0 1m +# node-exporter-prometheus-node-exporter-xxxxx 1/1 Running 0 1m + +# Check status +./infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh --microk8s status +``` + +**Verify metrics in SigNoz:** + +After a few minutes, you should see: +- **Infrastructure → Kubernetes**: Pod status, deployments, nodes, PVCs +- **Infrastructure → Hosts**: CPU, memory, disk, network usage + +**Troubleshooting:** + +```bash +# Check if metrics are being scraped +microk8s kubectl port-forward svc/kube-state-metrics 8080:8080 -n bakery-ia & +curl localhost:8080/metrics | head -20 + +# Check OTel Collector logs for scraping errors +microk8s kubectl logs -l app.kubernetes.io/name=signoz-otel-collector -n bakery-ia --tail=50 +``` + +> **Files Location:** +> - Helm values: `infrastructure/monitoring/k8s-infra/kube-state-metrics-values.yaml` +> - Helm values: `infrastructure/monitoring/k8s-infra/node-exporter-values.yaml` +> - Deploy script: `infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh` +> - Documentation: `infrastructure/monitoring/k8s-infra/README.md` + --- ## Phase 8: Verification & Validation diff --git a/Tiltfile b/Tiltfile index 333e31c3..78d77155 100644 --- a/Tiltfile +++ b/Tiltfile @@ -46,6 +46,10 @@ if use_dockerhub: base_registry = 'docker.io' python_image = 'python:3.11-slim' +# Git commit hash for migration job names (extracted from manifest to match CI/CD updates) +# We read from a manifest file rather than git HEAD because CI/CD commits may not be checked out locally +git_commit_short = str(local("sed -n 's/.*name: auth-migration-\\([a-f0-9]*\\).*/\\1/p' infrastructure/services/microservices/auth/migrations/auth-migration-job.yaml | head -1", quiet=True)).strip() + # ============================================================================= # PREPULL BASE IMAGES - RUNS AFTER SECURITY SETUP @@ -1189,79 +1193,80 @@ k8s_resource('demo-session-db', resource_deps=['security-setup'], labels=['06-da # ============================================================================= # MIGRATION JOBS # ============================================================================= +# Migration job names include git commit hash (set by CI/CD in manifests) # Core Service Migrations -k8s_resource('auth-migration', resource_deps=['auth-db'], labels=['07-migrations']) -k8s_resource('tenant-migration', resource_deps=['tenant-db'], labels=['07-migrations']) +k8s_resource('auth-migration-' + git_commit_short, resource_deps=['auth-db'], labels=['07-migrations']) 
+k8s_resource('tenant-migration-' + git_commit_short, resource_deps=['tenant-db'], labels=['07-migrations']) # Data & Analytics Migrations -k8s_resource('training-migration', resource_deps=['training-db'], labels=['07-migrations']) -k8s_resource('forecasting-migration', resource_deps=['forecasting-db'], labels=['07-migrations']) -k8s_resource('ai-insights-migration', resource_deps=['ai-insights-db'], labels=['07-migrations']) +k8s_resource('training-migration-' + git_commit_short, resource_deps=['training-db'], labels=['07-migrations']) +k8s_resource('forecasting-migration-' + git_commit_short, resource_deps=['forecasting-db'], labels=['07-migrations']) +k8s_resource('ai-insights-migration-' + git_commit_short, resource_deps=['ai-insights-db'], labels=['07-migrations']) # Operations Migrations -k8s_resource('sales-migration', resource_deps=['sales-db'], labels=['07-migrations']) -k8s_resource('inventory-migration', resource_deps=['inventory-db'], labels=['07-migrations']) -k8s_resource('production-migration', resource_deps=['production-db'], labels=['07-migrations']) -k8s_resource('procurement-migration', resource_deps=['procurement-db'], labels=['07-migrations']) -k8s_resource('distribution-migration', resource_deps=['distribution-db'], labels=['07-migrations']) +k8s_resource('sales-migration-' + git_commit_short, resource_deps=['sales-db'], labels=['07-migrations']) +k8s_resource('inventory-migration-' + git_commit_short, resource_deps=['inventory-db'], labels=['07-migrations']) +k8s_resource('production-migration-' + git_commit_short, resource_deps=['production-db'], labels=['07-migrations']) +k8s_resource('procurement-migration-' + git_commit_short, resource_deps=['procurement-db'], labels=['07-migrations']) +k8s_resource('distribution-migration-' + git_commit_short, resource_deps=['distribution-db'], labels=['07-migrations']) # Supporting Service Migrations -k8s_resource('recipes-migration', resource_deps=['recipes-db'], labels=['07-migrations']) -k8s_resource('suppliers-migration', resource_deps=['suppliers-db'], labels=['07-migrations']) -k8s_resource('pos-migration', resource_deps=['pos-db'], labels=['07-migrations']) -k8s_resource('orders-migration', resource_deps=['orders-db'], labels=['07-migrations']) -k8s_resource('external-migration', resource_deps=['external-db'], labels=['07-migrations']) +k8s_resource('recipes-migration-' + git_commit_short, resource_deps=['recipes-db'], labels=['07-migrations']) +k8s_resource('suppliers-migration-' + git_commit_short, resource_deps=['suppliers-db'], labels=['07-migrations']) +k8s_resource('pos-migration-' + git_commit_short, resource_deps=['pos-db'], labels=['07-migrations']) +k8s_resource('orders-migration-' + git_commit_short, resource_deps=['orders-db'], labels=['07-migrations']) +k8s_resource('external-migration-' + git_commit_short, resource_deps=['external-db'], labels=['07-migrations']) # Platform Service Migrations -k8s_resource('notification-migration', resource_deps=['notification-db'], labels=['07-migrations']) -k8s_resource('alert-processor-migration', resource_deps=['alert-processor-db'], labels=['07-migrations']) -k8s_resource('orchestrator-migration', resource_deps=['orchestrator-db'], labels=['07-migrations']) +k8s_resource('notification-migration-' + git_commit_short, resource_deps=['notification-db'], labels=['07-migrations']) +k8s_resource('alert-processor-migration-' + git_commit_short, resource_deps=['alert-processor-db'], labels=['07-migrations']) +k8s_resource('orchestrator-migration-' + git_commit_short, 
resource_deps=['orchestrator-db'], labels=['07-migrations']) # Demo Service Migrations -k8s_resource('demo-session-migration', resource_deps=['demo-session-db'], labels=['07-migrations']) +k8s_resource('demo-session-migration-' + git_commit_short, resource_deps=['demo-session-db'], labels=['07-migrations']) # ============================================================================= # DATA INITIALIZATION JOBS # ============================================================================= -k8s_resource('external-data-init', resource_deps=['external-migration', 'redis'], labels=['08-data-init']) +k8s_resource('external-data-init-' + git_commit_short, resource_deps=['external-migration-' + git_commit_short, 'redis'], labels=['08-data-init']) # ============================================================================= # APPLICATION SERVICES # ============================================================================= # Core Services -k8s_resource('auth-service', resource_deps=['auth-migration', 'redis'], labels=['09-services-core']) -k8s_resource('tenant-service', resource_deps=['tenant-migration', 'redis'], labels=['09-services-core']) +k8s_resource('auth-service', resource_deps=['auth-migration-' + git_commit_short, 'redis'], labels=['09-services-core']) +k8s_resource('tenant-service', resource_deps=['tenant-migration-' + git_commit_short, 'redis'], labels=['09-services-core']) # Data & Analytics Services -k8s_resource('training-service', resource_deps=['training-migration', 'redis'], labels=['10-services-analytics']) -k8s_resource('forecasting-service', resource_deps=['forecasting-migration', 'redis'], labels=['10-services-analytics']) -k8s_resource('ai-insights-service', resource_deps=['ai-insights-migration', 'redis', 'forecasting-service', 'production-service', 'procurement-service'], labels=['10-services-analytics']) +k8s_resource('training-service', resource_deps=['training-migration-' + git_commit_short, 'redis'], labels=['10-services-analytics']) +k8s_resource('forecasting-service', resource_deps=['forecasting-migration-' + git_commit_short, 'redis'], labels=['10-services-analytics']) +k8s_resource('ai-insights-service', resource_deps=['ai-insights-migration-' + git_commit_short, 'redis', 'forecasting-service', 'production-service', 'procurement-service'], labels=['10-services-analytics']) # Operations Services -k8s_resource('sales-service', resource_deps=['sales-migration', 'redis'], labels=['11-services-operations']) -k8s_resource('inventory-service', resource_deps=['inventory-migration', 'redis'], labels=['11-services-operations']) -k8s_resource('production-service', resource_deps=['production-migration', 'redis'], labels=['11-services-operations']) -k8s_resource('procurement-service', resource_deps=['procurement-migration', 'redis'], labels=['11-services-operations']) -k8s_resource('distribution-service', resource_deps=['distribution-migration', 'redis', 'rabbitmq'], labels=['11-services-operations']) +k8s_resource('sales-service', resource_deps=['sales-migration-' + git_commit_short, 'redis'], labels=['11-services-operations']) +k8s_resource('inventory-service', resource_deps=['inventory-migration-' + git_commit_short, 'redis'], labels=['11-services-operations']) +k8s_resource('production-service', resource_deps=['production-migration-' + git_commit_short, 'redis'], labels=['11-services-operations']) +k8s_resource('procurement-service', resource_deps=['procurement-migration-' + git_commit_short, 'redis'], labels=['11-services-operations']) 
+k8s_resource('distribution-service', resource_deps=['distribution-migration-' + git_commit_short, 'redis', 'rabbitmq'], labels=['11-services-operations']) # Supporting Services -k8s_resource('recipes-service', resource_deps=['recipes-migration', 'redis'], labels=['12-services-supporting']) -k8s_resource('suppliers-service', resource_deps=['suppliers-migration', 'redis'], labels=['12-services-supporting']) -k8s_resource('pos-service', resource_deps=['pos-migration', 'redis'], labels=['12-services-supporting']) -k8s_resource('orders-service', resource_deps=['orders-migration', 'redis'], labels=['12-services-supporting']) -k8s_resource('external-service', resource_deps=['external-migration', 'external-data-init', 'redis'], labels=['12-services-supporting']) +k8s_resource('recipes-service', resource_deps=['recipes-migration-' + git_commit_short, 'redis'], labels=['12-services-supporting']) +k8s_resource('suppliers-service', resource_deps=['suppliers-migration-' + git_commit_short, 'redis'], labels=['12-services-supporting']) +k8s_resource('pos-service', resource_deps=['pos-migration-' + git_commit_short, 'redis'], labels=['12-services-supporting']) +k8s_resource('orders-service', resource_deps=['orders-migration-' + git_commit_short, 'redis'], labels=['12-services-supporting']) +k8s_resource('external-service', resource_deps=['external-migration-' + git_commit_short, 'external-data-init-' + git_commit_short, 'redis'], labels=['12-services-supporting']) # Platform Services -k8s_resource('notification-service', resource_deps=['notification-migration', 'redis', 'rabbitmq'], labels=['13-services-platform']) -k8s_resource('alert-processor', resource_deps=['alert-processor-migration', 'redis', 'rabbitmq'], labels=['13-services-platform']) -k8s_resource('orchestrator-service', resource_deps=['orchestrator-migration', 'redis'], labels=['13-services-platform']) +k8s_resource('notification-service', resource_deps=['notification-migration-' + git_commit_short, 'redis', 'rabbitmq'], labels=['13-services-platform']) +k8s_resource('alert-processor', resource_deps=['alert-processor-migration-' + git_commit_short, 'redis', 'rabbitmq'], labels=['13-services-platform']) +k8s_resource('orchestrator-service', resource_deps=['orchestrator-migration-' + git_commit_short, 'redis'], labels=['13-services-platform']) # Demo Services -k8s_resource('demo-session-service', resource_deps=['demo-session-migration', 'redis'], labels=['14-services-demo']) -k8s_resource('demo-cleanup-worker', resource_deps=['demo-session-service', 'redis'], labels=['14-services-demo']) +k8s_resource('demo-session-service', resource_deps=['demo-session-migration-' + git_commit_short, 'redis'], labels=['14-services-demo']) +k8s_resource('demo-cleanup-worker-' + git_commit_short, resource_deps=['demo-session-service', 'redis'], labels=['14-services-demo']) # ============================================================================= # FRONTEND & GATEWAY @@ -1275,7 +1280,7 @@ k8s_resource('frontend', resource_deps=['gateway'], labels=['15-frontend']) # ============================================================================= k8s_resource('demo-session-cleanup', resource_deps=['demo-session-service'], labels=['16-cronjobs']) -k8s_resource('external-data-rotation', resource_deps=['external-service'], labels=['16-cronjobs']) +k8s_resource('external-data-rotation-' + git_commit_short, resource_deps=['external-service'], labels=['16-cronjobs']) # ============================================================================= # WATCH SETTINGS diff 
--git a/infrastructure/monitoring/k8s-infra/README.md b/infrastructure/monitoring/k8s-infra/README.md new file mode 100644 index 00000000..ea277af1 --- /dev/null +++ b/infrastructure/monitoring/k8s-infra/README.md @@ -0,0 +1,121 @@ +# Kubernetes Infrastructure Monitoring + +This directory contains configurations for deploying Kubernetes infrastructure monitoring components that integrate with SigNoz. + +## Components + +| Component | Purpose | Metrics Endpoint | +|-----------|---------|------------------| +| **kube-state-metrics** | Kubernetes object metrics (pods, deployments, nodes, etc.) | `:8080/metrics` | +| **node-exporter** | Host-level metrics (CPU, memory, disk, network) | `:9100/metrics` | + +## Quick Start (MicroK8s Production) + +```bash +# 1. Deploy infrastructure monitoring components +./deploy-k8s-infra-monitoring.sh --microk8s install + +# 2. Upgrade SigNoz to scrape the new metrics +microk8s helm3 upgrade signoz signoz/signoz \ + -n bakery-ia \ + -f ../signoz/signoz-values-prod.yaml +``` + +## Usage + +### Install + +```bash +# Standard Kubernetes +./deploy-k8s-infra-monitoring.sh install + +# MicroK8s +./deploy-k8s-infra-monitoring.sh --microk8s install +``` + +### Upgrade + +```bash +./deploy-k8s-infra-monitoring.sh --microk8s upgrade +``` + +### Uninstall + +```bash +./deploy-k8s-infra-monitoring.sh --microk8s uninstall +``` + +### Check Status + +```bash +./deploy-k8s-infra-monitoring.sh --microk8s status +``` + +### Dry Run + +```bash +./deploy-k8s-infra-monitoring.sh --microk8s --dry-run install +``` + +## Files + +- `kube-state-metrics-values.yaml` - Helm values for kube-state-metrics +- `node-exporter-values.yaml` - Helm values for node-exporter +- `deploy-k8s-infra-monitoring.sh` - Deployment automation script + +## SigNoz Integration + +The SigNoz OTel Collector is configured (in `signoz-values-prod.yaml`) to scrape metrics from: + +- `kube-state-metrics.bakery-ia.svc.cluster.local:8080` +- `node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100` + +After deploying these components, metrics will appear in SigNoz under: +- **Infrastructure** > **Kubernetes** (for K8s object metrics) +- **Infrastructure** > **Hosts** (for node metrics) + +## Metrics Available + +### From kube-state-metrics + +- Pod status, phase, restarts +- Deployment replicas (desired vs available) +- Node conditions and capacity +- PVC status and capacity +- Resource requests and limits +- Job/CronJob status + +### From node-exporter + +- CPU usage per core +- Memory usage (total, free, cached) +- Disk I/O and space +- Network traffic (bytes in/out) +- System load average +- Filesystem usage + +## Troubleshooting + +### Check if metrics are being scraped + +```bash +# Port-forward to kube-state-metrics +microk8s kubectl port-forward svc/kube-state-metrics 8080:8080 -n bakery-ia & +curl localhost:8080/metrics | head -50 + +# Port-forward to node-exporter +microk8s kubectl port-forward svc/node-exporter-prometheus-node-exporter 9100:9100 -n bakery-ia & +curl localhost:9100/metrics | head -50 +``` + +### Check OTel Collector logs + +```bash +microk8s kubectl logs -l app.kubernetes.io/name=signoz-otel-collector -n bakery-ia --tail=100 +``` + +### Verify pods are running + +```bash +microk8s kubectl get pods -n bakery-ia | grep -E "(kube-state|node-exporter)" +``` diff --git a/infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh b/infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh new file mode 100755 index 00000000..d684c1d9 --- /dev/null +++ 
b/infrastructure/monitoring/k8s-infra/deploy-k8s-infra-monitoring.sh @@ -0,0 +1,347 @@ +#!/bin/bash + +# ============================================================================ +# Kubernetes Infrastructure Monitoring Deployment Script +# ============================================================================ +# Deploys kube-state-metrics and node-exporter for Kubernetes infrastructure +# monitoring in SigNoz +# ============================================================================ + +set -e + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +NAMESPACE="bakery-ia" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Function to display help +show_help() { + echo "Usage: $0 [OPTIONS] [COMMAND]" + echo "" + echo "Deploy Kubernetes infrastructure monitoring components" + echo "" + echo "Commands:" + echo " install Install kube-state-metrics and node-exporter (default)" + echo " upgrade Upgrade existing deployments" + echo " uninstall Remove all infrastructure monitoring components" + echo " status Show deployment status" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " -d, --dry-run Show what would be done without executing" + echo " -n, --namespace NS Specify namespace (default: bakery-ia)" + echo " --microk8s Use microk8s helm3 command (for MicroK8s clusters)" + echo "" + echo "Examples:" + echo " $0 install # Install on standard k8s" + echo " $0 --microk8s install # Install on MicroK8s" + echo " $0 --microk8s upgrade # Upgrade on MicroK8s" + echo " $0 --microk8s uninstall # Remove from MicroK8s" + echo " $0 status # Check deployment status" +} + +# Parse arguments +DRY_RUN=false +USE_MICROK8S=false +COMMAND="install" + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + -d|--dry-run) + DRY_RUN=true + shift + ;; + -n|--namespace) + NAMESPACE="$2" + shift 2 + ;; + --microk8s) + USE_MICROK8S=true + shift + ;; + install|upgrade|uninstall|status) + COMMAND="$1" + shift + ;; + *) + echo -e "${RED}Unknown argument: $1${NC}" + show_help + exit 1 + ;; + esac +done + +# Set helm and kubectl commands based on environment +if [[ "$USE_MICROK8S" == true ]]; then + HELM_CMD="microk8s helm3" + KUBECTL_CMD="microk8s kubectl" +else + HELM_CMD="helm" + KUBECTL_CMD="kubectl" +fi + +# Function to check prerequisites +check_prerequisites() { + echo -e "${BLUE}Checking prerequisites...${NC}" + + # Check helm + if [[ "$USE_MICROK8S" == true ]]; then + # Test if microk8s helm3 command works directly + if ! microk8s helm3 version &> /dev/null; then + echo -e "${RED}Error: MicroK8s helm3 addon is not working.${NC}" + echo "Enable it with: microk8s enable helm3" + exit 1 + fi + echo -e "${GREEN}MicroK8s helm3 is available.${NC}" + else + if ! command -v helm &> /dev/null; then + echo -e "${RED}Error: Helm is not installed.${NC}" + exit 1 + fi + fi + + # Check kubectl connectivity + if ! 
$KUBECTL_CMD cluster-info &> /dev/null; then + echo -e "${RED}Error: Cannot connect to Kubernetes cluster.${NC}" + exit 1 + fi + + echo -e "${GREEN}Prerequisites check passed.${NC}" + echo "" +} + +# Function to setup Helm repository +setup_helm_repo() { + echo -e "${BLUE}Setting up Prometheus Community Helm repository...${NC}" + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would add prometheus-community Helm repository" + return + fi + + if $HELM_CMD repo list 2>/dev/null | grep -q "prometheus-community"; then + echo -e "${BLUE}Repository already added, updating...${NC}" + $HELM_CMD repo update prometheus-community + else + $HELM_CMD repo add prometheus-community https://prometheus-community.github.io/helm-charts + $HELM_CMD repo update + fi + + echo -e "${GREEN}Helm repository ready.${NC}" + echo "" +} + +# Function to ensure namespace exists +ensure_namespace() { + echo -e "${BLUE}Ensuring namespace $NAMESPACE exists...${NC}" + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would create namespace if needed" + return + fi + + if ! $KUBECTL_CMD get namespace "$NAMESPACE" &> /dev/null; then + $KUBECTL_CMD create namespace "$NAMESPACE" + echo -e "${GREEN}Namespace $NAMESPACE created.${NC}" + else + echo -e "${BLUE}Namespace $NAMESPACE already exists.${NC}" + fi + echo "" +} + +# Function to install kube-state-metrics +install_kube_state_metrics() { + echo -e "${BLUE}Installing kube-state-metrics...${NC}" + + local values_file="$SCRIPT_DIR/kube-state-metrics-values.yaml" + + if [[ ! -f "$values_file" ]]; then + echo -e "${RED}Error: Values file not found: $values_file${NC}" + exit 1 + fi + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would install kube-state-metrics" + echo " Command: $HELM_CMD upgrade --install kube-state-metrics prometheus-community/kube-state-metrics -n $NAMESPACE -f $values_file" + return + fi + + $HELM_CMD upgrade --install kube-state-metrics \ + prometheus-community/kube-state-metrics \ + -n "$NAMESPACE" \ + -f "$values_file" \ + --wait \ + --timeout 5m + + echo -e "${GREEN}kube-state-metrics installed successfully.${NC}" + echo "" +} + +# Function to install node-exporter +install_node_exporter() { + echo -e "${BLUE}Installing node-exporter...${NC}" + + local values_file="$SCRIPT_DIR/node-exporter-values.yaml" + + if [[ ! 
-f "$values_file" ]]; then + echo -e "${RED}Error: Values file not found: $values_file${NC}" + exit 1 + fi + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would install node-exporter" + echo " Command: $HELM_CMD upgrade --install node-exporter prometheus-community/prometheus-node-exporter -n $NAMESPACE -f $values_file" + return + fi + + $HELM_CMD upgrade --install node-exporter \ + prometheus-community/prometheus-node-exporter \ + -n "$NAMESPACE" \ + -f "$values_file" \ + --wait \ + --timeout 5m + + echo -e "${GREEN}node-exporter installed successfully.${NC}" + echo "" +} + +# Function to uninstall components +uninstall_components() { + echo -e "${BLUE}Uninstalling Kubernetes infrastructure monitoring components...${NC}" + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would uninstall kube-state-metrics and node-exporter" + return + fi + + # Uninstall kube-state-metrics + if $HELM_CMD list -n "$NAMESPACE" | grep -q "kube-state-metrics"; then + echo -e "${BLUE}Removing kube-state-metrics...${NC}" + $HELM_CMD uninstall kube-state-metrics -n "$NAMESPACE" --wait + echo -e "${GREEN}kube-state-metrics removed.${NC}" + else + echo -e "${YELLOW}kube-state-metrics not found.${NC}" + fi + + # Uninstall node-exporter + if $HELM_CMD list -n "$NAMESPACE" | grep -q "node-exporter"; then + echo -e "${BLUE}Removing node-exporter...${NC}" + $HELM_CMD uninstall node-exporter -n "$NAMESPACE" --wait + echo -e "${GREEN}node-exporter removed.${NC}" + else + echo -e "${YELLOW}node-exporter not found.${NC}" + fi + + echo "" +} + +# Function to show deployment status +show_status() { + echo -e "${BLUE}=== Kubernetes Infrastructure Monitoring Status ===${NC}" + echo "" + + echo -e "${BLUE}Helm Releases:${NC}" + $HELM_CMD list -n "$NAMESPACE" | grep -E "(kube-state-metrics|node-exporter)" || echo " No releases found" + echo "" + + echo -e "${BLUE}Pods:${NC}" + $KUBECTL_CMD get pods -n "$NAMESPACE" -l 'app.kubernetes.io/name in (kube-state-metrics, prometheus-node-exporter)' 2>/dev/null || echo " No pods found" + echo "" + + echo -e "${BLUE}Services:${NC}" + $KUBECTL_CMD get svc -n "$NAMESPACE" | grep -E "(kube-state-metrics|node-exporter)" || echo " No services found" + echo "" + + echo -e "${BLUE}Endpoints (for SigNoz scraping):${NC}" + echo " kube-state-metrics: kube-state-metrics.$NAMESPACE.svc.cluster.local:8080" + echo " node-exporter: node-exporter-prometheus-node-exporter.$NAMESPACE.svc.cluster.local:9100" + echo "" +} + +# Function to show post-install instructions +show_post_install_instructions() { + echo -e "${BLUE}=== Post-Installation Instructions ===${NC}" + echo "" + echo "To enable SigNoz to scrape these metrics, update your SigNoz OTel Collector config." 
+ echo "" + echo "Add the following to your signoz-values-prod.yaml under otelCollector.config:" + echo "" + cat << 'EOF' +otelCollector: + config: + receivers: + prometheus: + config: + scrape_configs: + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.bakery-ia.svc.cluster.local:8080'] + scrape_interval: 30s + - job_name: 'node-exporter' + static_configs: + - targets: ['node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100'] + scrape_interval: 30s + service: + pipelines: + metrics: + receivers: [otlp, prometheus] +EOF + echo "" + echo "Then upgrade SigNoz:" + if [[ "$USE_MICROK8S" == true ]]; then + echo " microk8s helm3 upgrade signoz signoz/signoz -n $NAMESPACE -f infrastructure/monitoring/signoz/signoz-values-prod.yaml" + else + echo " helm upgrade signoz signoz/signoz -n $NAMESPACE -f infrastructure/monitoring/signoz/signoz-values-prod.yaml" + fi + echo "" +} + +# Main execution +main() { + echo -e "${BLUE}" + echo "==========================================" + echo "Kubernetes Infrastructure Monitoring" + echo "==========================================" + echo -e "${NC}" + + check_prerequisites + + case $COMMAND in + install) + setup_helm_repo + ensure_namespace + install_kube_state_metrics + install_node_exporter + show_status + show_post_install_instructions + echo -e "${GREEN}Installation completed successfully!${NC}" + ;; + upgrade) + setup_helm_repo + install_kube_state_metrics + install_node_exporter + show_status + echo -e "${GREEN}Upgrade completed successfully!${NC}" + ;; + uninstall) + uninstall_components + echo -e "${GREEN}Uninstallation completed.${NC}" + ;; + status) + show_status + ;; + esac +} + +# Run main function +main diff --git a/infrastructure/monitoring/k8s-infra/kube-state-metrics-values.yaml b/infrastructure/monitoring/k8s-infra/kube-state-metrics-values.yaml new file mode 100644 index 00000000..fe301f4e --- /dev/null +++ b/infrastructure/monitoring/k8s-infra/kube-state-metrics-values.yaml @@ -0,0 +1,109 @@ +# Kube-State-Metrics Helm Values for Bakery IA +# Chart: prometheus-community/kube-state-metrics +# Documentation: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics +# +# Install Command: +# helm install kube-state-metrics prometheus-community/kube-state-metrics \ +# -n bakery-ia -f kube-state-metrics-values.yaml + +# Image configuration +image: + registry: registry.k8s.io + repository: kube-state-metrics/kube-state-metrics + tag: "" # Uses chart default (latest stable) + pullPolicy: IfNotPresent + +# Replicas - single instance is sufficient for most clusters +replicas: 1 + +# Resource limits optimized for MicroK8s VPS +resources: + requests: + cpu: 10m + memory: 32Mi + limits: + cpu: 100m + memory: 128Mi + +# Service configuration +service: + type: ClusterIP + port: 8080 + annotations: {} + +# Prometheus scrape annotations +prometheusScrape: true + +# Which Kubernetes resources to collect metrics for +# Full list available, but we focus on most useful ones +collectors: + - certificatesigningrequests + - configmaps + - cronjobs + - daemonsets + - deployments + - endpoints + - horizontalpodautoscalers + - ingresses + - jobs + - leases + - limitranges + - namespaces + - networkpolicies + - nodes + - persistentvolumeclaims + - persistentvolumes + - poddisruptionbudgets + - pods + - replicasets + - replicationcontrollers + - resourcequotas + - secrets + - services + - statefulsets + - storageclasses + +# Namespace to watch (empty = all namespaces) +namespaces: "" + 
+# Node selector for scheduling +nodeSelector: {} + +# Tolerations +tolerations: [] + +# Affinity rules +affinity: {} + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + +# Container security context +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + +# Self-monitoring metrics +selfMonitor: + enabled: true + +# Kubernetes API access +kubeconfig: + enabled: false + +# RBAC configuration +rbac: + create: true + useClusterRole: true + +# Service account +serviceAccount: + create: true + name: "" + annotations: {} diff --git a/infrastructure/monitoring/k8s-infra/node-exporter-values.yaml b/infrastructure/monitoring/k8s-infra/node-exporter-values.yaml new file mode 100644 index 00000000..3f0fd960 --- /dev/null +++ b/infrastructure/monitoring/k8s-infra/node-exporter-values.yaml @@ -0,0 +1,97 @@ +# Prometheus Node Exporter Helm Values for Bakery IA +# Chart: prometheus-community/prometheus-node-exporter +# Documentation: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter +# +# Install Command: +# helm install node-exporter prometheus-community/prometheus-node-exporter \ +# -n bakery-ia -f node-exporter-values.yaml + +# Image configuration +image: + registry: quay.io + repository: prometheus/node-exporter + tag: "" # Uses chart default (latest stable) + pullPolicy: IfNotPresent + +# Resource limits optimized for MicroK8s VPS +resources: + requests: + cpu: 10m + memory: 32Mi + limits: + cpu: 200m + memory: 64Mi + +# Service configuration +service: + type: ClusterIP + port: 9100 + targetPort: 9100 + annotations: + prometheus.io/scrape: "true" + +# DaemonSet update strategy +updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + +# Host network - required for accurate network metrics +hostNetwork: true +hostPID: true +hostRootFsMount: + enabled: true + mountPropagation: HostToContainer + +# Node selector +nodeSelector: {} + +# Tolerations - allow scheduling on all nodes including control plane +tolerations: + - effect: NoSchedule + operator: Exists + +# Affinity rules +affinity: {} + +# Pod security context +podSecurityContext: + fsGroup: 65534 + runAsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + +# Container security context +securityContext: + readOnlyRootFilesystem: true + +# RBAC configuration +rbac: + create: true + pspEnabled: false + +# Service account +serviceAccount: + create: true + name: "" + annotations: {} + +# Prometheus scrape annotations +prometheus: + monitor: + enabled: false # We use SigNoz OTel collector scraping instead + +# Extra arguments for node-exporter +extraArgs: + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ + +# Collectors to enable (default set) +# Disable collectors that are not useful or cause issues +extraHostVolumeMounts: [] + +# Sidecar containers +sidecars: [] + +# Init containers +initContainers: [] diff --git a/infrastructure/monitoring/signoz/signoz-values-prod.yaml b/infrastructure/monitoring/signoz/signoz-values-prod.yaml index a1bb2bf0..1f4df682 100644 --- a/infrastructure/monitoring/signoz/signoz-values-prod.yaml +++ b/infrastructure/monitoring/signoz/signoz-values-prod.yaml @@ 
-60,6 +60,34 @@ otelCollector: limits: memory: "2Gi" cpu: "1000m" + # Additional config for Kubernetes infrastructure metrics scraping + config: + receivers: + prometheus: + config: + scrape_configs: + # Kube-state-metrics - Kubernetes object metrics + - job_name: 'kube-state-metrics' + static_configs: + - targets: ['kube-state-metrics.bakery-ia.svc.cluster.local:8080'] + scrape_interval: 30s + metric_relabel_configs: + - source_labels: [__name__] + regex: 'kube_(daemonset|deployment|pod|namespace|node|statefulset|replicaset|job|cronjob|persistentvolume|persistentvolumeclaim|resourcequota|service|configmap|secret).*' + action: keep + # Node-exporter - Host-level metrics + - job_name: 'node-exporter' + static_configs: + - targets: ['node-exporter-prometheus-node-exporter.bakery-ia.svc.cluster.local:9100'] + scrape_interval: 30s + metric_relabel_configs: + - source_labels: [__name__] + regex: 'node_(cpu|memory|disk|filesystem|network|load).*' + action: keep + service: + pipelines: + metrics: + receivers: [otlp, prometheus] queryService: resources: diff --git a/shared/config/base.py b/shared/config/base.py index bd697bab..fbf66cfe 100755 --- a/shared/config/base.py +++ b/shared/config/base.py @@ -165,16 +165,9 @@ class BaseServiceSettings(BaseSettings): if password: url = f"{protocol}://:{password}@{host}:{port}" - if use_tls: - # Use ssl_cert_reqs=none for self-signed certs in internal cluster - # Still encrypted, just skips cert validation - url += "?ssl_cert_reqs=none" print(f"[DEBUG REDIS_URL] Returning URL with auth and TLS: {url}", file=sys.stderr) return url url = f"{protocol}://{host}:{port}" - if use_tls: - # Use ssl_cert_reqs=none for self-signed certs in internal cluster - url += "?ssl_cert_reqs=none" print(f"[DEBUG REDIS_URL] Returning URL without auth: {url}", file=sys.stderr) return url diff --git a/shared/redis_utils/client.py b/shared/redis_utils/client.py index 1ec744c8..3b5a69a4 100755 --- a/shared/redis_utils/client.py +++ b/shared/redis_utils/client.py @@ -134,22 +134,70 @@ class RedisConnectionManager: self._redis_url = redis_url # Create connection pool with SSL handling for self-signed certificates - connection_kwargs = { - 'db': db, - 'max_connections': max_connections, - 'decode_responses': decode_responses, - 'retry_on_timeout': retry_on_timeout, - 'socket_keepalive': socket_keepalive, - 'health_check_interval': health_check_interval - } - - # Add SSL kwargs for self-signed certificates (using shared helper) - connection_kwargs.update(get_ssl_kwargs_for_url(redis_url)) - - self._pool = redis.ConnectionPool.from_url( - redis_url, - **connection_kwargs - ) + # For Redis 6.4.0+, we need to handle SSL parameters correctly + if redis_url.startswith("rediss://"): + # Extract connection parameters from URL + from urllib.parse import urlparse + + parsed_url = urlparse(redis_url) + + # Build connection parameters for ConnectionPool + connection_params = { + 'db': db, + 'max_connections': max_connections, + 'retry_on_timeout': retry_on_timeout, + 'socket_keepalive': socket_keepalive, + 'health_check_interval': health_check_interval + } + + # Add password if present + if parsed_url.password: + connection_params['password'] = parsed_url.password + + # Create connection pool (without SSL parameters - they go to the client) + self._pool = redis.ConnectionPool( + host=parsed_url.hostname, + port=parsed_url.port or 6379, + **connection_params + ) + + # Get SSL configuration for self-signed certificates + ssl_kwargs = get_ssl_kwargs_for_url(redis_url) + + # Create Redis client with 
SSL parameters
+            client_params = {
+                'connection_pool': self._pool,
+                'decode_responses': decode_responses
+            }
+
+            if ssl_kwargs:
+                client_params['ssl'] = True
+                client_params['ssl_cert_reqs'] = ssl_kwargs.get('ssl_cert_reqs', ssl.CERT_NONE)
+                client_params['ssl_ca_certs'] = ssl_kwargs.get('ssl_ca_certs')
+                client_params['ssl_certfile'] = ssl_kwargs.get('ssl_certfile')
+                client_params['ssl_keyfile'] = ssl_kwargs.get('ssl_keyfile')
+
+            self._client = redis.Redis(**client_params)
+        else:
+            # For non-TLS connections, use the original approach
+            connection_kwargs = {
+                'db': db,
+                'max_connections': max_connections,
+                'decode_responses': decode_responses,
+                'retry_on_timeout': retry_on_timeout,
+                'socket_keepalive': socket_keepalive,
+                'health_check_interval': health_check_interval
+            }
+
+            # Add SSL kwargs for self-signed certificates (using shared helper)
+            connection_kwargs.update(get_ssl_kwargs_for_url(redis_url))
+
+            self._pool = redis.ConnectionPool.from_url(
+                redis_url,
+                **connection_kwargs
+            )
+
+            self._client = redis.Redis(connection_pool=self._pool)
 
-        # Create Redis client with pool
-        self._client = redis.Redis(connection_pool=self._pool)
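
For reference, here is a minimal, hypothetical sketch (not part of the patch) of the pattern the `rediss://` branch is aiming at: keep `ssl_cert_reqs` out of the URL and apply the TLS options at the connection level instead. It assumes redis-py 4.x+; the URL, host and password are placeholders, and the patch's `get_ssl_kwargs_for_url` helper is not used here.

```python
# Hypothetical sketch -- not part of this patch. Assumes redis-py 4.x+ and a
# Redis endpoint with a self-signed certificate; URL/password are placeholders.
import ssl
from urllib.parse import urlparse

import redis

REDIS_URL = "rediss://:changeme@redis.bakery-ia.svc.cluster.local:6379/0"  # placeholder


def make_tls_pool(url: str) -> redis.ConnectionPool:
    """Build a pool whose connections use TLS but skip certificate validation,
    mirroring the patch's intent of not encoding ssl_cert_reqs in the URL."""
    parsed = urlparse(url)
    return redis.ConnectionPool(
        connection_class=redis.SSLConnection,      # TLS is handled per connection
        host=parsed.hostname,
        port=parsed.port or 6379,
        password=parsed.password,
        db=int((parsed.path or "/0").lstrip("/") or 0),
        ssl_cert_reqs=ssl.CERT_NONE,               # self-signed certs inside the cluster
        ssl_check_hostname=False,                  # required when cert validation is disabled
        decode_responses=True,                     # connection-level option, so it lives on the pool
    )


if __name__ == "__main__":
    client = redis.Redis(connection_pool=make_tls_pool(REDIS_URL))
    print(client.ping())  # True if the placeholder endpoint is reachable over TLS
```

Putting `decode_responses` and the TLS options on the pool (rather than on `redis.Redis`) is what makes them take effect, since `redis.Redis()` ignores connection-level keyword arguments once an explicit `connection_pool` is supplied.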