From 4af860c010ebb37552cb324cc969515f22462e49 Mon Sep 17 00:00:00 2001 From: Urtzi Alfaro Date: Fri, 9 Jan 2026 06:57:18 +0100 Subject: [PATCH] Improve monitoring --- Tiltfile | 44 ++++- infrastructure/helm/deploy-signoz.sh | 91 +++++++++- infrastructure/helm/signoz-values-dev.yaml | 97 ++++------- infrastructure/helm/signoz-values-prod.yaml | 23 ++- .../kubernetes/add-image-pull-secrets.sh | 125 -------------- .../kubernetes/add-monitoring-config.sh | 94 ---------- .../kubernetes/apply-monitoring-to-all.py | 162 ------------------ .../infrastructure/gateway-service.yaml | 2 +- infrastructure/kubernetes/base/configmap.yaml | 8 +- .../kubernetes/base/ingress-https.yaml | 23 +-- .../kubernetes/create-dockerhub-secret.sh | 126 ++++++++++++++ .../kubernetes/overlays/dev/dev-ingress.yaml | 15 +- .../overlays/prod/kustomization.yaml | 38 +--- .../overlays/prod/prod-configmap.yaml | 8 +- .../app/services/cleanup_service.py | 12 -- .../app/services/clone_orchestrator.py | 100 +---------- 16 files changed, 333 insertions(+), 635 deletions(-) delete mode 100755 infrastructure/kubernetes/add-image-pull-secrets.sh delete mode 100755 infrastructure/kubernetes/add-monitoring-config.sh delete mode 100755 infrastructure/kubernetes/apply-monitoring-to-all.py create mode 100755 infrastructure/kubernetes/create-dockerhub-secret.sh diff --git a/Tiltfile b/Tiltfile index 5ebab47a..603dc0b7 100644 --- a/Tiltfile +++ b/Tiltfile @@ -63,6 +63,35 @@ Monitoring: Applying security configurations... """) +# Create Docker Hub secret for image pulls (if credentials are available) +local_resource( + 'dockerhub-secret', + cmd=''' + echo "đŸŗ Setting up Docker Hub image pull secret..." 
+ + # Check if Docker Hub credentials are available + if [ -n "$DOCKERHUB_USERNAME" ] && [ -n "$DOCKERHUB_PASSWORD" ]; then + echo " Found DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables" + ./infrastructure/kubernetes/create-dockerhub-secret.sh + elif [ -f "$HOME/.docker/config.json" ]; then + echo " Attempting to use Docker CLI credentials..." + ./infrastructure/kubernetes/create-dockerhub-secret.sh + else + echo " âš ī¸ Docker Hub credentials not found" + echo " To enable automatic Docker Hub authentication:" + echo " 1. Run 'docker login', OR" + echo " 2. Set environment variables:" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "" + echo " Continuing without Docker Hub authentication..." + echo " (This is OK for local development using local registry)" + fi + ''', + labels=['00-security'], + auto_init=True +) + # Apply security configurations before loading main manifests local_resource( 'security-setup', @@ -75,6 +104,7 @@ local_resource( kubectl apply -f infrastructure/kubernetes/base/configmaps/postgres-logging-config.yaml echo "✅ Security configurations applied" ''', + resource_deps=['dockerhub-secret'], labels=['00-security'], auto_init=True ) @@ -338,10 +368,20 @@ local_resource( echo "📊 Deploying SigNoz Monitoring Stack..." echo "" + # Ensure Docker Hub secret exists in bakery-ia namespace + echo "🔐 Ensuring Docker Hub secret exists in bakery-ia namespace..." + if ! kubectl get secret dockerhub-creds -n bakery-ia &>/dev/null; then + echo " âš ī¸ Docker Hub secret not found, attempting to create..." + ./infrastructure/kubernetes/create-dockerhub-secret.sh || echo " Continuing without Docker Hub authentication..." + else + echo " ✅ Docker Hub secret exists" + fi + echo "" + # Check if SigNoz is already deployed - if helm list -n signoz | grep -q signoz; then + if helm list -n bakery-ia | grep -q signoz; then echo "✅ SigNoz already deployed, checking status..." 
- helm status signoz -n signoz + helm status signoz -n bakery-ia else echo "🚀 Installing SigNoz..." diff --git a/infrastructure/helm/deploy-signoz.sh b/infrastructure/helm/deploy-signoz.sh index e3277748..5e4c370d 100755 --- a/infrastructure/helm/deploy-signoz.sh +++ b/infrastructure/helm/deploy-signoz.sh @@ -37,6 +37,14 @@ show_help() { $0 prod # Deploy to production $0 --upgrade prod # Upgrade production deployment $0 --remove dev # Remove development deployment" + echo "" + echo "Docker Hub Authentication:" + echo " This script automatically creates a Docker Hub secret for image pulls." + echo " Provide credentials via environment variables (recommended):" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-personal-access-token'" + echo " Or ensure you're logged in with Docker CLI:" + echo " docker login" } # Parse command line arguments @@ -124,6 +132,82 @@ ensure_namespace() { fi } +# Function to create Docker Hub secret for image pulls +create_dockerhub_secret() { + echo "${BLUE}Setting up Docker Hub image pull secret...${NC}" + + if [[ "$DRY_RUN" == true ]]; then + echo " (dry-run) Would create Docker Hub secret in namespace $NAMESPACE" + return + fi + + # Check if secret already exists + if kubectl get secret dockerhub-creds -n "$NAMESPACE" &> /dev/null; then + echo "${GREEN}Docker Hub secret already exists in namespace $NAMESPACE.${NC}" + return + fi + + # Check if Docker Hub credentials are available + if [[ -n "$DOCKERHUB_USERNAME" ]] && [[ -n "$DOCKERHUB_PASSWORD" ]]; then + echo "${BLUE}Found DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables${NC}" + + kubectl create secret docker-registry dockerhub-creds \ + --docker-server=https://index.docker.io/v1/ \ + --docker-username="$DOCKERHUB_USERNAME" \ + --docker-password="$DOCKERHUB_PASSWORD" \ + --docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \ + -n "$NAMESPACE" + + echo "${GREEN}Docker Hub secret created successfully.${NC}" + + elif [[ 
-f "$HOME/.docker/config.json" ]]; then + echo "${BLUE}Attempting to use Docker CLI credentials...${NC}" + + # Try to extract credentials from Docker config + if grep -q "credsStore" "$HOME/.docker/config.json"; then + echo "${YELLOW}Docker is using a credential store. Please set environment variables:${NC}" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "${YELLOW}Continuing without Docker Hub authentication...${NC}" + return + fi + + # Try to extract from base64 encoded auth + AUTH=$(cat "$HOME/.docker/config.json" | jq -r '.auths["https://index.docker.io/v1/"].auth // empty' 2>/dev/null) + if [[ -n "$AUTH" ]]; then + echo "${GREEN}Found Docker Hub credentials in Docker config${NC}" + local DOCKER_USERNAME=$(echo "$AUTH" | base64 -d | cut -d: -f1) + local DOCKER_PASSWORD=$(echo "$AUTH" | base64 -d | cut -d: -f2-) + + kubectl create secret docker-registry dockerhub-creds \ + --docker-server=https://index.docker.io/v1/ \ + --docker-username="$DOCKER_USERNAME" \ + --docker-password="$DOCKER_PASSWORD" \ + --docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \ + -n "$NAMESPACE" + + echo "${GREEN}Docker Hub secret created successfully.${NC}" + else + echo "${YELLOW}Could not find Docker Hub credentials${NC}" + echo "${YELLOW}To enable automatic Docker Hub authentication:${NC}" + echo " 1. Run 'docker login', OR" + echo " 2. Set environment variables:" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "${YELLOW}Continuing without Docker Hub authentication...${NC}" + fi + else + echo "${YELLOW}Docker Hub credentials not found${NC}" + echo "${YELLOW}To enable automatic Docker Hub authentication:${NC}" + echo " 1. Run 'docker login', OR" + echo " 2. 
Set environment variables:" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "${YELLOW}Continuing without Docker Hub authentication...${NC}" + fi + echo "" +} + # Function to deploy SigNoz deploy_signoz() { local values_file="infrastructure/helm/signoz-values-$ENVIRONMENT.yaml" @@ -278,12 +362,15 @@ main() { # Ensure namespace ensure_namespace - + if [[ "$REMOVE" == true ]]; then remove_signoz exit 0 fi - + + # Create Docker Hub secret for image pulls + create_dockerhub_secret + # Deploy SigNoz deploy_signoz diff --git a/infrastructure/helm/signoz-values-dev.yaml b/infrastructure/helm/signoz-values-dev.yaml index ae88d580..481c3ad5 100644 --- a/infrastructure/helm/signoz-values-dev.yaml +++ b/infrastructure/helm/signoz-values-dev.yaml @@ -7,74 +7,41 @@ global: storageClass: "standard" domain: "monitoring.bakery-ia.local" - # Docker Hub credentials for pulling images + # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc) imagePullSecrets: - - name: dockerhub-creds + - dockerhub-creds -# Frontend Configuration -frontend: +# Docker Hub credentials for pulling images (root level for SigNoz components) +imagePullSecrets: + - dockerhub-creds + +# SignOz Main Component (includes frontend and query service) +signoz: replicaCount: 1 - image: - repository: signoz/frontend - tag: 0.52.3 - pullPolicy: IfNotPresent - - service: - type: ClusterIP - port: 3301 - - ingress: - enabled: true - className: nginx - annotations: - nginx.ingress.kubernetes.io/rewrite-target: /$2 - nginx.ingress.kubernetes.io/use-regex: "true" - hosts: - - host: monitoring.bakery-ia.local - paths: - - path: /signoz(/|$)(.*) - pathType: ImplementationSpecific - tls: [] - - resources: - requests: - cpu: 25m # Reduced for local dev - memory: 64Mi # Reduced for local dev - limits: - cpu: 200m - memory: 256Mi - - env: - - name: FRONTEND_REFRESH_INTERVAL - value: "30000" - - name: BASE_URL - value: 
"https://monitoring.bakery-ia.local/signoz" - -# Query Service Configuration -queryService: - replicaCount: 1 - image: - repository: signoz/query-service - tag: 0.52.3 - pullPolicy: IfNotPresent service: type: ClusterIP port: 8080 + ingress: + enabled: true + className: nginx + annotations: {} + hosts: + - host: monitoring.bakery-ia.local + paths: + - path: / + pathType: Prefix + port: 8080 + tls: [] + resources: requests: - cpu: 50m # Reduced for local dev - memory: 128Mi # Reduced for local dev + cpu: 100m # Combined frontend + query service + memory: 256Mi limits: - cpu: 500m - memory: 512Mi - - env: - - name: DEPLOYMENT_TYPE - value: "kubernetes-helm" - - name: SIGNOZ_LOCAL_DB_PATH - value: "/var/lib/signoz" + cpu: 1000m + memory: 1Gi persistence: enabled: true @@ -135,6 +102,10 @@ clickhouse: cpu: 1000m memory: 1Gi +# Zookeeper Configuration (required by ClickHouse) +zookeeper: + enabled: true + # OpenTelemetry Collector - Data ingestion endpoint for all telemetry otelCollector: enabled: true @@ -262,8 +233,8 @@ otelCollector: timeout: 10s # ClickHouse exporter for metrics - clickhousemetricswrite: - endpoint: tcp://signoz-clickhouse:9000/?database=signoz_metrics + signozclickhousemetrics: + dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics" timeout: 10s # ClickHouse exporter for logs @@ -271,9 +242,9 @@ otelCollector: dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs timeout: 10s - # Logging exporter for debugging (optional) - logging: - loglevel: info + # Debug exporter for debugging (optional) + debug: + verbosity: detailed service: pipelines: @@ -287,7 +258,7 @@ otelCollector: metrics: receivers: [otlp, postgresql/auth, postgresql/inventory, postgresql/orders, redis, rabbitmq] processors: [memory_limiter, batch, resourcedetection] - exporters: [clickhousemetricswrite] + exporters: [signozclickhousemetrics] # Logs pipeline logs: diff --git a/infrastructure/helm/signoz-values-prod.yaml 
b/infrastructure/helm/signoz-values-prod.yaml index d7c10bd1..9a932067 100644 --- a/infrastructure/helm/signoz-values-prod.yaml +++ b/infrastructure/helm/signoz-values-prod.yaml @@ -7,6 +7,13 @@ global: storageClass: "standard" domain: "monitoring.bakewise.ai" + # Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc) + imagePullSecrets: + - dockerhub-creds + +# Docker Hub credentials for pulling images (root level for SigNoz components) +imagePullSecrets: + - dockerhub-creds # Frontend Configuration frontend: @@ -351,8 +358,8 @@ otelCollector: max_interval: 30s max_elapsed_time: 300s - clickhousemetricswrite: - endpoint: tcp://clickhouse:9000/?database=signoz_metrics + signozclickhousemetrics: + endpoint: "tcp://clickhouse:9000/?database=signoz_metrics" timeout: 10s retry_on_failure: enabled: true @@ -369,9 +376,9 @@ otelCollector: max_interval: 30s max_elapsed_time: 300s - # Minimal logging for prod - logging: - loglevel: warn + # Debug exporter for debugging (replaces deprecated logging exporter) + debug: + verbosity: detailed sampling_initial: 2 sampling_thereafter: 500 @@ -381,17 +388,17 @@ otelCollector: traces: receivers: [otlp] processors: [memory_limiter, batch, resourcedetection, resource] - exporters: [clickhousetraces, logging] + exporters: [clickhousetraces, debug] metrics: receivers: [otlp, prometheus] processors: [memory_limiter, batch, resourcedetection, resource] - exporters: [clickhousemetricswrite] + exporters: [signozclickhousemetrics] logs: receivers: [otlp] processors: [memory_limiter, batch, resourcedetection, resource] - exporters: [clickhouselogsexporter, logging] + exporters: [clickhouselogsexporter, debug] # OpenTelemetry Collector Deployment Mode otelCollectorDeployment: diff --git a/infrastructure/kubernetes/add-image-pull-secrets.sh b/infrastructure/kubernetes/add-image-pull-secrets.sh deleted file mode 100755 index c327ed85..00000000 --- a/infrastructure/kubernetes/add-image-pull-secrets.sh +++ 
/dev/null @@ -1,125 +0,0 @@ -#!/bin/bash - -# Script to add imagePullSecrets to all Kubernetes deployments, jobs, and cronjobs -# This ensures all pods can pull images from Docker Hub using the dockerhub-creds secret - -SECRET_NAME="dockerhub-creds" -BASE_DIR="/Users/urtzialfaro/Documents/bakery-ia/infrastructure/kubernetes" - -# ANSI color codes -GREEN='\033[0;32m' -BLUE='\033[0;34m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -echo -e "${BLUE}Adding imagePullSecrets to all Kubernetes resources...${NC}" -echo "======================================================" -echo "" - -# Counter for files processed -count=0 - -# Function to add imagePullSecrets to a file -add_image_pull_secrets() { - local file="$1" - - # Check if file already has imagePullSecrets - if grep -q "imagePullSecrets:" "$file"; then - echo -e "${YELLOW} ⊘ Skipping (already has imagePullSecrets): $(basename $file)${NC}" - return - fi - - # Temporary file for processing - temp_file=$(mktemp) - - # Process the file using awk to add imagePullSecrets after "spec:" in template or job spec - awk ' - /^ spec:$/ && !done { - print $0 - print " imagePullSecrets:" - print " - name: dockerhub-creds" - done = 1 - next - } - { print } - ' "$file" > "$temp_file" - - # Check if changes were made - if ! 
cmp -s "$file" "$temp_file"; then - mv "$temp_file" "$file" - echo -e "${GREEN} ✓ Updated: $(basename $file)${NC}" - ((count++)) - else - rm "$temp_file" - echo -e "${YELLOW} ⊘ No changes needed: $(basename $file)${NC}" - fi -} - -# Process all service deployments -echo -e "${BLUE}Processing service deployments...${NC}" -find $BASE_DIR/base/components -name "*-service.yaml" | while read file; do - if [ -f "$file" ]; then - add_image_pull_secrets "$file" - fi -done -echo "" - -# Process all database deployments -echo -e "${BLUE}Processing database deployments...${NC}" -for file in $BASE_DIR/base/components/databases/*.yaml; do - if [ -f "$file" ]; then - add_image_pull_secrets "$file" - fi -done -echo "" - -# Process all migration jobs -echo -e "${BLUE}Processing migration jobs...${NC}" -for file in $BASE_DIR/base/migrations/*.yaml; do - if [ -f "$file" ]; then - add_image_pull_secrets "$file" - fi -done -echo "" - -# Process all cronjobs -echo -e "${BLUE}Processing cronjobs...${NC}" -for file in $BASE_DIR/base/cronjobs/*.yaml; do - if [ -f "$file" ]; then - add_image_pull_secrets "$file" - fi -done -echo "" - -# Process standalone jobs -echo -e "${BLUE}Processing standalone jobs...${NC}" -for file in $BASE_DIR/base/jobs/*.yaml; do - if [ -f "$file" ]; then - add_image_pull_secrets "$file" - fi -done -echo "" - -# Process deployments directory -echo -e "${BLUE}Processing deployments...${NC}" -for file in $BASE_DIR/base/deployments/*.yaml; do - if [ -f "$file" ]; then - add_image_pull_secrets "$file" - fi -done -echo "" - -# Process nominatim service -if [ -f "$BASE_DIR/base/components/infrastructure/nominatim.yaml" ]; then - echo -e "${BLUE}Processing nominatim service...${NC}" - add_image_pull_secrets "$BASE_DIR/base/components/infrastructure/nominatim.yaml" - echo "" -fi - -echo "======================================================" -echo -e "${GREEN}Completed! Updated $count file(s)${NC}" -echo "" -echo "Next steps:" -echo "1. 
Review the changes: git diff" -echo "2. Apply to cluster: kubectl apply -k infrastructure/kubernetes/overlays/dev" -echo "3. Verify pods are running: kubectl get pods -n bakery-ia" diff --git a/infrastructure/kubernetes/add-monitoring-config.sh b/infrastructure/kubernetes/add-monitoring-config.sh deleted file mode 100755 index 0d26e163..00000000 --- a/infrastructure/kubernetes/add-monitoring-config.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash -# Script to add OpenTelemetry monitoring configuration to all service deployments -# This adds the necessary environment variables for SigNoz integration -# Note: No Prometheus annotations needed - all metrics go via OTLP push - -set -e - -SERVICES=( - "ai-insights" - "distribution" - "external" - "forecasting" - "inventory" - "notification" - "orchestrator" - "orders" - "pos" - "procurement" - "production" - "recipes" - "sales" - "suppliers" - "tenant" - "training" - "frontend" -) - -echo "Adding OpenTelemetry configuration to all services..." -echo "" - -for service in "${SERVICES[@]}"; do - SERVICE_FILE="infrastructure/kubernetes/base/components/${service}/${service}-service.yaml" - - if [ ! -f "$SERVICE_FILE" ]; then - echo "âš ī¸ Skipping $service (file not found: $SERVICE_FILE)" - continue - fi - - echo "📝 Processing $service-service..." 
- - # Check if already has OTEL env vars - if grep -q "OTEL_COLLECTOR_ENDPOINT" "$SERVICE_FILE"; then - echo " ✓ Already has OpenTelemetry configuration" - else - echo " + Adding OpenTelemetry environment variables" - # Create a YAML patch - cat > "/tmp/${service}-otel-patch.yaml" << 'EOF' - env: - # OpenTelemetry Configuration - - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_SERVICE_NAME - value: "SERVICE_NAME_PLACEHOLDER" - - name: ENABLE_TRACING - value: "true" - # Logging Configuration - - name: OTEL_LOGS_EXPORTER - value: "otlp" - - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED - value: "true" - # Metrics Configuration (all via OTLP, no Prometheus) - - name: ENABLE_OTEL_METRICS - value: "true" - - name: ENABLE_SYSTEM_METRICS - value: "true" -EOF - # Replace placeholder with actual service name - sed -i.bak "s/SERVICE_NAME_PLACEHOLDER/${service}-service/g" "/tmp/${service}-otel-patch.yaml" - - echo " âš ī¸ Manual step required: Add env vars from /tmp/${service}-otel-patch.yaml" - echo " Insert after 'ports:' section and before 'envFrom:' in $SERVICE_FILE" - fi - - echo " ✅ $service-service processed" - echo "" -done - -echo "" -echo "✅ Monitoring configuration prepared for all services!" -echo "" -echo "Next steps:" -echo "1. Review the changes and manually add env vars from /tmp/*-otel-patch.yaml files" -echo "2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml" -echo "3. Restart services: kubectl rollout restart deployment -n bakery-ia" -echo "4. 
Check SigNoz UI at https://monitoring.bakery-ia.local for incoming data" -echo "" -echo "What metrics you'll see:" -echo " - HTTP requests (method, endpoint, status code, duration)" -echo " - System metrics (CPU, memory usage per process)" -echo " - System-wide metrics (total CPU, memory, disk I/O, network I/O)" -echo " - Custom business metrics (registrations, orders, etc.)" -echo " - All pushed via OpenTelemetry OTLP (no Prometheus scraping)" diff --git a/infrastructure/kubernetes/apply-monitoring-to-all.py b/infrastructure/kubernetes/apply-monitoring-to-all.py deleted file mode 100755 index eaab2b47..00000000 --- a/infrastructure/kubernetes/apply-monitoring-to-all.py +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to automatically add OpenTelemetry monitoring configuration to all service deployments. -This adds environment variables for metrics, logs, and traces export to SigNoz. -""" - -import os -import re -import sys -from pathlib import Path - -# Services to configure -SERVICES = [ - "ai-insights", - "distribution", - "external", - "forecasting", - "inventory", - "notification", - "orchestrator", - "orders", - "pos", - "procurement", - "production", - "recipes", - "sales", - "suppliers", - "tenant", - "training", -] - -OTEL_ENV_VARS_TEMPLATE = """ env: - # OpenTelemetry Configuration - - name: OTEL_COLLECTOR_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318" - - name: OTEL_SERVICE_NAME - value: "{service_name}" - - name: ENABLE_TRACING - value: "true" - # Logging Configuration - - name: OTEL_LOGS_EXPORTER - value: "otlp" - - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED - value: "true" - # Metrics Configuration (all via OTLP, no Prometheus) - - name: ENABLE_OTEL_METRICS - value: "true" - - name: ENABLE_SYSTEM_METRICS - value: "true" -""" - - -def has_otel_config(content: str) -> bool: - """Check 
if file already has OTEL configuration""" - return "OTEL_COLLECTOR_ENDPOINT" in content - - -def add_otel_config(content: str, service_name: str) -> str: - """Add OTEL configuration to service deployment""" - - # Prepare the env vars with the service name - env_vars = OTEL_ENV_VARS_TEMPLATE.format(service_name=f"{service_name}-service") - - # Find the container section and add env vars before envFrom - # Pattern: find " containers:" then first " envFrom:" after it - pattern = r'( containers:\n - name: [^\n]+\n image: [^\n]+\n(?: ports:\n(?: - [^\n]+\n)+)?)( envFrom:)' - - replacement = r'\1' + env_vars + r'\2' - - # Try to replace - new_content = re.sub(pattern, replacement, content, count=1) - - if new_content == content: - print(f" âš ī¸ Warning: Could not find insertion point automatically") - return content - - return new_content - - -def process_service(service_name: str, base_path: Path) -> bool: - """Process a single service deployment file""" - - service_file = base_path / "components" / service_name / f"{service_name}-service.yaml" - - if not service_file.exists(): - print(f" âš ī¸ File not found: {service_file}") - return False - - # Read file - with open(service_file, 'r') as f: - content = f.read() - - # Check if already configured - if has_otel_config(content): - print(f" ✓ Already configured") - return True - - # Add configuration - new_content = add_otel_config(content, service_name) - - if new_content == content: - return False - - # Write back - with open(service_file, 'w') as f: - f.write(new_content) - - print(f" ✅ Updated successfully") - return True - - -def main(): - """Main function""" - - # Find base path - script_dir = Path(__file__).parent - base_path = script_dir / "base" - - if not base_path.exists(): - print(f"❌ Error: Base path not found: {base_path}") - sys.exit(1) - - print("=" * 60) - print("Adding OpenTelemetry Monitoring Configuration") - print("=" * 60) - print() - - success_count = 0 - skip_count = 0 - fail_count = 0 - - for 
service in SERVICES: - print(f"📝 Processing {service}-service...") - - result = process_service(service, base_path) - - if result: - if has_otel_config(open(base_path / "components" / service / f"{service}-service.yaml").read()): - success_count += 1 - else: - fail_count += 1 - - print() - - print("=" * 60) - print(f"✅ Successfully configured: {success_count}") - if fail_count > 0: - print(f"âš ī¸ Failed to configure: {fail_count}") - print("=" * 60) - print() - - print("Next steps:") - print("1. Review the changes: git diff infrastructure/kubernetes/base/components/") - print("2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml") - print("3. Apply changes: kubectl apply -k infrastructure/kubernetes/overlays/dev/") - print("4. Verify: kubectl logs -n bakery-ia deployment/ | grep -i 'otel\\|metrics'") - - -if __name__ == "__main__": - main() diff --git a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml index acabca44..a02b3d5f 100644 --- a/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml +++ b/infrastructure/kubernetes/base/components/infrastructure/gateway-service.yaml @@ -52,7 +52,7 @@ spec: name: whatsapp-secrets env: - name: OTEL_EXPORTER_OTLP_ENDPOINT - value: "http://otel-collector.monitoring.svc.cluster.local:4317" + value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317" resources: requests: memory: "256Mi" diff --git a/infrastructure/kubernetes/base/configmap.yaml b/infrastructure/kubernetes/base/configmap.yaml index 63d2a516..c973200b 100644 --- a/infrastructure/kubernetes/base/configmap.yaml +++ b/infrastructure/kubernetes/base/configmap.yaml @@ -291,7 +291,7 @@ data: HEALTH_CHECK_INTERVAL: "30" # Monitoring Configuration - SigNoz - SIGNOZ_ROOT_URL: "http://localhost/signoz" + SIGNOZ_ROOT_URL: "https://monitoring.bakery-ia.local" # 
================================================================ # DATA COLLECTION SETTINGS @@ -390,9 +390,9 @@ data: OTEL_SERVICE_NAME: "bakery-ia" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development" - # SigNoz Endpoints - SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080" - SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301" + # SigNoz Endpoints (v0.106.0+ unified service) + SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080" + SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local" # ================================================================ # REPLENISHMENT PLANNING SETTINGS diff --git a/infrastructure/kubernetes/base/ingress-https.yaml b/infrastructure/kubernetes/base/ingress-https.yaml index 3b5a96fb..679b4597 100644 --- a/infrastructure/kubernetes/base/ingress-https.yaml +++ b/infrastructure/kubernetes/base/ingress-https.yaml @@ -63,22 +63,7 @@ spec: name: gateway-service port: number: 8000 - - host: monitoring.your-domain.com # To be overridden in overlays - http: - paths: - # SigNoz Frontend UI and API (consolidated in newer versions) - - path: /signoz(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: signoz - port: - number: 8080 - # SigNoz API endpoints - - path: /signoz-api(/|$)(.*) - pathType: ImplementationSpecific - backend: - service: - name: signoz - port: - number: 8080 \ No newline at end of file + # Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace + # SigNoz creates its own Ingress via Helm chart configuration + # Access at: https://monitoring.your-domain.com/ (configured in signoz-values.yaml) + # SignOz ingress is managed separately - no need to configure here \ No newline at end of file diff --git a/infrastructure/kubernetes/create-dockerhub-secret.sh b/infrastructure/kubernetes/create-dockerhub-secret.sh new file mode 100755 index 00000000..76f5b7a2 --- /dev/null +++ 
b/infrastructure/kubernetes/create-dockerhub-secret.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +# ============================================================================= +# Create Docker Hub Image Pull Secret +# ============================================================================= +# This script creates a Kubernetes secret for pulling images from Docker Hub. +# The secret is used by both: +# 1. bakery-ia namespace deployments (Tilt + Kustomize) +# 2. Signoz Helm deployment +# +# Usage: +# ./create-dockerhub-secret.sh +# +# Prerequisites: +# - kubectl configured with access to the cluster +# - DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables set +# - OR Docker CLI logged in (docker login) +# ============================================================================= + +set -e + +echo "🔐 Creating Docker Hub Image Pull Secret" +echo "==========================================" +echo "" + +# Check for required environment variables +if [ -z "$DOCKERHUB_USERNAME" ] || [ -z "$DOCKERHUB_PASSWORD" ]; then + echo "âš ī¸ DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables not set" + echo "" + echo "Checking if Docker CLI is logged in..." + + # Try to extract credentials from Docker config + if [ -f "$HOME/.docker/config.json" ]; then + # Check if using credential store + if grep -q "credsStore" "$HOME/.docker/config.json"; then + echo "âš ī¸ Docker is using a credential store. 
Please set environment variables manually:" + echo "" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "" + exit 1 + fi + + # Try to extract from base64 encoded auth + AUTH=$(cat "$HOME/.docker/config.json" | jq -r '.auths["https://index.docker.io/v1/"].auth // empty' 2>/dev/null) + if [ -n "$AUTH" ]; then + echo "✅ Found Docker Hub credentials in Docker config" + DOCKERHUB_USERNAME=$(echo "$AUTH" | base64 -d | cut -d: -f1) + DOCKERHUB_PASSWORD=$(echo "$AUTH" | base64 -d | cut -d: -f2-) + else + echo "❌ Could not find Docker Hub credentials" + echo "" + echo "Please either:" + echo " 1. Run 'docker login' first, OR" + echo " 2. Set environment variables:" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "" + exit 1 + fi + else + echo "❌ Docker config not found and environment variables not set" + echo "" + echo "Please set environment variables:" + echo " export DOCKERHUB_USERNAME='your-username'" + echo " export DOCKERHUB_PASSWORD='your-password-or-token'" + echo "" + exit 1 + fi +fi + +echo "Using Docker Hub username: $DOCKERHUB_USERNAME" +echo "" + +# Function to create secret in a namespace +create_secret_in_namespace() { + local NAMESPACE=$1 + + echo "đŸ“Ļ Creating secret in namespace: $NAMESPACE" + + # Create namespace if it doesn't exist + if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then + echo " Creating namespace $NAMESPACE..." + kubectl create namespace "$NAMESPACE" + fi + + # Delete existing secret if it exists + if kubectl get secret dockerhub-creds -n "$NAMESPACE" &>/dev/null; then + echo " Deleting existing secret..." 
+ kubectl delete secret dockerhub-creds -n "$NAMESPACE" + fi + + # Create the secret + kubectl create secret docker-registry dockerhub-creds \ + --docker-server=https://index.docker.io/v1/ \ + --docker-username="$DOCKERHUB_USERNAME" \ + --docker-password="$DOCKERHUB_PASSWORD" \ + --docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \ + -n "$NAMESPACE" + + echo " ✅ Secret created successfully" + echo "" +} + +# Create secret in bakery-ia namespace (for Tilt deployments) +create_secret_in_namespace "bakery-ia" + +# Create secret in signoz namespace (for Signoz Helm deployment - if namespace exists) +if kubectl get namespace signoz &>/dev/null; then + create_secret_in_namespace "signoz" +else + echo "â„šī¸ Signoz namespace not found, skipping (will be created on Helm install)" + echo "" +fi + +echo "✅ Docker Hub secrets created successfully!" +echo "" +echo "The secret 'dockerhub-creds' is now available in:" +echo " - bakery-ia namespace (for Tilt/Kustomize deployments)" +if kubectl get namespace signoz &>/dev/null; then + echo " - signoz namespace (for Signoz Helm deployment)" +fi +echo "" +echo "All pods with imagePullSecrets: dockerhub-creds will now use these credentials" +echo "to pull images from Docker Hub." 
+echo "" diff --git a/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml b/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml index c1c2dbbf..ed9394f4 100644 --- a/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml +++ b/infrastructure/kubernetes/overlays/dev/dev-ingress.yaml @@ -73,14 +73,7 @@ spec: name: gateway-service port: number: 8000 - - host: monitoring.bakery-ia.local - http: - paths: - # SigNoz Frontend UI - - path: / - pathType: Prefix - backend: - service: - name: signoz - port: - number: 8080 \ No newline at end of file + # Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace + # SigNoz creates its own Ingress via Helm chart configuration (signoz-values-dev.yaml) + # Access at: https://monitoring.bakery-ia.local/ + # SignOz is served at the root of the monitoring subdomain \ No newline at end of file diff --git a/infrastructure/kubernetes/overlays/prod/kustomization.yaml b/infrastructure/kubernetes/overlays/prod/kustomization.yaml index 7ffca5c5..7e9a20e4 100644 --- a/infrastructure/kubernetes/overlays/prod/kustomization.yaml +++ b/infrastructure/kubernetes/overlays/prod/kustomization.yaml @@ -73,13 +73,13 @@ patches: value: "deployment.environment=production,cluster.name=bakery-ia-prod" - op: add path: /data/SIGNOZ_ENDPOINT - value: "http://signoz-query-service.signoz.svc.cluster.local:8080" + value: "http://signoz.signoz.svc.cluster.local:8080" - op: add path: /data/SIGNOZ_FRONTEND_URL - value: "https://monitoring.bakewise.ai/signoz" + value: "https://monitoring.bakewise.ai" - op: add path: /data/SIGNOZ_ROOT_URL - value: "https://monitoring.bakewise.ai/signoz" + value: "https://monitoring.bakewise.ai" - op: add path: /data/RATE_LIMIT_ENABLED value: "true" @@ -119,12 +119,12 @@ patches: limits: memory: "4Gi" cpu: "1000m" - # SigNoz Query Service production configuration + # SigNoz Main Service production configuration (v0.106.0+ unified service) - target: group: apps version: v1 - kind: Deployment - name: 
signoz-query-service + kind: StatefulSet + name: signoz namespace: signoz patch: |- - op: replace @@ -134,11 +134,11 @@ patches: path: /spec/template/spec/containers/0/resources value: requests: - memory: "1Gi" - cpu: "500m" - limits: memory: "2Gi" cpu: "1000m" + limits: + memory: "4Gi" + cpu: "2000m" # SigNoz AlertManager production configuration - target: group: apps @@ -159,26 +159,6 @@ patches: limits: memory: "1Gi" cpu: "500m" - # SigNoz Frontend production configuration - - target: - group: apps - version: v1 - kind: Deployment - name: signoz-frontend - namespace: signoz - patch: |- - - op: replace - path: /spec/replicas - value: 2 - - op: replace - path: /spec/template/spec/containers/0/resources - value: - requests: - memory: "512Mi" - cpu: "250m" - limits: - memory: "1Gi" - cpu: "500m" images: - name: bakery/auth-service diff --git a/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml b/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml index ddb40de6..0d70c1c0 100644 --- a/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml +++ b/infrastructure/kubernetes/overlays/prod/prod-configmap.yaml @@ -28,10 +28,10 @@ data: OTEL_SERVICE_NAME: "bakery-ia" OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod" - # SigNoz Endpoints - SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080" - SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz" - SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz" + # SigNoz Endpoints (v0.106.0+ unified service) + SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080" + SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai" + SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai" # Rate Limiting (stricter in production) RATE_LIMIT_ENABLED: "true" diff --git a/services/demo_session/app/services/cleanup_service.py b/services/demo_session/app/services/cleanup_service.py index 04ac43a7..48b7602b 100644 --- 
a/services/demo_session/app/services/cleanup_service.py +++ b/services/demo_session/app/services/cleanup_service.py @@ -360,18 +360,6 @@ class DemoCleanupService: logger.info("Demo session cleanup completed", stats=stats) - # Update Prometheus metrics - duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000) - demo_session_cleanup_duration_seconds.labels(tier="all").observe(duration_ms / 1000) - - # Update deleted sessions metrics by tier (we need to determine tiers from sessions) - for session in all_sessions_to_cleanup: - demo_sessions_deleted_total.labels( - tier=session.demo_account_type, - status="success" - ).inc() - demo_sessions_active.labels(tier=session.demo_account_type).dec() - return stats async def cleanup_old_destroyed_sessions(self, days: int = 7) -> int: diff --git a/services/demo_session/app/services/clone_orchestrator.py b/services/demo_session/app/services/clone_orchestrator.py index d72ccc2f..6e5ebcb9 100644 --- a/services/demo_session/app/services/clone_orchestrator.py +++ b/services/demo_session/app/services/clone_orchestrator.py @@ -284,9 +284,7 @@ class CloneOrchestrator: ) start_time = datetime.now(timezone.utc) - - # Update active sessions metric - demo_sessions_active.labels(tier=demo_account_type).inc() + # Filter services if specified services_to_clone = self.services @@ -383,29 +381,6 @@ class CloneOrchestrator: services_status=all_services, demo_account_type=demo_account_type ) - - # Update Prometheus metrics - demo_session_creation_duration_seconds.labels(tier=demo_account_type).observe(duration_ms / 1000) - demo_sessions_created_total.labels(tier=demo_account_type, status=overall_status).inc() - - # Update alert and insight metrics if available - if result.get("alert_generation"): - alert_gen = result["alert_generation"] - for alert_type, alerts in alert_gen.items(): - if isinstance(alerts, dict) and alerts.get("alerts_generated"): - demo_alerts_generated_total.labels( - tier=demo_account_type, - 
alert_type=alert_type - ).inc(alerts["alerts_generated"]) - - if result.get("ai_insights_generation"): - insights_gen = result["ai_insights_generation"] - for insight_type, insights in insights_gen.items(): - if isinstance(insights, dict) and insights.get("insights_posted"): - demo_ai_insights_generated_total.labels( - tier=demo_account_type, - insight_type=insight_type - ).inc(insights["insights_posted"]) return result @@ -549,20 +524,6 @@ class CloneOrchestrator: duration_ms=duration_ms ) - demo_cross_service_calls_total.labels( - source_service="demo-session", - target_service=service.name, - status="success" - ).inc() - demo_cross_service_call_duration_seconds.labels( - source_service="demo-session", - target_service=service.name - ).observe(duration_seconds) - demo_service_clone_duration_seconds.labels( - tier=demo_account_type, - service=service.name - ).observe(duration_seconds) - if response.status_code == 200: result = response.json() logger.info( @@ -582,17 +543,6 @@ class CloneOrchestrator: response_text=response.text ) - demo_cross_service_calls_total.labels( - source_service="demo-session", - target_service=service.name, - status="failed" - ).inc() - demo_cloning_errors_total.labels( - tier=demo_account_type, - service=service.name, - error_type="http_error" - ).inc() - return { "service": service.name, "status": "failed", @@ -614,22 +564,6 @@ class CloneOrchestrator: url=service.url ) - # Update error metrics - demo_cross_service_calls_total.labels( - source_service="demo-session", - target_service=service.name, - status="failed" - ).inc() - demo_cloning_errors_total.labels( - tier=demo_account_type, - service=service.name, - error_type="timeout" - ).inc() - demo_service_clone_duration_seconds.labels( - tier=demo_account_type, - service=service.name - ).observe(duration_seconds) - return { "service": service.name, "status": "failed", @@ -650,22 +584,6 @@ class CloneOrchestrator: exc_info=True ) - # Update error metrics - 
demo_cross_service_calls_total.labels( - source_service="demo-session", - target_service=service.name, - status="failed" - ).inc() - demo_cloning_errors_total.labels( - tier=demo_account_type, - service=service.name, - error_type="network_error" - ).inc() - demo_service_clone_duration_seconds.labels( - tier=demo_account_type, - service=service.name - ).observe(duration_seconds) - return { "service": service.name, "status": "failed", @@ -686,22 +604,6 @@ class CloneOrchestrator: exc_info=True ) - # Update error metrics - demo_cross_service_calls_total.labels( - source_service="demo-session", - target_service=service.name, - status="failed" - ).inc() - demo_cloning_errors_total.labels( - tier=demo_account_type, - service=service.name, - error_type="exception" - ).inc() - demo_service_clone_duration_seconds.labels( - tier=demo_account_type, - service=service.name - ).observe(duration_seconds) - return { "service": service.name, "status": "failed",