Improve monitoring

This commit is contained in:
Urtzi Alfaro
2026-01-09 06:57:18 +01:00
parent e8fda39e50
commit 4af860c010
16 changed files with 333 additions and 635 deletions

View File

@@ -1,125 +0,0 @@
#!/bin/bash
# Script to add imagePullSecrets to all Kubernetes deployments, jobs, and cronjobs
# This ensures all pods can pull images from Docker Hub using the dockerhub-creds secret
# Name of the image pull secret referenced by the patched manifests.
SECRET_NAME="dockerhub-creds"
# Root of the Kubernetes manifests. Defaults to the original author's checkout;
# export BASE_DIR before running to point the script at a different checkout.
BASE_DIR="${BASE_DIR:-/Users/urtzialfaro/Documents/bakery-ia/infrastructure/kubernetes}"
# ANSI color codes
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${BLUE}Adding imagePullSecrets to all Kubernetes resources...${NC}"
echo "======================================================"
echo ""
# Counter for files processed (incremented by add_image_pull_secrets)
count=0
#######################################
# Add an imagePullSecrets entry to a manifest file, in place.
# The entry is inserted after the first line matching the awk pattern below;
# files that already mention "imagePullSecrets:" are left untouched.
# Globals:   SECRET_NAME (read, falls back to dockerhub-creds),
#            GREEN/YELLOW/NC (read), count (incremented on update)
# Arguments: $1 - path of the manifest to patch
# Outputs:   one status line on stdout
#######################################
add_image_pull_secrets() {
  local file="$1"
  local name temp_file
  name=$(basename -- "$file")

  # Check if file already has imagePullSecrets
  if grep -q "imagePullSecrets:" "$file"; then
    echo -e "${YELLOW} ⊘ Skipping (already has imagePullSecrets): ${name}${NC}"
    return
  fi

  # Temporary file for processing
  temp_file=$(mktemp) || return 1

  # Insert the secret reference after the first matching "spec:" line only.
  awk -v secret="${SECRET_NAME:-dockerhub-creds}" '
    /^ spec:$/ && !done {
      print $0
      print " imagePullSecrets:"
      print " - name: " secret
      done = 1
      next
    }
    { print }
  ' "$file" > "$temp_file"

  # Check if changes were made
  if ! cmp -s "$file" "$temp_file"; then
    # Write through the existing file instead of mv: mktemp creates the temp
    # file with mode 0600, and moving it over the manifest would silently
    # change the manifest's permissions.
    cat "$temp_file" > "$file"
    rm -f "$temp_file"
    echo -e "${GREEN} ✓ Updated: ${name}${NC}"
    count=$((count + 1))
  else
    rm -f "$temp_file"
    echo -e "${YELLOW} ⊘ No changes needed: ${name}${NC}"
  fi
}
# Process all service deployments
echo -e "${BLUE}Processing service deployments...${NC}"
# Feed find's output through process substitution rather than a pipe: a
# "find | while" loop runs in a subshell, so the count increments made by
# add_image_pull_secrets would be discarded before the final summary.
# NUL-delimited names keep paths with spaces or newlines intact.
while IFS= read -r -d '' file; do
  if [ -f "$file" ]; then
    add_image_pull_secrets "$file"
  fi
done < <(find "$BASE_DIR/base/components" -name "*-service.yaml" -print0)
echo ""

# Patch every *.yaml file directly inside a directory.
# Arguments: $1 - label used in the "Processing <label>..." banner
#            $2 - directory to scan (non-recursive)
# Prints the banner first and a blank line afterwards. A directory with no
# matches leaves the literal glob, which the -f test filters out.
process_yaml_dir() {
  local label="$1" dir="$2" file
  echo -e "${BLUE}Processing ${label}...${NC}"
  for file in "$dir"/*.yaml; do
    if [ -f "$file" ]; then
      add_image_pull_secrets "$file"
    fi
  done
  echo ""
}

# Process all database deployments
process_yaml_dir "database deployments" "$BASE_DIR/base/components/databases"
# Process all migration jobs
process_yaml_dir "migration jobs" "$BASE_DIR/base/migrations"
# Process all cronjobs
process_yaml_dir "cronjobs" "$BASE_DIR/base/cronjobs"
# Process standalone jobs
process_yaml_dir "standalone jobs" "$BASE_DIR/base/jobs"
# Process deployments directory
process_yaml_dir "deployments" "$BASE_DIR/base/deployments"

# Process nominatim service
if [ -f "$BASE_DIR/base/components/infrastructure/nominatim.yaml" ]; then
  echo -e "${BLUE}Processing nominatim service...${NC}"
  add_image_pull_secrets "$BASE_DIR/base/components/infrastructure/nominatim.yaml"
  echo ""
fi

echo "======================================================"
echo -e "${GREEN}Completed! Updated $count file(s)${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Apply to cluster: kubectl apply -k infrastructure/kubernetes/overlays/dev"
echo "3. Verify pods are running: kubectl get pods -n bakery-ia"

View File

@@ -1,94 +0,0 @@
#!/bin/bash
# Script to add OpenTelemetry monitoring configuration to all service deployments
# This adds the necessary environment variables for SigNoz integration
# Note: No Prometheus annotations needed - all metrics go via OTLP push
# Abort on the first failing command.
set -e

# Services whose deployment manifests are checked for OpenTelemetry settings.
# Each entry <name> maps to components/<name>/<name>-service.yaml.
SERVICES=(
  ai-insights distribution external forecasting
  inventory notification orchestrator orders
  pos procurement production recipes
  sales suppliers tenant training
  frontend
)
echo "Adding OpenTelemetry configuration to all services..."
echo ""

# For every service whose manifest lacks OTEL settings, write a ready-to-paste
# env snippet to /tmp/<service>-otel-patch.yaml. The manifests themselves are
# never modified; the operator copies the snippet in by hand.
for service in "${SERVICES[@]}"; do
  SERVICE_FILE="infrastructure/kubernetes/base/components/${service}/${service}-service.yaml"
  if [ ! -f "$SERVICE_FILE" ]; then
    echo "⚠️ Skipping $service (file not found: $SERVICE_FILE)"
    continue
  fi
  echo "📝 Processing $service-service..."
  # Check if already has OTEL env vars
  if grep -q "OTEL_COLLECTOR_ENDPOINT" "$SERVICE_FILE"; then
    echo " ✓ Already has OpenTelemetry configuration"
  else
    echo " + Adding OpenTelemetry environment variables"
    # Create a YAML patch. The heredoc delimiter is unquoted so the service
    # name expands directly; this replaces the former placeholder + "sed -i.bak"
    # step, which left a stray .bak file in /tmp for every service.
    cat > "/tmp/${service}-otel-patch.yaml" << EOF
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "${service}-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration (all via OTLP, no Prometheus)
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
EOF
    echo " ⚠️ Manual step required: Add env vars from /tmp/${service}-otel-patch.yaml"
    echo " Insert after 'ports:' section and before 'envFrom:' in $SERVICE_FILE"
  fi
  echo "$service-service processed"
  echo ""
done

echo ""
echo "✅ Monitoring configuration prepared for all services!"
echo ""
echo "Next steps:"
echo "1. Review the changes and manually add env vars from /tmp/*-otel-patch.yaml files"
echo "2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml"
echo "3. Restart services: kubectl rollout restart deployment -n bakery-ia"
echo "4. Check SigNoz UI at https://monitoring.bakery-ia.local for incoming data"
echo ""
echo "What metrics you'll see:"
echo " - HTTP requests (method, endpoint, status code, duration)"
echo " - System metrics (CPU, memory usage per process)"
echo " - System-wide metrics (total CPU, memory, disk I/O, network I/O)"
echo " - Custom business metrics (registrations, orders, etc.)"
echo " - All pushed via OpenTelemetry OTLP (no Prometheus scraping)"

View File

@@ -1,162 +0,0 @@
#!/usr/bin/env python3
"""
Script to automatically add OpenTelemetry monitoring configuration to all service deployments.
This adds environment variables for metrics, logs, and traces export to SigNoz.
"""
import os
import re
import sys
from pathlib import Path
# Services to configure
# Each name is expanded by process_service into
# <base>/components/<name>/<name>-service.yaml.
SERVICES = [
"ai-insights",
"distribution",
"external",
"forecasting",
"inventory",
"notification",
"orchestrator",
"orders",
"pos",
"procurement",
"production",
"recipes",
"sales",
"suppliers",
"tenant",
"training",
]
# Environment block that add_otel_config splices in ahead of "envFrom:".
# The only placeholder is {service_name}, filled via str.format().
# NOTE(review): the text is inserted verbatim, so its leading whitespace must
# match the container-level indentation of the target manifests - confirm
# against a real *-service.yaml before relying on it.
OTEL_ENV_VARS_TEMPLATE = """ env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "{service_name}"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration (all via OTLP, no Prometheus)
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
"""
def has_otel_config(content: str) -> bool:
    """Report whether the manifest text already mentions the OTEL collector variable."""
    marker = "OTEL_COLLECTOR_ENDPOINT"
    return content.find(marker) != -1
def add_otel_config(content: str, service_name: str) -> str:
    """Insert the OTEL env block into a service deployment manifest.

    The block is built from OTEL_ENV_VARS_TEMPLATE with the service name set to
    "<service_name>-service" and placed immediately before the first
    "envFrom:" that follows the container's name/image (and optional ports)
    lines.

    Args:
        content: full text of the deployment manifest.
        service_name: service name without the "-service" suffix.

    Returns:
        The updated manifest text, or ``content`` unchanged (after printing a
        warning) when no insertion point matches.
    """
    # Prepare the env vars with the service name
    env_vars = OTEL_ENV_VARS_TEMPLATE.format(service_name=f"{service_name}-service")
    # Pattern: the container header (name, image, optional ports list)
    # followed by the "envFrom:" key the block is inserted in front of.
    pattern = r'( containers:\n - name: [^\n]+\n image: [^\n]+\n(?: ports:\n(?: - [^\n]+\n)+)?)( envFrom:)'

    def _insert_env(match):
        # A callable replacement inserts env_vars literally. Passing it as a
        # replacement *string* would make re interpret any backslash or
        # \g<...> sequence it happens to contain.
        return match.group(1) + env_vars + match.group(2)

    new_content, replaced = re.subn(pattern, _insert_env, content, count=1)
    if not replaced:
        print(" ⚠️ Warning: Could not find insertion point automatically")
        return content
    return new_content
def process_service(service_name: str, base_path: Path) -> bool:
    """Add the OTEL env block to one service's deployment manifest.

    Args:
        service_name: service name without the "-service" suffix.
        base_path: directory that contains the "components" tree.

    Returns:
        True when the manifest already had the configuration or was rewritten
        with it; False when the file is missing or no insertion point was found.
    """
    manifest = base_path / "components" / service_name / f"{service_name}-service.yaml"

    if not manifest.exists():
        print(f" ⚠️ File not found: {manifest}")
        return False

    original = manifest.read_text()

    # Nothing to do when the manifest is already configured.
    if has_otel_config(original):
        print(" ✓ Already configured")
        return True

    updated = add_otel_config(original, service_name)
    if updated == original:
        # add_otel_config has already printed its warning.
        return False

    manifest.write_text(updated)
    print(" ✅ Updated successfully")
    return True
def main():
    """Configure every service in SERVICES and print a summary.

    Looks for a "base" directory next to this script and exits with status 1
    when it is missing. Each service is handed to process_service; a True
    result counts as configured and a False result as failed.
    """
    # Find base path
    script_dir = Path(__file__).parent
    base_path = script_dir / "base"
    if not base_path.exists():
        print(f"❌ Error: Base path not found: {base_path}")
        sys.exit(1)

    print("=" * 60)
    print("Adding OpenTelemetry Monitoring Configuration")
    print("=" * 60)
    print()

    success_count = 0
    fail_count = 0
    for service in SERVICES:
        print(f"📝 Processing {service}-service...")
        # process_service returns True only when the manifest already had, or
        # has just been given, the OTEL block. Re-reading the file to confirm
        # was redundant and left an unclosed file handle behind.
        if process_service(service, base_path):
            success_count += 1
        else:
            fail_count += 1
        print()

    print("=" * 60)
    print(f"✅ Successfully configured: {success_count}")
    if fail_count > 0:
        print(f"⚠️ Failed to configure: {fail_count}")
    print("=" * 60)
    print()
    print("Next steps:")
    print("1. Review the changes: git diff infrastructure/kubernetes/base/components/")
    print("2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml")
    print("3. Apply changes: kubectl apply -k infrastructure/kubernetes/overlays/dev/")
    print("4. Verify: kubectl logs -n bakery-ia deployment/<service-name> | grep -i 'otel\\|metrics'")


if __name__ == "__main__":
    main()

View File

@@ -52,7 +52,7 @@ spec:
name: whatsapp-secrets
env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://otel-collector.monitoring.svc.cluster.local:4317"
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
resources:
requests:
memory: "256Mi"

View File

@@ -291,7 +291,7 @@ data:
HEALTH_CHECK_INTERVAL: "30"
# Monitoring Configuration - SigNoz
SIGNOZ_ROOT_URL: "http://localhost/signoz"
SIGNOZ_ROOT_URL: "https://monitoring.bakery-ia.local"
# ================================================================
# DATA COLLECTION SETTINGS
@@ -390,9 +390,9 @@ data:
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
# SigNoz Endpoints
SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301"
# SigNoz Endpoints (v0.106.0+ unified service)
SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
# ================================================================
# REPLENISHMENT PLANNING SETTINGS

View File

@@ -63,22 +63,7 @@ spec:
name: gateway-service
port:
number: 8000
- host: monitoring.your-domain.com # To be overridden in overlays
http:
paths:
# SigNoz Frontend UI and API (consolidated in newer versions)
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz
port:
number: 8080
# SigNoz API endpoints
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz
port:
number: 8080
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace
# SigNoz creates its own Ingress via Helm chart configuration
# Access at: https://monitoring.your-domain.com/ (configured in signoz-values.yaml)
# SigNoz ingress is managed separately - no need to configure here

View File

@@ -0,0 +1,126 @@
#!/bin/bash
# =============================================================================
# Create Docker Hub Image Pull Secret
# =============================================================================
# This script creates a Kubernetes secret for pulling images from Docker Hub.
# The secret is used by both:
# 1. bakery-ia namespace deployments (Tilt + Kustomize)
# 2. Signoz Helm deployment
#
# Usage:
# ./create-dockerhub-secret.sh
#
# Prerequisites:
# - kubectl configured with access to the cluster
# - DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables set
# - OR Docker CLI logged in (docker login)
# =============================================================================
set -e

echo "🔐 Creating Docker Hub Image Pull Secret"
echo "=========================================="
echo ""

# Resolve DOCKERHUB_USERNAME / DOCKERHUB_PASSWORD.
# Environment variables win; otherwise fall back to the base64 "auth" entry
# for Docker Hub in ~/.docker/config.json. Every failure path prints guidance
# and exits 1.
if [ -z "$DOCKERHUB_USERNAME" ] || [ -z "$DOCKERHUB_PASSWORD" ]; then
  echo "⚠️ DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables not set"
  echo ""
  echo "Checking if Docker CLI is logged in..."
  # Try to extract credentials from Docker config
  if [ -f "$HOME/.docker/config.json" ]; then
    # Check if using credential store
    if grep -q "credsStore" "$HOME/.docker/config.json"; then
      echo "⚠️ Docker is using a credential store. Please set environment variables manually:"
      echo ""
      echo " export DOCKERHUB_USERNAME='your-username'"
      echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
      echo ""
      exit 1
    fi

    # jq is needed to read the config. Without this check a missing jq would
    # abort the script silently: set -e is active and stderr was discarded.
    if ! command -v jq >/dev/null 2>&1; then
      echo "❌ 'jq' is required to read credentials from the Docker config but was not found"
      echo ""
      echo "Please install jq, or set environment variables:"
      echo " export DOCKERHUB_USERNAME='your-username'"
      echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
      echo ""
      exit 1
    fi

    # Try to extract from base64 encoded auth. A jq failure (e.g. malformed
    # JSON) is treated as "no credentials" so the message below is shown.
    AUTH=$(jq -r '.auths["https://index.docker.io/v1/"].auth // empty' "$HOME/.docker/config.json" 2>/dev/null) || AUTH=""

    # Decode once; an undecodable value is treated the same as a missing one.
    DECODED_AUTH=""
    if [ -n "$AUTH" ]; then
      DECODED_AUTH=$(printf '%s' "$AUTH" | base64 -d 2>/dev/null) || DECODED_AUTH=""
    fi

    case "$DECODED_AUTH" in
      ?*:?*)
        echo "✅ Found Docker Hub credentials in Docker config"
        # Split on the first colon only; the password may contain colons.
        DOCKERHUB_USERNAME=${DECODED_AUTH%%:*}
        DOCKERHUB_PASSWORD=${DECODED_AUTH#*:}
        ;;
      *)
        echo "❌ Could not find Docker Hub credentials"
        echo ""
        echo "Please either:"
        echo " 1. Run 'docker login' first, OR"
        echo " 2. Set environment variables:"
        echo " export DOCKERHUB_USERNAME='your-username'"
        echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
        echo ""
        exit 1
        ;;
    esac
  else
    echo "❌ Docker config not found and environment variables not set"
    echo ""
    echo "Please set environment variables:"
    echo " export DOCKERHUB_USERNAME='your-username'"
    echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
    echo ""
    exit 1
  fi
fi

echo "Using Docker Hub username: $DOCKERHUB_USERNAME"
echo ""
#######################################
# Create or update the 'dockerhub-creds' docker-registry secret in a namespace.
# Globals:   DOCKERHUB_USERNAME, DOCKERHUB_PASSWORD (read)
#            DOCKERHUB_EMAIL (read, optional; defaults to noreply@bakery-ia.local)
# Arguments: $1 - target namespace (created when it does not exist)
# Outputs:   progress messages on stdout
# Returns:   non-zero when rendering or applying the secret fails
#######################################
create_secret_in_namespace() {
  local NAMESPACE=$1
  local manifest

  echo "📦 Creating secret in namespace: $NAMESPACE"

  # Create namespace if it doesn't exist
  if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
    echo " Creating namespace $NAMESPACE..."
    kubectl create namespace "$NAMESPACE"
  fi

  # Render the secret client-side, then apply it. Unlike delete-then-create,
  # this never leaves the namespace without a pull secret if creation fails,
  # and it works whether or not the secret already exists.
  manifest=$(kubectl create secret docker-registry dockerhub-creds \
    --docker-server=https://index.docker.io/v1/ \
    --docker-username="$DOCKERHUB_USERNAME" \
    --docker-password="$DOCKERHUB_PASSWORD" \
    --docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \
    -n "$NAMESPACE" \
    --dry-run=client -o yaml) || return 1

  printf '%s\n' "$manifest" | kubectl apply -n "$NAMESPACE" -f - || return 1

  echo " ✅ Secret created successfully"
  echo ""
}
# Create secret in bakery-ia namespace (for Tilt deployments)
create_secret_in_namespace "bakery-ia"

# Create secret in signoz namespace (for Signoz Helm deployment - if namespace exists).
# The lookup result is remembered so the summary below reports what was actually
# done, rather than querying the cluster a second time and possibly disagreeing.
signoz_secret_created=false
if kubectl get namespace signoz &>/dev/null; then
  create_secret_in_namespace "signoz"
  signoz_secret_created=true
else
  echo " Signoz namespace not found, skipping (will be created on Helm install)"
  echo ""
fi

echo "✅ Docker Hub secrets created successfully!"
echo ""
echo "The secret 'dockerhub-creds' is now available in:"
echo " - bakery-ia namespace (for Tilt/Kustomize deployments)"
if [ "$signoz_secret_created" = true ]; then
  echo " - signoz namespace (for Signoz Helm deployment)"
fi
echo ""
echo "All pods with imagePullSecrets: dockerhub-creds will now use these credentials"
echo "to pull images from Docker Hub."
echo ""

View File

@@ -73,14 +73,7 @@ spec:
name: gateway-service
port:
number: 8000
- host: monitoring.bakery-ia.local
http:
paths:
# SigNoz Frontend UI
- path: /
pathType: Prefix
backend:
service:
name: signoz
port:
number: 8080
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace
# SigNoz creates its own Ingress via Helm chart configuration (signoz-values-dev.yaml)
# Access at: https://monitoring.bakery-ia.local/
# SigNoz is served at the root of the monitoring subdomain

View File

@@ -73,13 +73,13 @@ patches:
value: "deployment.environment=production,cluster.name=bakery-ia-prod"
- op: add
path: /data/SIGNOZ_ENDPOINT
value: "http://signoz-query-service.signoz.svc.cluster.local:8080"
value: "http://signoz.signoz.svc.cluster.local:8080"
- op: add
path: /data/SIGNOZ_FRONTEND_URL
value: "https://monitoring.bakewise.ai/signoz"
value: "https://monitoring.bakewise.ai"
- op: add
path: /data/SIGNOZ_ROOT_URL
value: "https://monitoring.bakewise.ai/signoz"
value: "https://monitoring.bakewise.ai"
- op: add
path: /data/RATE_LIMIT_ENABLED
value: "true"
@@ -119,12 +119,12 @@ patches:
limits:
memory: "4Gi"
cpu: "1000m"
# SigNoz Query Service production configuration
# SigNoz Main Service production configuration (v0.106.0+ unified service)
- target:
group: apps
version: v1
kind: Deployment
name: signoz-query-service
kind: StatefulSet
name: signoz
namespace: signoz
patch: |-
- op: replace
@@ -134,11 +134,11 @@ patches:
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "4Gi"
cpu: "2000m"
# SigNoz AlertManager production configuration
- target:
group: apps
@@ -159,26 +159,6 @@ patches:
limits:
memory: "1Gi"
cpu: "500m"
# SigNoz Frontend production configuration
- target:
group: apps
version: v1
kind: Deployment
name: signoz-frontend
namespace: signoz
patch: |-
- op: replace
path: /spec/replicas
value: 2
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
images:
- name: bakery/auth-service

View File

@@ -28,10 +28,10 @@ data:
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod"
# SigNoz Endpoints
SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz"
SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz"
# SigNoz Endpoints (v0.106.0+ unified service)
SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai"
SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai"
# Rate Limiting (stricter in production)
RATE_LIMIT_ENABLED: "true"