Improve monitoring

This commit is contained in:
Urtzi Alfaro
2026-01-09 06:57:18 +01:00
parent e8fda39e50
commit 4af860c010
16 changed files with 333 additions and 635 deletions

View File

@@ -63,6 +63,35 @@ Monitoring:
Applying security configurations...
""")
# Create Docker Hub secret for image pulls (if credentials are available)
local_resource(
'dockerhub-secret',
cmd='''
echo "🐳 Setting up Docker Hub image pull secret..."
# Check if Docker Hub credentials are available
if [ -n "$DOCKERHUB_USERNAME" ] && [ -n "$DOCKERHUB_PASSWORD" ]; then
echo " Found DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables"
./infrastructure/kubernetes/create-dockerhub-secret.sh
elif [ -f "$HOME/.docker/config.json" ]; then
echo " Attempting to use Docker CLI credentials..."
./infrastructure/kubernetes/create-dockerhub-secret.sh
else
echo " ⚠️ Docker Hub credentials not found"
echo " To enable automatic Docker Hub authentication:"
echo " 1. Run 'docker login', OR"
echo " 2. Set environment variables:"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo ""
echo " Continuing without Docker Hub authentication..."
echo " (This is OK for local development using local registry)"
fi
''',
labels=['00-security'],
auto_init=True
)
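Once Tilt has run this resource, it can be re-triggered and checked from the CLI; a minimal sketch, assuming the resource and secret names used above:

tilt trigger dockerhub-secret
kubectl get secret dockerhub-creds -n bakery-ia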
# Apply security configurations before loading main manifests
local_resource(
'security-setup',
@@ -75,6 +104,7 @@ local_resource(
kubectl apply -f infrastructure/kubernetes/base/configmaps/postgres-logging-config.yaml
echo "✅ Security configurations applied"
''',
resource_deps=['dockerhub-secret'],
labels=['00-security'],
auto_init=True
)
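resource_deps guarantees Tilt finishes dockerhub-secret before security-setup runs. A quick post-run check that the logging configmap landed (the object name is assumed from the manifest filename; adjust if it differs):

kubectl get configmap postgres-logging-config -n bakery-ia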
@@ -338,10 +368,20 @@ local_resource(
echo "📊 Deploying SigNoz Monitoring Stack..."
echo ""
# Ensure Docker Hub secret exists in bakery-ia namespace
echo "🔐 Ensuring Docker Hub secret exists in bakery-ia namespace..."
if ! kubectl get secret dockerhub-creds -n bakery-ia &>/dev/null; then
echo " ⚠️ Docker Hub secret not found, attempting to create..."
./infrastructure/kubernetes/create-dockerhub-secret.sh || echo " Continuing without Docker Hub authentication..."
else
echo " ✅ Docker Hub secret exists"
fi
echo ""
# Check if SigNoz is already deployed
if helm list -n signoz | grep -q signoz; then
if helm list -n bakery-ia | grep -q signoz; then
echo "✅ SigNoz already deployed, checking status..."
helm status signoz -n signoz
helm status signoz -n bakery-ia
else
echo "🚀 Installing SigNoz..."

View File

@@ -37,6 +37,14 @@ show_help() {
$0 prod # Deploy to production
$0 --upgrade prod # Upgrade production deployment
$0 --remove dev # Remove development deployment"
echo ""
echo "Docker Hub Authentication:"
echo " This script automatically creates a Docker Hub secret for image pulls."
echo " Provide credentials via environment variables (recommended):"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-personal-access-token'"
echo " Or ensure you're logged in with Docker CLI:"
echo " docker login"
}
# Parse command line arguments
@@ -124,6 +132,82 @@ ensure_namespace() {
fi
}
# Function to create Docker Hub secret for image pulls
create_dockerhub_secret() {
echo "${BLUE}Setting up Docker Hub image pull secret...${NC}"
if [[ "$DRY_RUN" == true ]]; then
echo " (dry-run) Would create Docker Hub secret in namespace $NAMESPACE"
return
fi
# Check if secret already exists
if kubectl get secret dockerhub-creds -n "$NAMESPACE" &> /dev/null; then
echo "${GREEN}Docker Hub secret already exists in namespace $NAMESPACE.${NC}"
return
fi
# Check if Docker Hub credentials are available
if [[ -n "$DOCKERHUB_USERNAME" ]] && [[ -n "$DOCKERHUB_PASSWORD" ]]; then
echo "${BLUE}Found DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables${NC}"
kubectl create secret docker-registry dockerhub-creds \
--docker-server=https://index.docker.io/v1/ \
--docker-username="$DOCKERHUB_USERNAME" \
--docker-password="$DOCKERHUB_PASSWORD" \
--docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \
-n "$NAMESPACE"
echo "${GREEN}Docker Hub secret created successfully.${NC}"
elif [[ -f "$HOME/.docker/config.json" ]]; then
echo "${BLUE}Attempting to use Docker CLI credentials...${NC}"
# Try to extract credentials from Docker config
if grep -q "credsStore" "$HOME/.docker/config.json"; then
echo "${YELLOW}Docker is using a credential store. Please set environment variables:${NC}"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo "${YELLOW}Continuing without Docker Hub authentication...${NC}"
return
fi
# Try to extract from base64 encoded auth
AUTH=$(jq -r '.auths["https://index.docker.io/v1/"].auth // empty' "$HOME/.docker/config.json" 2>/dev/null)
if [[ -n "$AUTH" ]]; then
echo "${GREEN}Found Docker Hub credentials in Docker config${NC}"
local DOCKER_USERNAME=$(echo "$AUTH" | base64 -d | cut -d: -f1)
local DOCKER_PASSWORD=$(echo "$AUTH" | base64 -d | cut -d: -f2-)
kubectl create secret docker-registry dockerhub-creds \
--docker-server=https://index.docker.io/v1/ \
--docker-username="$DOCKER_USERNAME" \
--docker-password="$DOCKER_PASSWORD" \
--docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \
-n "$NAMESPACE"
echo "${GREEN}Docker Hub secret created successfully.${NC}"
else
echo "${YELLOW}Could not find Docker Hub credentials${NC}"
echo "${YELLOW}To enable automatic Docker Hub authentication:${NC}"
echo " 1. Run 'docker login', OR"
echo " 2. Set environment variables:"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo "${YELLOW}Continuing without Docker Hub authentication...${NC}"
fi
else
echo "${YELLOW}Docker Hub credentials not found${NC}"
echo "${YELLOW}To enable automatic Docker Hub authentication:${NC}"
echo " 1. Run 'docker login', OR"
echo " 2. Set environment variables:"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo "${YELLOW}Continuing without Docker Hub authentication...${NC}"
fi
echo ""
}
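The auth field this function decodes is the base64 of "username:password" that docker login writes when no credential store is configured. A manual preview of exactly what the function will see:

jq -r '.auths["https://index.docker.io/v1/"].auth // empty' "$HOME/.docker/config.json" | base64 -d
# Prints username:password-or-token; empty output means a credsStore is in use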
# Function to deploy SigNoz
deploy_signoz() {
local values_file="infrastructure/helm/signoz-values-$ENVIRONMENT.yaml"
@@ -278,12 +362,15 @@ main() {
# Ensure namespace
ensure_namespace
if [[ "$REMOVE" == true ]]; then
remove_signoz
exit 0
fi
# Create Docker Hub secret for image pulls
create_dockerhub_secret
# Deploy SigNoz
deploy_signoz
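After deploy_signoz returns, pods may still be pulling images; if the secret step was skipped, this is where ImagePullBackOff surfaces. A sketch (namespace assumed from the release; this commit also deploys into bakery-ia, so adjust accordingly):

kubectl get pods -n signoz -w
kubectl get events -n signoz --field-selector reason=Failed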

View File

@@ -7,74 +7,41 @@
global:
storageClass: "standard"
domain: "monitoring.bakery-ia.local"
# Docker Hub credentials for pulling images
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
imagePullSecrets:
- name: dockerhub-creds
- dockerhub-creds
# Frontend Configuration
frontend:
# Docker Hub credentials for pulling images (root level for SigNoz components)
imagePullSecrets:
- dockerhub-creds
# SigNoz Main Component (includes frontend and query service)
signoz:
replicaCount: 1
image:
repository: signoz/frontend
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 3301
ingress:
enabled: true
className: nginx
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
hosts:
- host: monitoring.bakery-ia.local
paths:
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
tls: []
resources:
requests:
cpu: 25m # Reduced for local dev
memory: 64Mi # Reduced for local dev
limits:
cpu: 200m
memory: 256Mi
env:
- name: FRONTEND_REFRESH_INTERVAL
value: "30000"
- name: BASE_URL
value: "https://monitoring.bakery-ia.local/signoz"
# Query Service Configuration
queryService:
replicaCount: 1
image:
repository: signoz/query-service
tag: 0.52.3
pullPolicy: IfNotPresent
service:
type: ClusterIP
port: 8080
ingress:
enabled: true
className: nginx
annotations: {}
hosts:
- host: monitoring.bakery-ia.local
paths:
- path: /
pathType: Prefix
port: 8080
tls: []
resources:
requests:
cpu: 50m # Reduced for local dev
memory: 128Mi # Reduced for local dev
cpu: 100m # Combined frontend + query service
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
env:
- name: DEPLOYMENT_TYPE
value: "kubernetes-helm"
- name: SIGNOZ_LOCAL_DB_PATH
value: "/var/lib/signoz"
cpu: 1000m
memory: 1Gi
persistence:
enabled: true
@@ -135,6 +102,10 @@ clickhouse:
cpu: 1000m
memory: 1Gi
# Zookeeper Configuration (required by ClickHouse)
zookeeper:
enabled: true
# OpenTelemetry Collector - Data ingestion endpoint for all telemetry
otelCollector:
enabled: true
@@ -262,8 +233,8 @@ otelCollector:
timeout: 10s
# ClickHouse exporter for metrics
clickhousemetricswrite:
endpoint: tcp://signoz-clickhouse:9000/?database=signoz_metrics
signozclickhousemetrics:
dsn: "tcp://admin:27ff0399-0d3a-4bd8-919d-17c2181e6fb9@signoz-clickhouse:9000/signoz_metrics"
timeout: 10s
# ClickHouse exporter for logs
@@ -271,9 +242,9 @@ otelCollector:
dsn: tcp://signoz-clickhouse:9000/?database=signoz_logs
timeout: 10s
# Logging exporter for debugging (optional)
logging:
loglevel: info
# Debug exporter for debugging (optional)
debug:
verbosity: detailed
service:
pipelines:
@@ -287,7 +258,7 @@ otelCollector:
metrics:
receivers: [otlp, postgresql/auth, postgresql/inventory, postgresql/orders, redis, rabbitmq]
processors: [memory_limiter, batch, resourcedetection]
exporters: [clickhousemetricswrite]
exporters: [signozclickhousemetrics]
# Logs pipeline
logs:
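To confirm the renamed metrics exporter is wired through the pipeline, a smoke test can push one span through the collector's OTLP/HTTP port (4318, matching the service endpoints used elsewhere in this commit). A sketch, assuming the collector service name and namespace:

kubectl port-forward -n bakery-ia svc/signoz-otel-collector 4318:4318 &
curl -s -X POST http://localhost:4318/v1/traces \
  -H 'Content-Type: application/json' \
  -d '{"resourceSpans":[{"resource":{"attributes":[{"key":"service.name","value":{"stringValue":"smoke-test"}}]},"scopeSpans":[{"spans":[{"traceId":"5b8efff798038103d269b633813fc60c","spanId":"eee19b7ec3c1b174","name":"ping","kind":1,"startTimeUnixNano":"1700000000000000000","endTimeUnixNano":"1700000001000000000"}]}]}]}'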

View File

@@ -7,6 +7,13 @@
global:
storageClass: "standard"
domain: "monitoring.bakewise.ai"
# Docker Hub credentials - applied to all sub-charts (including Zookeeper, ClickHouse, etc)
imagePullSecrets:
- dockerhub-creds
# Docker Hub credentials for pulling images (root level for SigNoz components)
imagePullSecrets:
- dockerhub-creds
# Frontend Configuration
frontend:
@@ -351,8 +358,8 @@ otelCollector:
max_interval: 30s
max_elapsed_time: 300s
clickhousemetricswrite:
endpoint: tcp://clickhouse:9000/?database=signoz_metrics
signozclickhousemetrics:
endpoint: "tcp://clickhouse:9000/?database=signoz_metrics"
timeout: 10s
retry_on_failure:
enabled: true
@@ -369,9 +376,9 @@ otelCollector:
max_interval: 30s
max_elapsed_time: 300s
# Minimal logging for prod
logging:
loglevel: warn
# Debug exporter for debugging (replaces deprecated logging exporter)
debug:
verbosity: detailed
sampling_initial: 2
sampling_thereafter: 500
@@ -381,17 +388,17 @@ otelCollector:
traces:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousetraces, logging]
exporters: [clickhousetraces, debug]
metrics:
receivers: [otlp, prometheus]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhousemetricswrite]
exporters: [signozclickhousemetrics]
logs:
receivers: [otlp]
processors: [memory_limiter, batch, resourcedetection, resource]
exporters: [clickhouselogsexporter, logging]
exporters: [clickhouselogsexporter, debug]
# OpenTelemetry Collector Deployment Mode
otelCollectorDeployment:

View File

@@ -1,125 +0,0 @@
#!/bin/bash
# Script to add imagePullSecrets to all Kubernetes deployments, jobs, and cronjobs
# This ensures all pods can pull images from Docker Hub using the dockerhub-creds secret
SECRET_NAME="dockerhub-creds"
BASE_DIR="/Users/urtzialfaro/Documents/bakery-ia/infrastructure/kubernetes"
# ANSI color codes
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${BLUE}Adding imagePullSecrets to all Kubernetes resources...${NC}"
echo "======================================================"
echo ""
# Counter for files processed
count=0
# Function to add imagePullSecrets to a file
add_image_pull_secrets() {
local file="$1"
# Check if file already has imagePullSecrets
if grep -q "imagePullSecrets:" "$file"; then
echo -e "${YELLOW} ⊘ Skipping (already has imagePullSecrets): $(basename $file)${NC}"
return
fi
# Temporary file for processing
temp_file=$(mktemp)
# Process the file using awk to add imagePullSecrets after "spec:" in template or job spec
awk '
/^ spec:$/ && !done {
print $0
print " imagePullSecrets:"
print " - name: dockerhub-creds"
done = 1
next
}
{ print }
' "$file" > "$temp_file"
# Check if changes were made
if ! cmp -s "$file" "$temp_file"; then
mv "$temp_file" "$file"
echo -e "${GREEN} ✓ Updated: $(basename $file)${NC}"
((count++))
else
rm "$temp_file"
echo -e "${YELLOW} ⊘ No changes needed: $(basename $file)${NC}"
fi
}
# Process all service deployments
echo -e "${BLUE}Processing service deployments...${NC}"
find $BASE_DIR/base/components -name "*-service.yaml" | while read file; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process all database deployments
echo -e "${BLUE}Processing database deployments...${NC}"
for file in $BASE_DIR/base/components/databases/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process all migration jobs
echo -e "${BLUE}Processing migration jobs...${NC}"
for file in $BASE_DIR/base/migrations/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process all cronjobs
echo -e "${BLUE}Processing cronjobs...${NC}"
for file in $BASE_DIR/base/cronjobs/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process standalone jobs
echo -e "${BLUE}Processing standalone jobs...${NC}"
for file in $BASE_DIR/base/jobs/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process deployments directory
echo -e "${BLUE}Processing deployments...${NC}"
for file in $BASE_DIR/base/deployments/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process nominatim service
if [ -f "$BASE_DIR/base/components/infrastructure/nominatim.yaml" ]; then
echo -e "${BLUE}Processing nominatim service...${NC}"
add_image_pull_secrets "$BASE_DIR/base/components/infrastructure/nominatim.yaml"
echo ""
fi
echo "======================================================"
echo -e "${GREEN}Completed! Updated $count file(s)${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Apply to cluster: kubectl apply -k infrastructure/kubernetes/overlays/dev"
echo "3. Verify pods are running: kubectl get pods -n bakery-ia"

View File

@@ -1,94 +0,0 @@
#!/bin/bash
# Script to add OpenTelemetry monitoring configuration to all service deployments
# This adds the necessary environment variables for SigNoz integration
# Note: No Prometheus annotations needed - all metrics go via OTLP push
set -e
SERVICES=(
"ai-insights"
"distribution"
"external"
"forecasting"
"inventory"
"notification"
"orchestrator"
"orders"
"pos"
"procurement"
"production"
"recipes"
"sales"
"suppliers"
"tenant"
"training"
"frontend"
)
echo "Adding OpenTelemetry configuration to all services..."
echo ""
for service in "${SERVICES[@]}"; do
SERVICE_FILE="infrastructure/kubernetes/base/components/${service}/${service}-service.yaml"
if [ ! -f "$SERVICE_FILE" ]; then
echo "⚠️ Skipping $service (file not found: $SERVICE_FILE)"
continue
fi
echo "📝 Processing $service-service..."
# Check if already has OTEL env vars
if grep -q "OTEL_COLLECTOR_ENDPOINT" "$SERVICE_FILE"; then
echo " ✓ Already has OpenTelemetry configuration"
else
echo " + Adding OpenTelemetry environment variables"
# Create a YAML patch
cat > "/tmp/${service}-otel-patch.yaml" << 'EOF'
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "SERVICE_NAME_PLACEHOLDER"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration (all via OTLP, no Prometheus)
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
EOF
# Replace placeholder with actual service name
sed -i.bak "s/SERVICE_NAME_PLACEHOLDER/${service}-service/g" "/tmp/${service}-otel-patch.yaml"
echo " ⚠️ Manual step required: Add env vars from /tmp/${service}-otel-patch.yaml"
echo " Insert after 'ports:' section and before 'envFrom:' in $SERVICE_FILE"
fi
echo "$service-service processed"
echo ""
done
echo ""
echo "✅ Monitoring configuration prepared for all services!"
echo ""
echo "Next steps:"
echo "1. Review the changes and manually add env vars from /tmp/*-otel-patch.yaml files"
echo "2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml"
echo "3. Restart services: kubectl rollout restart deployment -n bakery-ia"
echo "4. Check SigNoz UI at https://monitoring.bakery-ia.local for incoming data"
echo ""
echo "What metrics you'll see:"
echo " - HTTP requests (method, endpoint, status code, duration)"
echo " - System metrics (CPU, memory usage per process)"
echo " - System-wide metrics (total CPU, memory, disk I/O, network I/O)"
echo " - Custom business metrics (registrations, orders, etc.)"
echo " - All pushed via OpenTelemetry OTLP (no Prometheus scraping)"

View File

@@ -1,162 +0,0 @@
#!/usr/bin/env python3
"""
Script to automatically add OpenTelemetry monitoring configuration to all service deployments.
This adds environment variables for metrics, logs, and traces export to SigNoz.
"""
import os
import re
import sys
from pathlib import Path
# Services to configure
SERVICES = [
"ai-insights",
"distribution",
"external",
"forecasting",
"inventory",
"notification",
"orchestrator",
"orders",
"pos",
"procurement",
"production",
"recipes",
"sales",
"suppliers",
"tenant",
"training",
]
OTEL_ENV_VARS_TEMPLATE = """ env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "{service_name}"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration (all via OTLP, no Prometheus)
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
"""
def has_otel_config(content: str) -> bool:
"""Check if file already has OTEL configuration"""
return "OTEL_COLLECTOR_ENDPOINT" in content
def add_otel_config(content: str, service_name: str) -> str:
"""Add OTEL configuration to service deployment"""
# Prepare the env vars with the service name
env_vars = OTEL_ENV_VARS_TEMPLATE.format(service_name=f"{service_name}-service")
# Find the container section and add env vars before envFrom
# Pattern: find " containers:" then first " envFrom:" after it
pattern = r'( containers:\n - name: [^\n]+\n image: [^\n]+\n(?: ports:\n(?: - [^\n]+\n)+)?)( envFrom:)'
replacement = r'\1' + env_vars + r'\2'
# Try to replace
new_content = re.sub(pattern, replacement, content, count=1)
if new_content == content:
print(f" ⚠️ Warning: Could not find insertion point automatically")
return content
return new_content
def process_service(service_name: str, base_path: Path) -> bool:
"""Process a single service deployment file"""
service_file = base_path / "components" / service_name / f"{service_name}-service.yaml"
if not service_file.exists():
print(f" ⚠️ File not found: {service_file}")
return False
# Read file
with open(service_file, 'r') as f:
content = f.read()
# Check if already configured
if has_otel_config(content):
print(f" ✓ Already configured")
return True
# Add configuration
new_content = add_otel_config(content, service_name)
if new_content == content:
return False
# Write back
with open(service_file, 'w') as f:
f.write(new_content)
print(f" ✅ Updated successfully")
return True
def main():
"""Main function"""
# Find base path
script_dir = Path(__file__).parent
base_path = script_dir / "base"
if not base_path.exists():
print(f"❌ Error: Base path not found: {base_path}")
sys.exit(1)
print("=" * 60)
print("Adding OpenTelemetry Monitoring Configuration")
print("=" * 60)
print()
success_count = 0
skip_count = 0
fail_count = 0
for service in SERVICES:
print(f"📝 Processing {service}-service...")
result = process_service(service, base_path)
if result:
if has_otel_config(open(base_path / "components" / service / f"{service}-service.yaml").read()):
success_count += 1
else:
fail_count += 1
print()
print("=" * 60)
print(f"✅ Successfully configured: {success_count}")
if fail_count > 0:
print(f"⚠️ Failed to configure: {fail_count}")
print("=" * 60)
print()
print("Next steps:")
print("1. Review the changes: git diff infrastructure/kubernetes/base/components/")
print("2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml")
print("3. Apply changes: kubectl apply -k infrastructure/kubernetes/overlays/dev/")
print("4. Verify: kubectl logs -n bakery-ia deployment/<service-name> | grep -i 'otel\\|metrics'")
if __name__ == "__main__":
main()

View File

@@ -52,7 +52,7 @@ spec:
name: whatsapp-secrets
env:
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://otel-collector.monitoring.svc.cluster.local:4317"
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
resources:
requests:
memory: "256Mi"

View File

@@ -291,7 +291,7 @@ data:
HEALTH_CHECK_INTERVAL: "30"
# Monitoring Configuration - SigNoz
SIGNOZ_ROOT_URL: "http://localhost/signoz"
SIGNOZ_ROOT_URL: "https://monitoring.bakery-ia.local"
# ================================================================
# DATA COLLECTION SETTINGS
@@ -390,9 +390,9 @@ data:
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=development"
# SigNoz Endpoints
SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "http://signoz-frontend.signoz.svc.cluster.local:3301"
# SigNoz Endpoints (v0.106.0+ unified service)
SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakery-ia.local"
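A quick probe that the unified service answers on 8080 (the /api/v1/health path is an assumption based on the SigNoz query API; adjust to whatever probe the chart exposes):

kubectl run curl-check --rm -it --restart=Never --image=curlimages/curl:8.5.0 \
  --command -- curl -s http://signoz.signoz.svc.cluster.local:8080/api/v1/health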
# ================================================================
# REPLENISHMENT PLANNING SETTINGS

View File

@@ -63,22 +63,7 @@ spec:
name: gateway-service
port:
number: 8000
- host: monitoring.your-domain.com # To be overridden in overlays
http:
paths:
# SigNoz Frontend UI and API (consolidated in newer versions)
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz
port:
number: 8080
# SigNoz API endpoints
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz
port:
number: 8080
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace
# SigNoz creates its own Ingress via Helm chart configuration
# Access at: https://monitoring.your-domain.com/ (configured in signoz-values.yaml)
# SigNoz ingress is managed separately - no need to configure here

View File

@@ -0,0 +1,126 @@
#!/bin/bash
# =============================================================================
# Create Docker Hub Image Pull Secret
# =============================================================================
# This script creates a Kubernetes secret for pulling images from Docker Hub.
# The secret is used by both:
# 1. bakery-ia namespace deployments (Tilt + Kustomize)
# 2. SigNoz Helm deployment
#
# Usage:
# ./create-dockerhub-secret.sh
#
# Prerequisites:
# - kubectl configured with access to the cluster
# - DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables set
# - OR Docker CLI logged in (docker login)
# =============================================================================
set -e
echo "🔐 Creating Docker Hub Image Pull Secret"
echo "=========================================="
echo ""
# Check for required environment variables
if [ -z "$DOCKERHUB_USERNAME" ] || [ -z "$DOCKERHUB_PASSWORD" ]; then
echo "⚠️ DOCKERHUB_USERNAME and DOCKERHUB_PASSWORD environment variables not set"
echo ""
echo "Checking if Docker CLI is logged in..."
# Try to extract credentials from Docker config
if [ -f "$HOME/.docker/config.json" ]; then
# Check if using credential store
if grep -q "credsStore" "$HOME/.docker/config.json"; then
echo "⚠️ Docker is using a credential store. Please set environment variables manually:"
echo ""
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo ""
exit 1
fi
# Try to extract from base64 encoded auth
AUTH=$(jq -r '.auths["https://index.docker.io/v1/"].auth // empty' "$HOME/.docker/config.json" 2>/dev/null)
if [ -n "$AUTH" ]; then
echo "✅ Found Docker Hub credentials in Docker config"
DOCKERHUB_USERNAME=$(echo "$AUTH" | base64 -d | cut -d: -f1)
DOCKERHUB_PASSWORD=$(echo "$AUTH" | base64 -d | cut -d: -f2-)
else
echo "❌ Could not find Docker Hub credentials"
echo ""
echo "Please either:"
echo " 1. Run 'docker login' first, OR"
echo " 2. Set environment variables:"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo ""
exit 1
fi
else
echo "❌ Docker config not found and environment variables not set"
echo ""
echo "Please set environment variables:"
echo " export DOCKERHUB_USERNAME='your-username'"
echo " export DOCKERHUB_PASSWORD='your-password-or-token'"
echo ""
exit 1
fi
fi
echo "Using Docker Hub username: $DOCKERHUB_USERNAME"
echo ""
# Function to create secret in a namespace
create_secret_in_namespace() {
local NAMESPACE=$1
echo "📦 Creating secret in namespace: $NAMESPACE"
# Create namespace if it doesn't exist
if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
echo " Creating namespace $NAMESPACE..."
kubectl create namespace "$NAMESPACE"
fi
# Delete existing secret if it exists
if kubectl get secret dockerhub-creds -n "$NAMESPACE" &>/dev/null; then
echo " Deleting existing secret..."
kubectl delete secret dockerhub-creds -n "$NAMESPACE"
fi
# Create the secret
kubectl create secret docker-registry dockerhub-creds \
--docker-server=https://index.docker.io/v1/ \
--docker-username="$DOCKERHUB_USERNAME" \
--docker-password="$DOCKERHUB_PASSWORD" \
--docker-email="${DOCKERHUB_EMAIL:-noreply@bakery-ia.local}" \
-n "$NAMESPACE"
echo " ✅ Secret created successfully"
echo ""
}
# Create secret in bakery-ia namespace (for Tilt deployments)
create_secret_in_namespace "bakery-ia"
# Create secret in signoz namespace (for SigNoz Helm deployment - if namespace exists)
if kubectl get namespace signoz &>/dev/null; then
create_secret_in_namespace "signoz"
else
echo " Signoz namespace not found, skipping (will be created on Helm install)"
echo ""
fi
echo "✅ Docker Hub secrets created successfully!"
echo ""
echo "The secret 'dockerhub-creds' is now available in:"
echo " - bakery-ia namespace (for Tilt/Kustomize deployments)"
if kubectl get namespace signoz &>/dev/null; then
echo " - signoz namespace (for Signoz Helm deployment)"
fi
echo ""
echo "All pods with imagePullSecrets: dockerhub-creds will now use these credentials"
echo "to pull images from Docker Hub."
echo ""

View File

@@ -73,14 +73,7 @@ spec:
name: gateway-service
port:
number: 8000
- host: monitoring.bakery-ia.local
http:
paths:
# SigNoz Frontend UI
- path: /
pathType: Prefix
backend:
service:
name: signoz
port:
number: 8080
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace
# SigNoz creates its own Ingress via Helm chart configuration (signoz-values-dev.yaml)
# Access at: https://monitoring.bakery-ia.local/
# SigNoz is served at the root of the monitoring subdomain
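Since the ingress now lives with the Helm release rather than in this manifest, a post-deploy sanity check (namespace per the values file; adjust if SigNoz was installed into bakery-ia instead):

kubectl get ingress -n signoz
# Expect a host entry for monitoring.bakery-ia.local backed by the signoz service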

View File

@@ -73,13 +73,13 @@ patches:
value: "deployment.environment=production,cluster.name=bakery-ia-prod"
- op: add
path: /data/SIGNOZ_ENDPOINT
value: "http://signoz-query-service.signoz.svc.cluster.local:8080"
value: "http://signoz.signoz.svc.cluster.local:8080"
- op: add
path: /data/SIGNOZ_FRONTEND_URL
value: "https://monitoring.bakewise.ai/signoz"
value: "https://monitoring.bakewise.ai"
- op: add
path: /data/SIGNOZ_ROOT_URL
value: "https://monitoring.bakewise.ai/signoz"
value: "https://monitoring.bakewise.ai"
- op: add
path: /data/RATE_LIMIT_ENABLED
value: "true"
@@ -119,12 +119,12 @@ patches:
limits:
memory: "4Gi"
cpu: "1000m"
# SigNoz Query Service production configuration
# SigNoz Main Service production configuration (v0.106.0+ unified service)
- target:
group: apps
version: v1
kind: Deployment
name: signoz-query-service
kind: StatefulSet
name: signoz
namespace: signoz
patch: |-
- op: replace
@@ -134,11 +134,11 @@ patches:
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "4Gi"
cpu: "2000m"
# SigNoz AlertManager production configuration
- target:
group: apps
@@ -159,26 +159,6 @@ patches:
limits:
memory: "1Gi"
cpu: "500m"
# SigNoz Frontend production configuration
- target:
group: apps
version: v1
kind: Deployment
name: signoz-frontend
namespace: signoz
patch: |-
- op: replace
path: /spec/replicas
value: 2
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
images:
- name: bakery/auth-service
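Because the patch target changed from Deployment signoz-query-service to StatefulSet signoz, a render-time check catches a mismatch before apply; the overlay path is assumed from the repo layout referenced elsewhere in this commit:

kubectl kustomize infrastructure/kubernetes/overlays/prod | grep -n 'kind: StatefulSet' -A 2
# After apply, the live object should be the StatefulSet:
kubectl get statefulset signoz -n signoz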

View File

@@ -28,10 +28,10 @@ data:
OTEL_SERVICE_NAME: "bakery-ia"
OTEL_RESOURCE_ATTRIBUTES: "deployment.environment=production,cluster.name=bakery-ia-prod"
# SigNoz Endpoints
SIGNOZ_ENDPOINT: "http://signoz-query-service.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai/signoz"
SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai/signoz"
# SigNoz Endpoints (v0.106.0+ unified service)
SIGNOZ_ENDPOINT: "http://signoz.signoz.svc.cluster.local:8080"
SIGNOZ_FRONTEND_URL: "https://monitoring.bakewise.ai"
SIGNOZ_ROOT_URL: "https://monitoring.bakewise.ai"
# Rate Limiting (stricter in production)
RATE_LIMIT_ENABLED: "true"

View File

@@ -360,18 +360,6 @@ class DemoCleanupService:
logger.info("Demo session cleanup completed", stats=stats)
# Update Prometheus metrics
duration_ms = int((datetime.now(timezone.utc) - start_time).total_seconds() * 1000)
demo_session_cleanup_duration_seconds.labels(tier="all").observe(duration_ms / 1000)
# Update deleted sessions metrics by tier (we need to determine tiers from sessions)
for session in all_sessions_to_cleanup:
demo_sessions_deleted_total.labels(
tier=session.demo_account_type,
status="success"
).inc()
demo_sessions_active.labels(tier=session.demo_account_type).dec()
return stats
async def cleanup_old_destroyed_sessions(self, days: int = 7) -> int:

View File

@@ -284,9 +284,7 @@ class CloneOrchestrator:
)
start_time = datetime.now(timezone.utc)
# Update active sessions metric
demo_sessions_active.labels(tier=demo_account_type).inc()
# Filter services if specified
services_to_clone = self.services
@@ -383,29 +381,6 @@ class CloneOrchestrator:
services_status=all_services,
demo_account_type=demo_account_type
)
# Update Prometheus metrics
demo_session_creation_duration_seconds.labels(tier=demo_account_type).observe(duration_ms / 1000)
demo_sessions_created_total.labels(tier=demo_account_type, status=overall_status).inc()
# Update alert and insight metrics if available
if result.get("alert_generation"):
alert_gen = result["alert_generation"]
for alert_type, alerts in alert_gen.items():
if isinstance(alerts, dict) and alerts.get("alerts_generated"):
demo_alerts_generated_total.labels(
tier=demo_account_type,
alert_type=alert_type
).inc(alerts["alerts_generated"])
if result.get("ai_insights_generation"):
insights_gen = result["ai_insights_generation"]
for insight_type, insights in insights_gen.items():
if isinstance(insights, dict) and insights.get("insights_posted"):
demo_ai_insights_generated_total.labels(
tier=demo_account_type,
insight_type=insight_type
).inc(insights["insights_posted"])
return result
@@ -549,20 +524,6 @@ class CloneOrchestrator:
duration_ms=duration_ms
)
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="success"
).inc()
demo_cross_service_call_duration_seconds.labels(
source_service="demo-session",
target_service=service.name
).observe(duration_seconds)
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
if response.status_code == 200:
result = response.json()
logger.info(
@@ -582,17 +543,6 @@ class CloneOrchestrator:
response_text=response.text
)
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="http_error"
).inc()
return {
"service": service.name,
"status": "failed",
@@ -614,22 +564,6 @@ class CloneOrchestrator:
url=service.url
)
# Update error metrics
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="timeout"
).inc()
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
return {
"service": service.name,
"status": "failed",
@@ -650,22 +584,6 @@ class CloneOrchestrator:
exc_info=True
)
# Update error metrics
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="network_error"
).inc()
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
return {
"service": service.name,
"status": "failed",
@@ -686,22 +604,6 @@ class CloneOrchestrator:
exc_info=True
)
# Update error metrics
demo_cross_service_calls_total.labels(
source_service="demo-session",
target_service=service.name,
status="failed"
).inc()
demo_cloning_errors_total.labels(
tier=demo_account_type,
service=service.name,
error_type="exception"
).inc()
demo_service_clone_duration_seconds.labels(
tier=demo_account_type,
service=service.name
).observe(duration_seconds)
return {
"service": service.name,
"status": "failed",