Update monitoring packages to latest versions
- Updated all OpenTelemetry packages to latest versions: - opentelemetry-api: 1.27.0 → 1.39.1 - opentelemetry-sdk: 1.27.0 → 1.39.1 - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1 - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1 - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1 - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1 - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1 - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1 - Removed prometheus-client==0.23.1 from all services - Unified all services to use the same monitoring package versions Generated by Mistral Vibe. Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
This commit is contained in:
125
infrastructure/kubernetes/add-image-pull-secrets.sh
Executable file
125
infrastructure/kubernetes/add-image-pull-secrets.sh
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Script to add imagePullSecrets to all Kubernetes deployments, jobs, and cronjobs
|
||||
# This ensures all pods can pull images from Docker Hub using the dockerhub-creds secret
|
||||
|
||||
SECRET_NAME="dockerhub-creds"
|
||||
BASE_DIR="/Users/urtzialfaro/Documents/bakery-ia/infrastructure/kubernetes"
|
||||
|
||||
# ANSI color codes
|
||||
GREEN='\033[0;32m'
|
||||
BLUE='\033[0;34m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
echo -e "${BLUE}Adding imagePullSecrets to all Kubernetes resources...${NC}"
|
||||
echo "======================================================"
|
||||
echo ""
|
||||
|
||||
# Counter for files processed
|
||||
count=0
|
||||
|
||||
# Function to add imagePullSecrets to a file
|
||||
add_image_pull_secrets() {
|
||||
local file="$1"
|
||||
|
||||
# Check if file already has imagePullSecrets
|
||||
if grep -q "imagePullSecrets:" "$file"; then
|
||||
echo -e "${YELLOW} ⊘ Skipping (already has imagePullSecrets): $(basename $file)${NC}"
|
||||
return
|
||||
fi
|
||||
|
||||
# Temporary file for processing
|
||||
temp_file=$(mktemp)
|
||||
|
||||
# Process the file using awk to add imagePullSecrets after "spec:" in template or job spec
|
||||
awk '
|
||||
/^ spec:$/ && !done {
|
||||
print $0
|
||||
print " imagePullSecrets:"
|
||||
print " - name: dockerhub-creds"
|
||||
done = 1
|
||||
next
|
||||
}
|
||||
{ print }
|
||||
' "$file" > "$temp_file"
|
||||
|
||||
# Check if changes were made
|
||||
if ! cmp -s "$file" "$temp_file"; then
|
||||
mv "$temp_file" "$file"
|
||||
echo -e "${GREEN} ✓ Updated: $(basename $file)${NC}"
|
||||
((count++))
|
||||
else
|
||||
rm "$temp_file"
|
||||
echo -e "${YELLOW} ⊘ No changes needed: $(basename $file)${NC}"
|
||||
fi
|
||||
}
|
||||
|
||||
# Process all service deployments
|
||||
echo -e "${BLUE}Processing service deployments...${NC}"
|
||||
find $BASE_DIR/base/components -name "*-service.yaml" | while read file; do
|
||||
if [ -f "$file" ]; then
|
||||
add_image_pull_secrets "$file"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Process all database deployments
|
||||
echo -e "${BLUE}Processing database deployments...${NC}"
|
||||
for file in $BASE_DIR/base/components/databases/*.yaml; do
|
||||
if [ -f "$file" ]; then
|
||||
add_image_pull_secrets "$file"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Process all migration jobs
|
||||
echo -e "${BLUE}Processing migration jobs...${NC}"
|
||||
for file in $BASE_DIR/base/migrations/*.yaml; do
|
||||
if [ -f "$file" ]; then
|
||||
add_image_pull_secrets "$file"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Process all cronjobs
|
||||
echo -e "${BLUE}Processing cronjobs...${NC}"
|
||||
for file in $BASE_DIR/base/cronjobs/*.yaml; do
|
||||
if [ -f "$file" ]; then
|
||||
add_image_pull_secrets "$file"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Process standalone jobs
|
||||
echo -e "${BLUE}Processing standalone jobs...${NC}"
|
||||
for file in $BASE_DIR/base/jobs/*.yaml; do
|
||||
if [ -f "$file" ]; then
|
||||
add_image_pull_secrets "$file"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Process deployments directory
|
||||
echo -e "${BLUE}Processing deployments...${NC}"
|
||||
for file in $BASE_DIR/base/deployments/*.yaml; do
|
||||
if [ -f "$file" ]; then
|
||||
add_image_pull_secrets "$file"
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Process nominatim service
|
||||
if [ -f "$BASE_DIR/base/components/infrastructure/nominatim.yaml" ]; then
|
||||
echo -e "${BLUE}Processing nominatim service...${NC}"
|
||||
add_image_pull_secrets "$BASE_DIR/base/components/infrastructure/nominatim.yaml"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
echo "======================================================"
|
||||
echo -e "${GREEN}Completed! Updated $count file(s)${NC}"
|
||||
echo ""
|
||||
echo "Next steps:"
|
||||
echo "1. Review the changes: git diff"
|
||||
echo "2. Apply to cluster: kubectl apply -k infrastructure/kubernetes/overlays/dev"
|
||||
echo "3. Verify pods are running: kubectl get pods -n bakery-ia"
|
||||
94
infrastructure/kubernetes/add-monitoring-config.sh
Executable file
94
infrastructure/kubernetes/add-monitoring-config.sh
Executable file
@@ -0,0 +1,94 @@
|
||||
#!/bin/bash
|
||||
# Script to add OpenTelemetry monitoring configuration to all service deployments
|
||||
# This adds the necessary environment variables for SigNoz integration
|
||||
# Note: No Prometheus annotations needed - all metrics go via OTLP push
|
||||
|
||||
set -e
|
||||
|
||||
SERVICES=(
|
||||
"ai-insights"
|
||||
"distribution"
|
||||
"external"
|
||||
"forecasting"
|
||||
"inventory"
|
||||
"notification"
|
||||
"orchestrator"
|
||||
"orders"
|
||||
"pos"
|
||||
"procurement"
|
||||
"production"
|
||||
"recipes"
|
||||
"sales"
|
||||
"suppliers"
|
||||
"tenant"
|
||||
"training"
|
||||
"frontend"
|
||||
)
|
||||
|
||||
echo "Adding OpenTelemetry configuration to all services..."
|
||||
echo ""
|
||||
|
||||
for service in "${SERVICES[@]}"; do
|
||||
SERVICE_FILE="infrastructure/kubernetes/base/components/${service}/${service}-service.yaml"
|
||||
|
||||
if [ ! -f "$SERVICE_FILE" ]; then
|
||||
echo "⚠️ Skipping $service (file not found: $SERVICE_FILE)"
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "📝 Processing $service-service..."
|
||||
|
||||
# Check if already has OTEL env vars
|
||||
if grep -q "OTEL_COLLECTOR_ENDPOINT" "$SERVICE_FILE"; then
|
||||
echo " ✓ Already has OpenTelemetry configuration"
|
||||
else
|
||||
echo " + Adding OpenTelemetry environment variables"
|
||||
# Create a YAML patch
|
||||
cat > "/tmp/${service}-otel-patch.yaml" << 'EOF'
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "SERVICE_NAME_PLACEHOLDER"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration (all via OTLP, no Prometheus)
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
EOF
|
||||
# Replace placeholder with actual service name
|
||||
sed -i.bak "s/SERVICE_NAME_PLACEHOLDER/${service}-service/g" "/tmp/${service}-otel-patch.yaml"
|
||||
|
||||
echo " ⚠️ Manual step required: Add env vars from /tmp/${service}-otel-patch.yaml"
|
||||
echo " Insert after 'ports:' section and before 'envFrom:' in $SERVICE_FILE"
|
||||
fi
|
||||
|
||||
echo " ✅ $service-service processed"
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "✅ Monitoring configuration prepared for all services!"
|
||||
echo ""
|
||||
echo "Next steps:"
|
||||
echo "1. Review the changes and manually add env vars from /tmp/*-otel-patch.yaml files"
|
||||
echo "2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml"
|
||||
echo "3. Restart services: kubectl rollout restart deployment -n bakery-ia"
|
||||
echo "4. Check SigNoz UI at https://monitoring.bakery-ia.local for incoming data"
|
||||
echo ""
|
||||
echo "What metrics you'll see:"
|
||||
echo " - HTTP requests (method, endpoint, status code, duration)"
|
||||
echo " - System metrics (CPU, memory usage per process)"
|
||||
echo " - System-wide metrics (total CPU, memory, disk I/O, network I/O)"
|
||||
echo " - Custom business metrics (registrations, orders, etc.)"
|
||||
echo " - All pushed via OpenTelemetry OTLP (no Prometheus scraping)"
|
||||
162
infrastructure/kubernetes/apply-monitoring-to-all.py
Executable file
162
infrastructure/kubernetes/apply-monitoring-to-all.py
Executable file
@@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to automatically add OpenTelemetry monitoring configuration to all service deployments.
|
||||
This adds environment variables for metrics, logs, and traces export to SigNoz.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Services to configure
|
||||
SERVICES = [
|
||||
"ai-insights",
|
||||
"distribution",
|
||||
"external",
|
||||
"forecasting",
|
||||
"inventory",
|
||||
"notification",
|
||||
"orchestrator",
|
||||
"orders",
|
||||
"pos",
|
||||
"procurement",
|
||||
"production",
|
||||
"recipes",
|
||||
"sales",
|
||||
"suppliers",
|
||||
"tenant",
|
||||
"training",
|
||||
]
|
||||
|
||||
OTEL_ENV_VARS_TEMPLATE = """ env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "{service_name}"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration (all via OTLP, no Prometheus)
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
"""
|
||||
|
||||
|
||||
def has_otel_config(content: str) -> bool:
|
||||
"""Check if file already has OTEL configuration"""
|
||||
return "OTEL_COLLECTOR_ENDPOINT" in content
|
||||
|
||||
|
||||
def add_otel_config(content: str, service_name: str) -> str:
|
||||
"""Add OTEL configuration to service deployment"""
|
||||
|
||||
# Prepare the env vars with the service name
|
||||
env_vars = OTEL_ENV_VARS_TEMPLATE.format(service_name=f"{service_name}-service")
|
||||
|
||||
# Find the container section and add env vars before envFrom
|
||||
# Pattern: find " containers:" then first " envFrom:" after it
|
||||
pattern = r'( containers:\n - name: [^\n]+\n image: [^\n]+\n(?: ports:\n(?: - [^\n]+\n)+)?)( envFrom:)'
|
||||
|
||||
replacement = r'\1' + env_vars + r'\2'
|
||||
|
||||
# Try to replace
|
||||
new_content = re.sub(pattern, replacement, content, count=1)
|
||||
|
||||
if new_content == content:
|
||||
print(f" ⚠️ Warning: Could not find insertion point automatically")
|
||||
return content
|
||||
|
||||
return new_content
|
||||
|
||||
|
||||
def process_service(service_name: str, base_path: Path) -> bool:
|
||||
"""Process a single service deployment file"""
|
||||
|
||||
service_file = base_path / "components" / service_name / f"{service_name}-service.yaml"
|
||||
|
||||
if not service_file.exists():
|
||||
print(f" ⚠️ File not found: {service_file}")
|
||||
return False
|
||||
|
||||
# Read file
|
||||
with open(service_file, 'r') as f:
|
||||
content = f.read()
|
||||
|
||||
# Check if already configured
|
||||
if has_otel_config(content):
|
||||
print(f" ✓ Already configured")
|
||||
return True
|
||||
|
||||
# Add configuration
|
||||
new_content = add_otel_config(content, service_name)
|
||||
|
||||
if new_content == content:
|
||||
return False
|
||||
|
||||
# Write back
|
||||
with open(service_file, 'w') as f:
|
||||
f.write(new_content)
|
||||
|
||||
print(f" ✅ Updated successfully")
|
||||
return True
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function"""
|
||||
|
||||
# Find base path
|
||||
script_dir = Path(__file__).parent
|
||||
base_path = script_dir / "base"
|
||||
|
||||
if not base_path.exists():
|
||||
print(f"❌ Error: Base path not found: {base_path}")
|
||||
sys.exit(1)
|
||||
|
||||
print("=" * 60)
|
||||
print("Adding OpenTelemetry Monitoring Configuration")
|
||||
print("=" * 60)
|
||||
print()
|
||||
|
||||
success_count = 0
|
||||
skip_count = 0
|
||||
fail_count = 0
|
||||
|
||||
for service in SERVICES:
|
||||
print(f"📝 Processing {service}-service...")
|
||||
|
||||
result = process_service(service, base_path)
|
||||
|
||||
if result:
|
||||
if has_otel_config(open(base_path / "components" / service / f"{service}-service.yaml").read()):
|
||||
success_count += 1
|
||||
else:
|
||||
fail_count += 1
|
||||
|
||||
print()
|
||||
|
||||
print("=" * 60)
|
||||
print(f"✅ Successfully configured: {success_count}")
|
||||
if fail_count > 0:
|
||||
print(f"⚠️ Failed to configure: {fail_count}")
|
||||
print("=" * 60)
|
||||
print()
|
||||
|
||||
print("Next steps:")
|
||||
print("1. Review the changes: git diff infrastructure/kubernetes/base/components/")
|
||||
print("2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml")
|
||||
print("3. Apply changes: kubectl apply -k infrastructure/kubernetes/overlays/dev/")
|
||||
print("4. Verify: kubectl logs -n bakery-ia deployment/<service-name> | grep -i 'otel\\|metrics'")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: ai-insights-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "ai-insights-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: auth-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -93,6 +95,21 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "auth-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: ai-insights-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: alert-processor-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: auth-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: distribution-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: external-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: forecasting-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: inventory-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: notification-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: orchestrator-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: orders-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: pos-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: {{SERVICE_NAME}}-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
containers:
|
||||
- name: postgres
|
||||
image: postgres:17-alpine
|
||||
@@ -121,4 +123,4 @@ spec:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
storage: 1Gi
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: procurement-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: production-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: rabbitmq
|
||||
app.kubernetes.io/component: message-broker
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
containers:
|
||||
- name: rabbitmq
|
||||
image: rabbitmq:4.1-management-alpine
|
||||
@@ -120,4 +122,4 @@ spec:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 2Gi
|
||||
storage: 2Gi
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: recipes-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: redis
|
||||
app.kubernetes.io/component: cache
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 999 # redis group
|
||||
initContainers:
|
||||
@@ -166,4 +168,4 @@ spec:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: 1Gi
|
||||
storage: 1Gi
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: sales-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: suppliers-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: tenant-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: training-db
|
||||
app.kubernetes.io/component: database
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
securityContext:
|
||||
fsGroup: 70
|
||||
initContainers:
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app: distribution-service
|
||||
tier: backend
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
containers:
|
||||
- name: distribution-service
|
||||
image: bakery/distribution-service:latest
|
||||
@@ -58,6 +60,25 @@ spec:
|
||||
value: "30"
|
||||
- name: HTTP_RETRIES
|
||||
value: "3"
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "distribution-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
@@ -107,4 +128,4 @@ spec:
|
||||
port: 8000
|
||||
targetPort: 8000
|
||||
name: http
|
||||
type: ClusterIP
|
||||
type: ClusterIP
|
||||
|
||||
@@ -23,6 +23,8 @@ spec:
|
||||
app.kubernetes.io/component: microservice
|
||||
version: "2.0"
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -85,6 +87,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "external-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: forecasting-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "forecasting-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: frontend
|
||||
app.kubernetes.io/component: frontend
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
containers:
|
||||
- name: frontend
|
||||
image: bakery/dashboard:latest
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: gateway
|
||||
app.kubernetes.io/component: gateway
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
containers:
|
||||
- name: gateway
|
||||
image: bakery/gateway:latest
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: inventory-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "inventory-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -1,501 +0,0 @@
|
||||
# Bakery IA - Production Monitoring Stack
|
||||
|
||||
This directory contains the complete production-ready monitoring infrastructure for the Bakery IA platform.
|
||||
|
||||
## 📊 Components
|
||||
|
||||
### Core Monitoring
|
||||
- **Prometheus v3.0.1** - Time-series metrics database (2 replicas with HA)
|
||||
- **Grafana v12.3.0** - Visualization and dashboarding
|
||||
- **AlertManager v0.27.0** - Alert routing and notification (3 replicas with HA)
|
||||
|
||||
### Distributed Tracing
|
||||
- **Jaeger v1.51** - Distributed tracing with persistent storage
|
||||
|
||||
### Exporters
|
||||
- **PostgreSQL Exporter v0.15.0** - Database metrics and health
|
||||
- **Node Exporter v1.7.0** - Infrastructure and OS-level metrics (DaemonSet)
|
||||
|
||||
## 🚀 Deployment
|
||||
|
||||
### Prerequisites
|
||||
1. Kubernetes cluster (v1.24+)
|
||||
2. kubectl configured
|
||||
3. kustomize (v4.0+) or kubectl with kustomize support
|
||||
4. Storage class available for PersistentVolumeClaims
|
||||
|
||||
### Production Deployment
|
||||
|
||||
```bash
|
||||
# 1. Update secrets with production values
|
||||
kubectl create secret generic grafana-admin \
|
||||
--from-literal=admin-user=admin \
|
||||
--from-literal=admin-password=$(openssl rand -base64 32) \
|
||||
--namespace monitoring --dry-run=client -o yaml > secrets.yaml
|
||||
|
||||
# 2. Update AlertManager SMTP credentials
|
||||
kubectl create secret generic alertmanager-secrets \
|
||||
--from-literal=smtp-host="smtp.gmail.com:587" \
|
||||
--from-literal=smtp-username="alerts@yourdomain.com" \
|
||||
--from-literal=smtp-password="YOUR_SMTP_PASSWORD" \
|
||||
--from-literal=smtp-from="alerts@yourdomain.com" \
|
||||
--from-literal=slack-webhook-url="https://hooks.slack.com/services/YOUR/WEBHOOK/URL" \
|
||||
--namespace monitoring --dry-run=client -o yaml >> secrets.yaml
|
||||
|
||||
# 3. Update PostgreSQL exporter connection string
|
||||
kubectl create secret generic postgres-exporter \
|
||||
--from-literal=data-source-name="postgresql://user:password@postgres.bakery-ia:5432/bakery?sslmode=require" \
|
||||
--namespace monitoring --dry-run=client -o yaml >> secrets.yaml
|
||||
|
||||
# 4. Deploy monitoring stack
|
||||
kubectl apply -k infrastructure/kubernetes/overlays/prod
|
||||
|
||||
# 5. Verify deployment
|
||||
kubectl get pods -n monitoring
|
||||
kubectl get pvc -n monitoring
|
||||
```
|
||||
|
||||
### Local Development Deployment
|
||||
|
||||
For local Kind clusters, monitoring is disabled by default to save resources. To enable:
|
||||
|
||||
```bash
|
||||
# Uncomment monitoring in overlays/dev/kustomization.yaml
|
||||
# Then apply:
|
||||
kubectl apply -k infrastructure/kubernetes/overlays/dev
|
||||
```
|
||||
|
||||
## 🔐 Security Configuration
|
||||
|
||||
### Important Security Notes
|
||||
|
||||
⚠️ **NEVER commit real secrets to Git!**
|
||||
|
||||
The `secrets.yaml` file contains placeholder values. In production, use one of:
|
||||
|
||||
1. **Sealed Secrets** (Recommended)
|
||||
```bash
|
||||
kubectl apply -f https://github.com/bitnami-labs/sealed-secrets/releases/download/v0.24.0/controller.yaml
|
||||
kubeseal --format=yaml < secrets.yaml > sealed-secrets.yaml
|
||||
```
|
||||
|
||||
2. **External Secrets Operator**
|
||||
```bash
|
||||
helm install external-secrets external-secrets/external-secrets -n external-secrets
|
||||
```
|
||||
|
||||
3. **Cloud Provider Secrets**
|
||||
- AWS Secrets Manager
|
||||
- GCP Secret Manager
|
||||
- Azure Key Vault
|
||||
|
||||
### Grafana Admin Password
|
||||
|
||||
Change the default password immediately:
|
||||
```bash
|
||||
# Generate strong password
|
||||
NEW_PASSWORD=$(openssl rand -base64 32)
|
||||
|
||||
# Update secret
|
||||
kubectl patch secret grafana-admin -n monitoring \
|
||||
-p="{\"data\":{\"admin-password\":\"$(echo -n $NEW_PASSWORD | base64)\"}}"
|
||||
|
||||
# Restart Grafana
|
||||
kubectl rollout restart deployment grafana -n monitoring
|
||||
```
|
||||
|
||||
## 📈 Accessing Monitoring Services
|
||||
|
||||
### Via Ingress (Production)
|
||||
|
||||
```
|
||||
https://monitoring.yourdomain.com/grafana
|
||||
https://monitoring.yourdomain.com/prometheus
|
||||
https://monitoring.yourdomain.com/alertmanager
|
||||
https://monitoring.yourdomain.com/jaeger
|
||||
```
|
||||
|
||||
### Via Port Forwarding (Development)
|
||||
|
||||
```bash
|
||||
# Grafana
|
||||
kubectl port-forward -n monitoring svc/grafana 3000:3000
|
||||
|
||||
# Prometheus
|
||||
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
|
||||
|
||||
# AlertManager
|
||||
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
|
||||
|
||||
# Jaeger
|
||||
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
|
||||
```
|
||||
|
||||
Then access:
|
||||
- Grafana: http://localhost:3000
|
||||
- Prometheus: http://localhost:9090
|
||||
- AlertManager: http://localhost:9093
|
||||
- Jaeger: http://localhost:16686
|
||||
|
||||
## 📊 Grafana Dashboards
|
||||
|
||||
### Pre-configured Dashboards
|
||||
|
||||
1. **Gateway Metrics** - API gateway performance
|
||||
- Request rate by endpoint
|
||||
- P95 latency
|
||||
- Error rates
|
||||
- Authentication metrics
|
||||
|
||||
2. **Services Overview** - Microservices health
|
||||
- Request rate by service
|
||||
- P99 latency
|
||||
- Error rates by service
|
||||
- Service health status
|
||||
|
||||
3. **Circuit Breakers** - Resilience patterns
|
||||
- Circuit breaker states
|
||||
- Trip rates
|
||||
- Rejected requests
|
||||
|
||||
4. **PostgreSQL Monitoring** - Database health
|
||||
- Connections, transactions, cache hit ratio
|
||||
- Slow queries, locks, replication lag
|
||||
|
||||
5. **Node Metrics** - Infrastructure monitoring
|
||||
- CPU, memory, disk, network per node
|
||||
|
||||
6. **AlertManager** - Alert management
|
||||
- Active alerts, firing rate, notifications
|
||||
|
||||
7. **Business Metrics** - KPIs
|
||||
- Service performance, tenant activity, ML metrics
|
||||
|
||||
### Creating Custom Dashboards
|
||||
|
||||
1. Login to Grafana (admin/[your-password])
|
||||
2. Click "+ → Dashboard"
|
||||
3. Add panels with Prometheus queries
|
||||
4. Save dashboard
|
||||
5. Export JSON and add to `grafana-dashboards.yaml`
|
||||
|
||||
## 🚨 Alert Configuration
|
||||
|
||||
### Alert Rules
|
||||
|
||||
Alert rules are defined in `alert-rules.yaml` and organized by category:
|
||||
|
||||
- **bakery_services** - Service health, errors, latency, memory
|
||||
- **bakery_business** - Training jobs, ML accuracy, API limits
|
||||
- **alert_system_health** - Alert system components, RabbitMQ, Redis
|
||||
- **alert_system_performance** - Processing errors, delivery failures
|
||||
- **alert_system_business** - Alert volume, response times
|
||||
- **alert_system_capacity** - Queue sizes, storage performance
|
||||
- **alert_system_critical** - System failures, data loss
|
||||
- **monitoring_health** - Prometheus, AlertManager self-monitoring
|
||||
|
||||
### Alert Routing
|
||||
|
||||
Alerts are routed based on:
|
||||
- **Severity** (critical, warning, info)
|
||||
- **Component** (alert-system, database, infrastructure)
|
||||
- **Service** name
|
||||
|
||||
### Notification Channels
|
||||
|
||||
Configure in `alertmanager.yaml`:
|
||||
|
||||
1. **Email** (default)
|
||||
- critical-alerts@yourdomain.com
|
||||
- oncall@yourdomain.com
|
||||
|
||||
2. **Slack** (optional, commented out)
|
||||
- Update slack-webhook-url in secrets
|
||||
- Uncomment slack_configs in alertmanager.yaml
|
||||
|
||||
3. **PagerDuty** (add if needed)
|
||||
```yaml
|
||||
pagerduty_configs:
|
||||
- routing_key: YOUR_ROUTING_KEY
|
||||
severity: '{{ .Labels.severity }}'
|
||||
```
|
||||
|
||||
### Testing Alerts
|
||||
|
||||
```bash
|
||||
# Fire a test alert
|
||||
kubectl run test-alert --image=busybox -n bakery-ia --restart=Never -- sleep 3600
|
||||
|
||||
# Check alert in Prometheus
|
||||
# Navigate to http://localhost:9090/alerts
|
||||
|
||||
# Check AlertManager
|
||||
# Navigate to http://localhost:9093
|
||||
```
|
||||
|
||||
## 🔍 Troubleshooting
|
||||
|
||||
### Prometheus Issues
|
||||
|
||||
```bash
|
||||
# Check Prometheus logs
|
||||
kubectl logs -n monitoring prometheus-0 -f
|
||||
|
||||
# Check Prometheus targets
|
||||
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
|
||||
# Visit http://localhost:9090/targets
|
||||
|
||||
# Check Prometheus configuration
|
||||
kubectl get configmap prometheus-config -n monitoring -o yaml
|
||||
```
|
||||
|
||||
### AlertManager Issues
|
||||
|
||||
```bash
|
||||
# Check AlertManager logs
|
||||
kubectl logs -n monitoring alertmanager-0 -f
|
||||
|
||||
# Check AlertManager configuration
|
||||
kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml
|
||||
|
||||
# Test SMTP connection
|
||||
kubectl exec -n monitoring alertmanager-0 -- \
|
||||
wget --spider --server-response --timeout=10 smtp://smtp.gmail.com:587
|
||||
```
|
||||
|
||||
### Grafana Issues
|
||||
|
||||
```bash
|
||||
# Check Grafana logs
|
||||
kubectl logs -n monitoring deployment/grafana -f
|
||||
|
||||
# Reset Grafana admin password
|
||||
kubectl exec -n monitoring deployment/grafana -- \
|
||||
grafana-cli admin reset-admin-password NEW_PASSWORD
|
||||
```
|
||||
|
||||
### PostgreSQL Exporter Issues
|
||||
|
||||
```bash
|
||||
# Check exporter logs
|
||||
kubectl logs -n monitoring deployment/postgres-exporter -f
|
||||
|
||||
# Test database connection
|
||||
kubectl exec -n monitoring deployment/postgres-exporter -- \
|
||||
wget -O- http://localhost:9187/metrics | grep pg_up
|
||||
```
|
||||
|
||||
### Node Exporter Issues
|
||||
|
||||
```bash
|
||||
# Check node exporter on specific node
|
||||
kubectl logs -n monitoring daemonset/node-exporter --selector=kubernetes.io/hostname=NODE_NAME -f
|
||||
|
||||
# Check metrics endpoint
|
||||
kubectl exec -n monitoring daemonset/node-exporter -- \
|
||||
wget -O- http://localhost:9100/metrics | head -n 20
|
||||
```
|
||||
|
||||
## 📏 Resource Requirements
|
||||
|
||||
### Minimum Requirements (Development)
|
||||
- CPU: 2 cores
|
||||
- Memory: 4Gi
|
||||
- Storage: 30Gi
|
||||
|
||||
### Recommended Requirements (Production)
|
||||
- CPU: 6-8 cores
|
||||
- Memory: 16Gi
|
||||
- Storage: 100Gi
|
||||
|
||||
### Component Resource Allocation
|
||||
|
||||
| Component | Replicas | CPU Request | Memory Request | CPU Limit | Memory Limit |
|
||||
|-----------|----------|-------------|----------------|-----------|--------------|
|
||||
| Prometheus | 2 | 500m | 1Gi | 1 | 2Gi |
|
||||
| AlertManager | 3 | 100m | 128Mi | 500m | 256Mi |
|
||||
| Grafana | 1 | 100m | 256Mi | 500m | 512Mi |
|
||||
| Postgres Exporter | 1 | 50m | 64Mi | 200m | 128Mi |
|
||||
| Node Exporter | 1/node | 50m | 64Mi | 200m | 128Mi |
|
||||
| Jaeger | 1 | 250m | 512Mi | 500m | 1Gi |
|
||||
|
||||
## 🔄 High Availability
|
||||
|
||||
### Prometheus HA
|
||||
|
||||
- 2 replicas in StatefulSet
|
||||
- Each has independent storage (volumeClaimTemplates)
|
||||
- Anti-affinity to spread across nodes
|
||||
- Both scrape the same targets independently
|
||||
- Use Thanos for long-term storage and global query view (future enhancement)
|
||||
|
||||
### AlertManager HA
|
||||
|
||||
- 3 replicas in StatefulSet
|
||||
- Clustered mode (gossip protocol)
|
||||
- Automatic leader election
|
||||
- Alert deduplication across instances
|
||||
- Anti-affinity to spread across nodes
|
||||
|
||||
### PodDisruptionBudgets
|
||||
|
||||
Ensure minimum availability during:
|
||||
- Node maintenance
|
||||
- Cluster upgrades
|
||||
- Rolling updates
|
||||
|
||||
```yaml
|
||||
Prometheus: minAvailable=1 (out of 2)
|
||||
AlertManager: minAvailable=2 (out of 3)
|
||||
Grafana: minAvailable=1 (out of 1)
|
||||
```
|
||||
|
||||
## 📊 Metrics Reference
|
||||
|
||||
### Application Metrics (from services)
|
||||
|
||||
```promql
|
||||
# HTTP request rate
|
||||
rate(http_requests_total[5m])
|
||||
|
||||
# HTTP error rate
|
||||
rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])
|
||||
|
||||
# Request latency (P95)
|
||||
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
|
||||
|
||||
# Active connections
|
||||
active_connections
|
||||
```
|
||||
|
||||
### PostgreSQL Metrics
|
||||
|
||||
```promql
|
||||
# Active connections
|
||||
pg_stat_database_numbackends
|
||||
|
||||
# Transaction rate
|
||||
rate(pg_stat_database_xact_commit[5m])
|
||||
|
||||
# Cache hit ratio
|
||||
rate(pg_stat_database_blks_hit[5m]) /
|
||||
(rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))
|
||||
|
||||
# Replication lag
|
||||
pg_replication_lag_seconds
|
||||
```
|
||||
|
||||
### Node Metrics
|
||||
|
||||
```promql
|
||||
# CPU usage
|
||||
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
|
||||
# Memory usage
|
||||
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
|
||||
|
||||
# Disk I/O
|
||||
rate(node_disk_read_bytes_total[5m])
|
||||
rate(node_disk_written_bytes_total[5m])
|
||||
|
||||
# Network traffic
|
||||
rate(node_network_receive_bytes_total[5m])
|
||||
rate(node_network_transmit_bytes_total[5m])
|
||||
```
|
||||
|
||||
## 🔗 Distributed Tracing
|
||||
|
||||
### Jaeger Configuration
|
||||
|
||||
Services automatically send traces when `JAEGER_ENABLED=true`:
|
||||
|
||||
```yaml
|
||||
# In prod-configmap.yaml
|
||||
JAEGER_ENABLED: "true"
|
||||
JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local"
|
||||
JAEGER_AGENT_PORT: "6831"
|
||||
```
|
||||
|
||||
### Viewing Traces
|
||||
|
||||
1. Access Jaeger UI: https://monitoring.yourdomain.com/jaeger
|
||||
2. Select service from dropdown
|
||||
3. Click "Find Traces"
|
||||
4. Explore trace details, spans, and timing
|
||||
|
||||
### Trace Sampling
|
||||
|
||||
Current sampling: 100% (all traces collected)
|
||||
|
||||
For high-traffic production:
|
||||
```yaml
|
||||
# Adjust in shared/monitoring/tracing.py
|
||||
JAEGER_SAMPLE_RATE: "0.1" # 10% of traces
|
||||
```
|
||||
|
||||
## 📚 Additional Resources
|
||||
|
||||
- [Prometheus Documentation](https://prometheus.io/docs/)
|
||||
- [Grafana Documentation](https://grafana.com/docs/)
|
||||
- [AlertManager Documentation](https://prometheus.io/docs/alerting/latest/alertmanager/)
|
||||
- [Jaeger Documentation](https://www.jaegertracing.io/docs/)
|
||||
- [PostgreSQL Exporter](https://github.com/prometheus-community/postgres_exporter)
|
||||
- [Node Exporter](https://github.com/prometheus/node_exporter)
|
||||
|
||||
## 🆘 Support
|
||||
|
||||
For monitoring issues:
|
||||
1. Check component logs (see Troubleshooting section)
|
||||
2. Verify Prometheus targets are UP
|
||||
3. Check AlertManager configuration and routing
|
||||
4. Review resource usage and quotas
|
||||
5. Contact platform team: platform-team@yourdomain.com
|
||||
|
||||
## 🔄 Maintenance
|
||||
|
||||
### Regular Tasks
|
||||
|
||||
**Daily:**
|
||||
- Review critical alerts
|
||||
- Check service health dashboards
|
||||
|
||||
**Weekly:**
|
||||
- Review alert noise and adjust thresholds
|
||||
- Check storage usage for Prometheus and Jaeger
|
||||
- Review slow queries in PostgreSQL dashboard
|
||||
|
||||
**Monthly:**
|
||||
- Update dashboard with new metrics
|
||||
- Review and update alert runbooks
|
||||
- Capacity planning based on trends
|
||||
|
||||
### Backup and Recovery
|
||||
|
||||
**Prometheus Data:**
|
||||
```bash
|
||||
# Backup Prometheus data
|
||||
kubectl exec -n monitoring prometheus-0 -- tar czf /tmp/prometheus-backup.tar.gz /prometheus
|
||||
kubectl cp monitoring/prometheus-0:/tmp/prometheus-backup.tar.gz ./prometheus-backup.tar.gz
|
||||
|
||||
# Restore (stop Prometheus first)
|
||||
kubectl cp ./prometheus-backup.tar.gz monitoring/prometheus-0:/tmp/
|
||||
kubectl exec -n monitoring prometheus-0 -- tar xzf /tmp/prometheus-backup.tar.gz -C /
|
||||
```
|
||||
|
||||
**Grafana Dashboards:**
|
||||
```bash
|
||||
# Export all dashboards via API
|
||||
curl -u admin:password http://localhost:3000/api/search | \
|
||||
jq -r '.[] | .uid' | \
|
||||
xargs -I{} curl -u admin:password http://localhost:3000/api/dashboards/uid/{} > dashboards-backup.json
|
||||
```
|
||||
|
||||
## 📝 Version History
|
||||
|
||||
- **v1.0.0** (2026-01-07) - Initial production-ready monitoring stack
|
||||
- Prometheus v3.0.1 with HA
|
||||
- AlertManager v0.27.0 with clustering
|
||||
- Grafana v12.3.0 with 7 dashboards
|
||||
- PostgreSQL and Node exporters
|
||||
- 50+ alert rules
|
||||
- Comprehensive documentation
|
||||
@@ -1,20 +0,0 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
# Minimal Monitoring Infrastructure
|
||||
# SigNoz is now managed via Helm in the 'signoz' namespace
|
||||
# This kustomization only maintains:
|
||||
# - Namespace for legacy resources (if needed)
|
||||
# - Node exporter for infrastructure metrics
|
||||
# - PostgreSQL exporter for database metrics
|
||||
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)
|
||||
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- secrets.yaml
|
||||
# Exporters for metrics collection
|
||||
- node-exporter.yaml
|
||||
- postgres-exporter.yaml
|
||||
# Optional: Keep OTEL collector or use SigNoz's built-in one
|
||||
# Uncomment if you want a dedicated OTEL collector in monitoring namespace
|
||||
# - otel-collector.yaml
|
||||
@@ -1,7 +0,0 @@
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: monitoring
|
||||
labels:
|
||||
name: monitoring
|
||||
app.kubernetes.io/part-of: bakery-ia
|
||||
@@ -1,103 +0,0 @@
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: node-exporter
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: node-exporter
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
rollingUpdate:
|
||||
maxUnavailable: 1
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: node-exporter
|
||||
spec:
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
tolerations:
|
||||
# Run on all nodes including master
|
||||
- operator: Exists
|
||||
effect: NoSchedule
|
||||
containers:
|
||||
- name: node-exporter
|
||||
image: quay.io/prometheus/node-exporter:v1.7.0
|
||||
args:
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/host/root'
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)'
|
||||
- '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
|
||||
- '--collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$'
|
||||
- '--collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$'
|
||||
- '--web.listen-address=:9100'
|
||||
ports:
|
||||
- containerPort: 9100
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
resources:
|
||||
requests:
|
||||
memory: "64Mi"
|
||||
cpu: "50m"
|
||||
limits:
|
||||
memory: "128Mi"
|
||||
cpu: "200m"
|
||||
volumeMounts:
|
||||
- name: sys
|
||||
mountPath: /host/sys
|
||||
mountPropagation: HostToContainer
|
||||
readOnly: true
|
||||
- name: root
|
||||
mountPath: /host/root
|
||||
mountPropagation: HostToContainer
|
||||
readOnly: true
|
||||
- name: proc
|
||||
mountPath: /host/proc
|
||||
mountPropagation: HostToContainer
|
||||
readOnly: true
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
volumes:
|
||||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: root
|
||||
hostPath:
|
||||
path: /
|
||||
- name: proc
|
||||
hostPath:
|
||||
path: /proc
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: node-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: node-exporter
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "9100"
|
||||
spec:
|
||||
clusterIP: None
|
||||
ports:
|
||||
- name: metrics
|
||||
port: 9100
|
||||
protocol: TCP
|
||||
targetPort: 9100
|
||||
selector:
|
||||
app: node-exporter
|
||||
@@ -1,167 +0,0 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: otel-collector-config
|
||||
namespace: monitoring
|
||||
data:
|
||||
otel-collector-config.yaml: |
|
||||
extensions:
|
||||
health_check:
|
||||
endpoint: 0.0.0.0:13133
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 10s
|
||||
send_batch_size: 1024
|
||||
|
||||
# Memory limiter to prevent OOM
|
||||
memory_limiter:
|
||||
check_interval: 1s
|
||||
limit_mib: 512
|
||||
spike_limit_mib: 128
|
||||
|
||||
exporters:
|
||||
# Export metrics to Prometheus
|
||||
prometheus:
|
||||
endpoint: "0.0.0.0:8889"
|
||||
namespace: otelcol
|
||||
const_labels:
|
||||
source: otel-collector
|
||||
|
||||
# Export to SigNoz
|
||||
otlp/signoz:
|
||||
endpoint: "signoz-query-service.monitoring.svc.cluster.local:8080"
|
||||
tls:
|
||||
insecure: true
|
||||
|
||||
# Logging exporter for debugging traces and logs
|
||||
logging:
|
||||
loglevel: info
|
||||
sampling_initial: 5
|
||||
sampling_thereafter: 200
|
||||
|
||||
service:
|
||||
extensions: [health_check]
|
||||
pipelines:
|
||||
# Traces pipeline: receive -> process -> export to SigNoz
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [otlp/signoz, logging]
|
||||
|
||||
# Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [prometheus, otlp/signoz]
|
||||
|
||||
# Logs pipeline: receive -> process -> export to SigNoz
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [otlp/signoz, logging]
|
||||
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: otel-collector
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: otel-collector
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: otel-collector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: otel-collector
|
||||
spec:
|
||||
containers:
|
||||
- name: otel-collector
|
||||
image: otel/opentelemetry-collector-contrib:0.91.0
|
||||
args:
|
||||
- --config=/conf/otel-collector-config.yaml
|
||||
ports:
|
||||
- containerPort: 4317
|
||||
protocol: TCP
|
||||
name: otlp-grpc
|
||||
- containerPort: 4318
|
||||
protocol: TCP
|
||||
name: otlp-http
|
||||
- containerPort: 8889
|
||||
protocol: TCP
|
||||
name: prometheus
|
||||
- containerPort: 13133
|
||||
protocol: TCP
|
||||
name: health-check
|
||||
volumeMounts:
|
||||
- name: otel-collector-config
|
||||
mountPath: /conf
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 13133
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 13133
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
volumes:
|
||||
- name: otel-collector-config
|
||||
configMap:
|
||||
name: otel-collector-config
|
||||
items:
|
||||
- key: otel-collector-config.yaml
|
||||
path: otel-collector-config.yaml
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: otel-collector
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: otel-collector
|
||||
annotations:
|
||||
prometheus.io/scrape: "true"
|
||||
prometheus.io/port: "8889"
|
||||
prometheus.io/path: "/metrics"
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 4317
|
||||
targetPort: 4317
|
||||
protocol: TCP
|
||||
name: otlp-grpc
|
||||
- port: 4318
|
||||
targetPort: 4318
|
||||
protocol: TCP
|
||||
name: otlp-http
|
||||
- port: 8889
|
||||
targetPort: 8889
|
||||
protocol: TCP
|
||||
name: prometheus
|
||||
selector:
|
||||
app: otel-collector
|
||||
@@ -1,306 +0,0 @@
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: postgres-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: postgres-exporter
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: postgres-exporter
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: postgres-exporter
|
||||
spec:
|
||||
containers:
|
||||
- name: postgres-exporter
|
||||
image: prometheuscommunity/postgres-exporter:v0.15.0
|
||||
ports:
|
||||
- containerPort: 9187
|
||||
name: metrics
|
||||
env:
|
||||
- name: DATA_SOURCE_NAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: postgres-exporter
|
||||
key: data-source-name
|
||||
# Enable extended metrics
|
||||
- name: PG_EXPORTER_EXTEND_QUERY_PATH
|
||||
value: "/etc/postgres-exporter/queries.yaml"
|
||||
# Disable default metrics (we'll use custom ones)
|
||||
- name: PG_EXPORTER_DISABLE_DEFAULT_METRICS
|
||||
value: "false"
|
||||
# Disable settings metrics (can be noisy)
|
||||
- name: PG_EXPORTER_DISABLE_SETTINGS_METRICS
|
||||
value: "false"
|
||||
volumeMounts:
|
||||
- name: queries
|
||||
mountPath: /etc/postgres-exporter
|
||||
resources:
|
||||
requests:
|
||||
memory: "64Mi"
|
||||
cpu: "50m"
|
||||
limits:
|
||||
memory: "128Mi"
|
||||
cpu: "200m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 9187
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 9187
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
volumes:
|
||||
- name: queries
|
||||
configMap:
|
||||
name: postgres-exporter-queries
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: postgres-exporter-queries
|
||||
namespace: monitoring
|
||||
data:
|
||||
queries.yaml: |
|
||||
# Custom PostgreSQL queries for bakery-ia metrics
|
||||
|
||||
pg_database:
|
||||
query: |
|
||||
SELECT
|
||||
datname,
|
||||
numbackends as connections,
|
||||
xact_commit as transactions_committed,
|
||||
xact_rollback as transactions_rolled_back,
|
||||
blks_read as blocks_read,
|
||||
blks_hit as blocks_hit,
|
||||
tup_returned as tuples_returned,
|
||||
tup_fetched as tuples_fetched,
|
||||
tup_inserted as tuples_inserted,
|
||||
tup_updated as tuples_updated,
|
||||
tup_deleted as tuples_deleted,
|
||||
conflicts as conflicts,
|
||||
temp_files as temp_files,
|
||||
temp_bytes as temp_bytes,
|
||||
deadlocks as deadlocks
|
||||
FROM pg_stat_database
|
||||
WHERE datname NOT IN ('template0', 'template1', 'postgres')
|
||||
metrics:
|
||||
- datname:
|
||||
usage: "LABEL"
|
||||
description: "Name of the database"
|
||||
- connections:
|
||||
usage: "GAUGE"
|
||||
description: "Number of backends currently connected to this database"
|
||||
- transactions_committed:
|
||||
usage: "COUNTER"
|
||||
description: "Number of transactions in this database that have been committed"
|
||||
- transactions_rolled_back:
|
||||
usage: "COUNTER"
|
||||
description: "Number of transactions in this database that have been rolled back"
|
||||
- blocks_read:
|
||||
usage: "COUNTER"
|
||||
description: "Number of disk blocks read in this database"
|
||||
- blocks_hit:
|
||||
usage: "COUNTER"
|
||||
description: "Number of times disk blocks were found in the buffer cache"
|
||||
- tuples_returned:
|
||||
usage: "COUNTER"
|
||||
description: "Number of rows returned by queries in this database"
|
||||
- tuples_fetched:
|
||||
usage: "COUNTER"
|
||||
description: "Number of rows fetched by queries in this database"
|
||||
- tuples_inserted:
|
||||
usage: "COUNTER"
|
||||
description: "Number of rows inserted by queries in this database"
|
||||
- tuples_updated:
|
||||
usage: "COUNTER"
|
||||
description: "Number of rows updated by queries in this database"
|
||||
- tuples_deleted:
|
||||
usage: "COUNTER"
|
||||
description: "Number of rows deleted by queries in this database"
|
||||
- conflicts:
|
||||
usage: "COUNTER"
|
||||
description: "Number of queries canceled due to conflicts with recovery"
|
||||
- temp_files:
|
||||
usage: "COUNTER"
|
||||
description: "Number of temporary files created by queries"
|
||||
- temp_bytes:
|
||||
usage: "COUNTER"
|
||||
description: "Total amount of data written to temporary files by queries"
|
||||
- deadlocks:
|
||||
usage: "COUNTER"
|
||||
description: "Number of deadlocks detected in this database"
|
||||
|
||||
pg_replication:
|
||||
query: |
|
||||
SELECT
|
||||
CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END as is_replica,
|
||||
EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT as lag_seconds
|
||||
metrics:
|
||||
- is_replica:
|
||||
usage: "GAUGE"
|
||||
description: "1 if this is a replica, 0 if primary"
|
||||
- lag_seconds:
|
||||
usage: "GAUGE"
|
||||
description: "Replication lag in seconds (only on replicas)"
|
||||
|
||||
pg_slow_queries:
|
||||
query: |
|
||||
SELECT
|
||||
datname,
|
||||
usename,
|
||||
state,
|
||||
COUNT(*) as count,
|
||||
MAX(EXTRACT(EPOCH FROM (now() - query_start))) as max_duration_seconds
|
||||
FROM pg_stat_activity
|
||||
WHERE state != 'idle'
|
||||
AND query NOT LIKE '%pg_stat_activity%'
|
||||
AND query_start < now() - interval '30 seconds'
|
||||
GROUP BY datname, usename, state
|
||||
metrics:
|
||||
- datname:
|
||||
usage: "LABEL"
|
||||
description: "Database name"
|
||||
- usename:
|
||||
usage: "LABEL"
|
||||
description: "User name"
|
||||
- state:
|
||||
usage: "LABEL"
|
||||
description: "Query state"
|
||||
- count:
|
||||
usage: "GAUGE"
|
||||
description: "Number of slow queries"
|
||||
- max_duration_seconds:
|
||||
usage: "GAUGE"
|
||||
description: "Maximum query duration in seconds"
|
||||
|
||||
pg_table_stats:
|
||||
query: |
|
||||
SELECT
|
||||
schemaname,
|
||||
relname,
|
||||
seq_scan,
|
||||
seq_tup_read,
|
||||
idx_scan,
|
||||
idx_tup_fetch,
|
||||
n_tup_ins,
|
||||
n_tup_upd,
|
||||
n_tup_del,
|
||||
n_tup_hot_upd,
|
||||
n_live_tup,
|
||||
n_dead_tup,
|
||||
n_mod_since_analyze,
|
||||
last_vacuum,
|
||||
last_autovacuum,
|
||||
last_analyze,
|
||||
last_autoanalyze
|
||||
FROM pg_stat_user_tables
|
||||
WHERE schemaname = 'public'
|
||||
ORDER BY n_live_tup DESC
|
||||
LIMIT 20
|
||||
metrics:
|
||||
- schemaname:
|
||||
usage: "LABEL"
|
||||
description: "Schema name"
|
||||
- relname:
|
||||
usage: "LABEL"
|
||||
description: "Table name"
|
||||
- seq_scan:
|
||||
usage: "COUNTER"
|
||||
description: "Number of sequential scans"
|
||||
- seq_tup_read:
|
||||
usage: "COUNTER"
|
||||
description: "Number of tuples read by sequential scans"
|
||||
- idx_scan:
|
||||
usage: "COUNTER"
|
||||
description: "Number of index scans"
|
||||
- idx_tup_fetch:
|
||||
usage: "COUNTER"
|
||||
description: "Number of tuples fetched by index scans"
|
||||
- n_tup_ins:
|
||||
usage: "COUNTER"
|
||||
description: "Number of tuples inserted"
|
||||
- n_tup_upd:
|
||||
usage: "COUNTER"
|
||||
description: "Number of tuples updated"
|
||||
- n_tup_del:
|
||||
usage: "COUNTER"
|
||||
description: "Number of tuples deleted"
|
||||
- n_tup_hot_upd:
|
||||
usage: "COUNTER"
|
||||
description: "Number of tuples HOT updated"
|
||||
- n_live_tup:
|
||||
usage: "GAUGE"
|
||||
description: "Estimated number of live rows"
|
||||
- n_dead_tup:
|
||||
usage: "GAUGE"
|
||||
description: "Estimated number of dead rows"
|
||||
- n_mod_since_analyze:
|
||||
usage: "GAUGE"
|
||||
description: "Number of rows modified since last analyze"
|
||||
|
||||
pg_locks:
|
||||
query: |
|
||||
SELECT
|
||||
mode,
|
||||
locktype,
|
||||
COUNT(*) as count
|
||||
FROM pg_locks
|
||||
GROUP BY mode, locktype
|
||||
metrics:
|
||||
- mode:
|
||||
usage: "LABEL"
|
||||
description: "Lock mode"
|
||||
- locktype:
|
||||
usage: "LABEL"
|
||||
description: "Lock type"
|
||||
- count:
|
||||
usage: "GAUGE"
|
||||
description: "Number of locks"
|
||||
|
||||
pg_connection_pool:
|
||||
query: |
|
||||
SELECT
|
||||
state,
|
||||
COUNT(*) as count,
|
||||
MAX(EXTRACT(EPOCH FROM (now() - state_change))) as max_state_duration_seconds
|
||||
FROM pg_stat_activity
|
||||
GROUP BY state
|
||||
metrics:
|
||||
- state:
|
||||
usage: "LABEL"
|
||||
description: "Connection state"
|
||||
- count:
|
||||
usage: "GAUGE"
|
||||
description: "Number of connections in this state"
|
||||
- max_state_duration_seconds:
|
||||
usage: "GAUGE"
|
||||
description: "Maximum time a connection has been in this state"
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: postgres-exporter
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: postgres-exporter
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 9187
|
||||
targetPort: 9187
|
||||
protocol: TCP
|
||||
name: metrics
|
||||
selector:
|
||||
app: postgres-exporter
|
||||
@@ -1,52 +0,0 @@
|
||||
---
|
||||
# NOTE: This file contains example secrets for development.
|
||||
# For production, use one of the following:
|
||||
# 1. Sealed Secrets (bitnami-labs/sealed-secrets)
|
||||
# 2. External Secrets Operator
|
||||
# 3. HashiCorp Vault
|
||||
# 4. Cloud provider secret managers (AWS Secrets Manager, GCP Secret Manager, Azure Key Vault)
|
||||
#
|
||||
# NEVER commit real production secrets to git!
|
||||
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: grafana-admin
|
||||
namespace: monitoring
|
||||
type: Opaque
|
||||
stringData:
|
||||
admin-user: admin
|
||||
# CHANGE THIS PASSWORD IN PRODUCTION!
|
||||
# Generate with: openssl rand -base64 32
|
||||
admin-password: "CHANGE_ME_IN_PRODUCTION"
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: alertmanager-secrets
|
||||
namespace: monitoring
|
||||
type: Opaque
|
||||
stringData:
|
||||
# SMTP configuration for email alerts
|
||||
# CHANGE THESE VALUES IN PRODUCTION!
|
||||
smtp-host: "smtp.gmail.com:587"
|
||||
smtp-username: "alerts@yourdomain.com"
|
||||
smtp-password: "CHANGE_ME_IN_PRODUCTION"
|
||||
smtp-from: "alerts@yourdomain.com"
|
||||
|
||||
# Slack webhook URL (optional)
|
||||
slack-webhook-url: "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: postgres-exporter
|
||||
namespace: monitoring
|
||||
type: Opaque
|
||||
stringData:
|
||||
# PostgreSQL connection string
|
||||
# Format: postgresql://username:password@hostname:port/database?sslmode=disable
|
||||
# CHANGE THIS IN PRODUCTION!
|
||||
data-source-name: "postgresql://postgres:postgres@postgres.bakery-ia:5432/bakery?sslmode=disable"
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: notification-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "notification-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: orchestrator-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "orchestrator-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: orders-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "orders-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: pos-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "pos-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: procurement-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "procurement-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: production-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "production-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: recipes-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "recipes-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: sales-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "sales-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: suppliers-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "suppliers-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: tenant-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "tenant-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
app.kubernetes.io/name: training-service
|
||||
app.kubernetes.io/component: microservice
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
# Wait for Redis to be ready
|
||||
- name: wait-for-redis
|
||||
@@ -92,6 +94,26 @@ spec:
|
||||
ports:
|
||||
- containerPort: 8000
|
||||
name: http
|
||||
env:
|
||||
# OpenTelemetry Configuration
|
||||
- name: OTEL_COLLECTOR_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
|
||||
- name: OTEL_SERVICE_NAME
|
||||
value: "training-service"
|
||||
- name: ENABLE_TRACING
|
||||
value: "true"
|
||||
# Logging Configuration
|
||||
- name: OTEL_LOGS_EXPORTER
|
||||
value: "otlp"
|
||||
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
|
||||
value: "true"
|
||||
# Metrics Configuration
|
||||
- name: ENABLE_OTEL_METRICS
|
||||
value: "true"
|
||||
- name: ENABLE_SYSTEM_METRICS
|
||||
value: "true"
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: bakery-config
|
||||
|
||||
@@ -17,6 +17,8 @@ spec:
|
||||
labels:
|
||||
app: demo-cleanup
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
|
||||
@@ -22,6 +22,8 @@ spec:
|
||||
app: external-service
|
||||
job: data-rotation
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
ttlSecondsAfterFinished: 172800
|
||||
backoffLimit: 2
|
||||
|
||||
|
||||
@@ -19,6 +19,8 @@ spec:
|
||||
component: background-jobs
|
||||
service: demo-session
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
containers:
|
||||
- name: worker
|
||||
image: bakery/demo-session-service
|
||||
|
||||
@@ -20,25 +20,23 @@ metadata:
|
||||
nginx.ingress.kubernetes.io/upstream-keepalive-timeout: "3600"
|
||||
# WebSocket upgrade support
|
||||
nginx.ingress.kubernetes.io/websocket-services: "gateway-service"
|
||||
# CORS configuration for HTTPS and local development
|
||||
# CORS configuration for HTTPS
|
||||
nginx.ingress.kubernetes.io/enable-cors: "true"
|
||||
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery-ia.local,https://api.bakery-ia.local,https://monitoring.bakery-ia.local,https://localhost"
|
||||
nginx.ingress.kubernetes.io/cors-allow-origin: "https://your-domain.com" # To be overridden in overlays
|
||||
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
|
||||
nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin, Cache-Control"
|
||||
nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
|
||||
# Cert-manager annotations for automatic certificate issuance
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-staging"
|
||||
cert-manager.io/acme-challenge-type: http01
|
||||
# Using issuer appropriate for environment
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-prod" # To be overridden in dev overlay
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
tls:
|
||||
- hosts:
|
||||
- bakery-ia.local
|
||||
- api.bakery-ia.local
|
||||
- monitoring.bakery-ia.local
|
||||
secretName: bakery-ia-tls-cert
|
||||
- your-domain.com # To be overridden in overlays
|
||||
secretName: bakery-tls-cert # To be overridden in overlays
|
||||
rules:
|
||||
- host: bakery-ia.local
|
||||
- host: your-domain.com # To be overridden in overlays
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
@@ -55,7 +53,7 @@ spec:
|
||||
name: gateway-service
|
||||
port:
|
||||
number: 8000
|
||||
- host: api.bakery-ia.local
|
||||
- host: api.your-domain.com # To be overridden in overlays
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
@@ -65,20 +63,22 @@ spec:
|
||||
name: gateway-service
|
||||
port:
|
||||
number: 8000
|
||||
- host: monitoring.bakery-ia.local
|
||||
- host: monitoring.your-domain.com # To be overridden in overlays
|
||||
http:
|
||||
paths:
|
||||
- path: /grafana
|
||||
pathType: Prefix
|
||||
# SigNoz Frontend UI and API (consolidated in newer versions)
|
||||
- path: /signoz(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: grafana-service
|
||||
name: signoz
|
||||
port:
|
||||
number: 3000
|
||||
- path: /prometheus
|
||||
pathType: Prefix
|
||||
number: 8080
|
||||
# SigNoz API endpoints
|
||||
- path: /signoz-api(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: prometheus-service
|
||||
name: signoz
|
||||
port:
|
||||
number: 9090
|
||||
number: 8080
|
||||
@@ -17,6 +17,8 @@ spec:
|
||||
app: external-service
|
||||
job: data-init
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
restartPolicy: OnFailure
|
||||
|
||||
initContainers:
|
||||
|
||||
@@ -15,6 +15,8 @@ spec:
|
||||
app.kubernetes.io/name: nominatim-init
|
||||
app.kubernetes.io/component: data-init
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
restartPolicy: OnFailure
|
||||
containers:
|
||||
- name: nominatim-import
|
||||
|
||||
@@ -66,6 +66,10 @@ resources:
|
||||
# Persistent storage
|
||||
- components/volumes/model-storage-pvc.yaml
|
||||
|
||||
# Cert manager cluster issuers
|
||||
- components/cert-manager/cluster-issuer-staging.yaml
|
||||
- components/cert-manager/local-ca-issuer.yaml
|
||||
|
||||
# Database services
|
||||
- components/databases/auth-db.yaml
|
||||
- components/databases/tenant-db.yaml
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: ai-insights-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: alert-processor-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: auth-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -29,4 +29,4 @@ roleRef:
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: demo-seed-sa
|
||||
namespace: bakery-ia
|
||||
namespace: bakery-ia
|
||||
|
||||
@@ -15,6 +15,8 @@ spec:
|
||||
app.kubernetes.io/name: demo-session-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: distribution-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: external-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: forecasting-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: inventory-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: notification-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: orchestrator-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: orders-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: pos-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: procurement-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: production-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: recipes-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: sales-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: suppliers-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: tenant-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: tenant-seed-pilot-coupon
|
||||
app.kubernetes.io/component: seed
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
serviceAccountName: demo-seed-sa
|
||||
initContainers:
|
||||
- name: wait-for-tenant-migration
|
||||
|
||||
@@ -16,6 +16,8 @@ spec:
|
||||
app.kubernetes.io/name: training-migration
|
||||
app.kubernetes.io/component: migration
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: dockerhub-creds
|
||||
initContainers:
|
||||
- name: wait-for-db
|
||||
image: postgres:17-alpine
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: ClusterIssuer
|
||||
metadata:
|
||||
name: selfsigned-issuer
|
||||
spec:
|
||||
selfSigned: {}
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: ClusterIssuer
|
||||
metadata:
|
||||
name: letsencrypt-staging
|
||||
spec:
|
||||
acme:
|
||||
# The ACME server URL (Let's Encrypt staging)
|
||||
server: https://acme-staging-v02.api.letsencrypt.org/directory
|
||||
# Email address used for ACME registration
|
||||
email: admin@bakery-ia.local # Change this to your email
|
||||
# Name of a secret used to store the ACME account private key
|
||||
privateKeySecretRef:
|
||||
name: letsencrypt-staging
|
||||
# Enable the HTTP-01 challenge provider
|
||||
solvers:
|
||||
- http01:
|
||||
ingress:
|
||||
class: nginx
|
||||
podTemplate:
|
||||
spec:
|
||||
nodeSelector:
|
||||
"kubernetes.io/os": linux
|
||||
@@ -24,6 +24,7 @@ spec:
|
||||
- localhost
|
||||
- bakery-ia.local
|
||||
- api.bakery-ia.local
|
||||
- monitoring.bakery-ia.local
|
||||
- "*.bakery-ia.local"
|
||||
|
||||
# IP addresses (for localhost)
|
||||
|
||||
@@ -36,6 +36,7 @@ spec:
|
||||
- hosts:
|
||||
- localhost
|
||||
- bakery-ia.local
|
||||
- monitoring.bakery-ia.local
|
||||
secretName: bakery-dev-tls-cert
|
||||
rules:
|
||||
- host: localhost
|
||||
@@ -54,4 +55,32 @@ spec:
|
||||
service:
|
||||
name: gateway-service
|
||||
port:
|
||||
number: 8000
|
||||
number: 8000
|
||||
- host: bakery-ia.local
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: frontend-service
|
||||
port:
|
||||
number: 3000
|
||||
- path: /api
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: gateway-service
|
||||
port:
|
||||
number: 8000
|
||||
- host: monitoring.bakery-ia.local
|
||||
http:
|
||||
paths:
|
||||
# SigNoz Frontend UI
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: signoz
|
||||
port:
|
||||
number: 8080
|
||||
@@ -9,15 +9,12 @@ metadata:
|
||||
|
||||
resources:
|
||||
- ../../base
|
||||
# Monitoring enabled for dev environment
|
||||
- ../../base/components/monitoring
|
||||
- dev-ingress.yaml
|
||||
# SigNoz ingress is applied by Tilt (see Tiltfile)
|
||||
# - signoz-ingress.yaml
|
||||
# SigNoz is managed via Helm deployment (see Tiltfile signoz-deploy)
|
||||
# Monitoring is handled by SigNoz (no separate monitoring components needed)
|
||||
# Dev-Prod Parity: Enable HTTPS with self-signed certificates
|
||||
- dev-certificate.yaml
|
||||
- monitoring-certificate.yaml
|
||||
- cluster-issuer-staging.yaml
|
||||
# SigNoz paths are now included in the main ingress (ingress-https.yaml)
|
||||
|
||||
# Exclude nominatim from dev to save resources
|
||||
# Using scale to 0 for StatefulSet to prevent pod creation
|
||||
@@ -611,39 +608,6 @@ patches:
|
||||
limits:
|
||||
memory: "512Mi"
|
||||
cpu: "300m"
|
||||
# Optional exporters resource patches for dev
|
||||
- target:
|
||||
group: apps
|
||||
version: v1
|
||||
kind: DaemonSet
|
||||
name: node-exporter
|
||||
namespace: monitoring
|
||||
patch: |-
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/resources
|
||||
value:
|
||||
requests:
|
||||
memory: "32Mi"
|
||||
cpu: "25m"
|
||||
limits:
|
||||
memory: "64Mi"
|
||||
cpu: "100m"
|
||||
- target:
|
||||
group: apps
|
||||
version: v1
|
||||
kind: Deployment
|
||||
name: postgres-exporter
|
||||
namespace: monitoring
|
||||
patch: |-
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/resources
|
||||
value:
|
||||
requests:
|
||||
memory: "32Mi"
|
||||
cpu: "25m"
|
||||
limits:
|
||||
memory: "64Mi"
|
||||
cpu: "100m"
|
||||
|
||||
secretGenerator:
|
||||
- name: dev-secrets
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: bakery-dev-monitoring-tls-cert
|
||||
namespace: monitoring
|
||||
spec:
|
||||
# Self-signed certificate for local development
|
||||
secretName: bakery-ia-tls-cert
|
||||
|
||||
# Certificate duration
|
||||
duration: 2160h # 90 days
|
||||
renewBefore: 360h # 15 days
|
||||
|
||||
# Subject configuration
|
||||
subject:
|
||||
organizations:
|
||||
- Bakery IA Development
|
||||
|
||||
# Common name
|
||||
commonName: localhost
|
||||
|
||||
# DNS names this certificate is valid for
|
||||
dnsNames:
|
||||
- localhost
|
||||
- monitoring.bakery-ia.local
|
||||
|
||||
# IP addresses (for localhost)
|
||||
ipAddresses:
|
||||
- 127.0.0.1
|
||||
- ::1
|
||||
|
||||
# Use self-signed issuer for development
|
||||
issuerRef:
|
||||
name: selfsigned-issuer
|
||||
kind: ClusterIssuer
|
||||
group: cert-manager.io
|
||||
|
||||
# Private key configuration
|
||||
privateKey:
|
||||
algorithm: RSA
|
||||
encoding: PKCS1
|
||||
size: 2048
|
||||
|
||||
# Usages
|
||||
usages:
|
||||
- server auth
|
||||
- client auth
|
||||
- digital signature
|
||||
- key encipherment
|
||||
@@ -1,39 +0,0 @@
|
||||
---
|
||||
# SigNoz Ingress for Development (localhost)
|
||||
# SigNoz is deployed via Helm in the 'signoz' namespace
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: signoz-ingress-localhost
|
||||
namespace: signoz
|
||||
annotations:
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/rewrite-target: /$2
|
||||
nginx.ingress.kubernetes.io/use-regex: "true"
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
tls:
|
||||
- hosts:
|
||||
- localhost
|
||||
secretName: bakery-ia-tls-cert
|
||||
rules:
|
||||
- host: localhost
|
||||
http:
|
||||
paths:
|
||||
# SigNoz Frontend UI
|
||||
- path: /signoz(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: signoz-frontend
|
||||
port:
|
||||
number: 3301
|
||||
# SigNoz Query Service API
|
||||
- path: /signoz-api(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: signoz-query-service
|
||||
port:
|
||||
number: 8080
|
||||
@@ -8,13 +8,13 @@ namespace: bakery-ia
|
||||
|
||||
resources:
|
||||
- ../../base
|
||||
- ../../base/components/monitoring
|
||||
- prod-ingress.yaml
|
||||
- prod-configmap.yaml
|
||||
# SigNoz is managed via Helm deployment (see infrastructure/helm/deploy-signoz.sh)
|
||||
# Monitoring is handled by SigNoz (no separate monitoring components needed)
|
||||
# SigNoz paths are now included in the main ingress (ingress-https.yaml)
|
||||
|
||||
patchesStrategicMerge:
|
||||
- storage-patch.yaml
|
||||
- monitoring-ingress-patch.yaml
|
||||
|
||||
labels:
|
||||
- includeSelectors: true
|
||||
@@ -22,8 +22,83 @@ labels:
|
||||
environment: production
|
||||
tier: production
|
||||
|
||||
# SigNoz resource patches for production
|
||||
# Production configuration patches
|
||||
patches:
|
||||
# Override ConfigMap values for production
|
||||
- target:
|
||||
kind: ConfigMap
|
||||
name: bakery-config
|
||||
patch: |-
|
||||
- op: replace
|
||||
path: /data/ENVIRONMENT
|
||||
value: "production"
|
||||
- op: replace
|
||||
path: /data/DEBUG
|
||||
value: "false"
|
||||
- op: replace
|
||||
path: /data/LOG_LEVEL
|
||||
value: "INFO"
|
||||
- op: replace
|
||||
path: /data/PROFILING_ENABLED
|
||||
value: "false"
|
||||
- op: replace
|
||||
path: /data/MOCK_EXTERNAL_APIS
|
||||
value: "false"
|
||||
- op: add
|
||||
path: /data/REQUEST_TIMEOUT
|
||||
value: "30"
|
||||
- op: add
|
||||
path: /data/MAX_CONNECTIONS
|
||||
value: "100"
|
||||
- op: replace
|
||||
path: /data/ENABLE_TRACING
|
||||
value: "true"
|
||||
- op: replace
|
||||
path: /data/ENABLE_METRICS
|
||||
value: "true"
|
||||
- op: replace
|
||||
path: /data/ENABLE_LOGS
|
||||
value: "true"
|
||||
- op: add
|
||||
path: /data/OTEL_EXPORTER_OTLP_ENDPOINT
|
||||
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
|
||||
- op: add
|
||||
path: /data/OTEL_EXPORTER_OTLP_PROTOCOL
|
||||
value: "grpc"
|
||||
- op: add
|
||||
path: /data/OTEL_SERVICE_NAME
|
||||
value: "bakery-ia"
|
||||
- op: add
|
||||
path: /data/OTEL_RESOURCE_ATTRIBUTES
|
||||
value: "deployment.environment=production,cluster.name=bakery-ia-prod"
|
||||
- op: add
|
||||
path: /data/SIGNOZ_ENDPOINT
|
||||
value: "http://signoz-query-service.signoz.svc.cluster.local:8080"
|
||||
- op: add
|
||||
path: /data/SIGNOZ_FRONTEND_URL
|
||||
value: "https://monitoring.bakewise.ai/signoz"
|
||||
- op: add
|
||||
path: /data/SIGNOZ_ROOT_URL
|
||||
value: "https://monitoring.bakewise.ai/signoz"
|
||||
- op: add
|
||||
path: /data/RATE_LIMIT_ENABLED
|
||||
value: "true"
|
||||
- op: add
|
||||
path: /data/RATE_LIMIT_PER_MINUTE
|
||||
value: "60"
|
||||
- op: add
|
||||
path: /data/CORS_ORIGINS
|
||||
value: "https://bakewise.ai"
|
||||
- op: add
|
||||
path: /data/CORS_ALLOW_CREDENTIALS
|
||||
value: "true"
|
||||
- op: add
|
||||
path: /data/VITE_API_URL
|
||||
value: "/api"
|
||||
- op: add
|
||||
path: /data/VITE_ENVIRONMENT
|
||||
value: "production"
|
||||
# SigNoz resource patches for production
|
||||
# SigNoz ClickHouse production configuration
|
||||
- target:
|
||||
group: apps
|
||||
|
||||
@@ -60,5 +60,6 @@ spec:
|
||||
name: gateway-service
|
||||
port:
|
||||
number: 8000
|
||||
|
||||
# Monitoring (monitoring.bakewise.ai) is now handled by signoz-ingress.yaml in the signoz namespace
|
||||
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace
|
||||
# SigNoz creates its own Ingress via Helm chart configuration
|
||||
# Access at: https://monitoring.bakewise.ai (configured in signoz-values-prod.yaml)
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
---
|
||||
# SigNoz Ingress for Production
|
||||
# SigNoz is deployed via Helm in the 'signoz' namespace
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: signoz-ingress-prod
|
||||
namespace: signoz
|
||||
labels:
|
||||
app.kubernetes.io/name: signoz
|
||||
app.kubernetes.io/component: ingress
|
||||
annotations:
|
||||
# Nginx ingress controller annotations
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "50m"
|
||||
nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
|
||||
nginx.ingress.kubernetes.io/rewrite-target: /$2
|
||||
nginx.ingress.kubernetes.io/use-regex: "true"
|
||||
|
||||
# CORS configuration
|
||||
nginx.ingress.kubernetes.io/enable-cors: "true"
|
||||
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai,https://monitoring.bakewise.ai"
|
||||
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
|
||||
nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
|
||||
nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
|
||||
|
||||
# Security headers
|
||||
nginx.ingress.kubernetes.io/configuration-snippet: |
|
||||
more_set_headers "X-Frame-Options: SAMEORIGIN";
|
||||
more_set_headers "X-Content-Type-Options: nosniff";
|
||||
more_set_headers "X-XSS-Protection: 1; mode=block";
|
||||
more_set_headers "Referrer-Policy: strict-origin-when-cross-origin";
|
||||
|
||||
# Rate limiting
|
||||
nginx.ingress.kubernetes.io/limit-rps: "100"
|
||||
nginx.ingress.kubernetes.io/limit-connections: "50"
|
||||
|
||||
# Cert-manager annotations for automatic certificate issuance
|
||||
cert-manager.io/cluster-issuer: "letsencrypt-production"
|
||||
cert-manager.io/acme-challenge-type: http01
|
||||
|
||||
spec:
|
||||
ingressClassName: nginx
|
||||
tls:
|
||||
- hosts:
|
||||
- monitoring.bakewise.ai
|
||||
secretName: signoz-prod-tls-cert
|
||||
rules:
|
||||
- host: monitoring.bakewise.ai
|
||||
http:
|
||||
paths:
|
||||
# SigNoz Frontend UI
|
||||
- path: /signoz(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: signoz-frontend
|
||||
port:
|
||||
number: 3301
|
||||
# SigNoz Query Service API
|
||||
- path: /signoz-api(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: signoz-query-service
|
||||
port:
|
||||
number: 8080
|
||||
# SigNoz AlertManager
|
||||
- path: /signoz-alerts(/|$)(.*)
|
||||
pathType: ImplementationSpecific
|
||||
backend:
|
||||
service:
|
||||
name: signoz-alertmanager
|
||||
port:
|
||||
number: 9093
|
||||
133
infrastructure/kubernetes/setup-database-monitoring.sh
Executable file
133
infrastructure/kubernetes/setup-database-monitoring.sh
Executable file
@@ -0,0 +1,133 @@
|
||||
#!/bin/bash
#
# Setup script for database monitoring with OpenTelemetry and SigNoz.
# Creates a dedicated monitoring user in every PostgreSQL database, stores
# its credentials in a Kubernetes secret, and deploys the OpenTelemetry
# collector that exports database metrics to SigNoz.
#
# Requirements: kubectl (with access to the target cluster) and openssl.

set -euo pipefail

echo "========================================="
echo "Database Monitoring Setup for SigNoz"
echo "========================================="
echo ""

# Fail fast if a required tool is missing.
for tool in kubectl openssl; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "ERROR: required tool '$tool' not found on PATH" >&2
    exit 1
  fi
done

# Configuration
readonly NAMESPACE="bakery-ia"
readonly MONITOR_USER="otel_monitor"
# Randomly generated; handed to the cluster via a secret in Step 2.
MONITOR_PASSWORD=$(openssl rand -base64 32)
readonly MONITOR_PASSWORD

# Directory containing this script, so manifest paths resolve no matter
# which directory the script is invoked from.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
readonly SCRIPT_DIR

# PostgreSQL databases to monitor, as "service-name:database-name" pairs.
# The deployment name is derived by stripping the "-service" suffix.
readonly DATABASES=(
  "auth-db-service:auth_db"
  "inventory-db-service:inventory_db"
  "orders-db-service:orders_db"
  "tenant-db-service:tenant_db"
  "sales-db-service:sales_db"
  "production-db-service:production_db"
  "recipes-db-service:recipes_db"
  "procurement-db-service:procurement_db"
  "distribution-db-service:distribution_db"
  "forecasting-db-service:forecasting_db"
  "external-db-service:external_db"
  "suppliers-db-service:suppliers_db"
  "pos-db-service:pos_db"
  "training-db-service:training_db"
  "notification-db-service:notification_db"
  "orchestrator-db-service:orchestrator_db"
  "ai-insights-db-service:ai_insights_db"
)

echo "Step 1: Creating monitoring user in PostgreSQL databases"
echo "========================================="
echo ""

for db_entry in "${DATABASES[@]}"; do
  IFS=':' read -r service dbname <<< "$db_entry"

  echo "Creating monitoring user in $dbname..."

  # Feed the SQL over stdin (-i) instead of 'psql -c' so the generated
  # password never appears in process arguments where 'ps' could expose it.
  # Failure is downgraded to a warning: the database may not be ready yet,
  # or the user may already exist.
  kubectl exec -i -n "$NAMESPACE" "deployment/${service%-service}" -- \
    psql -U postgres -d "$dbname" <<SQL 2>/dev/null || echo "  ⚠️  Warning: Could not create user in $dbname (may already exist or database not ready)"
DO \$\$
BEGIN
    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '$MONITOR_USER') THEN
        CREATE USER $MONITOR_USER WITH PASSWORD '$MONITOR_PASSWORD';
        GRANT pg_monitor TO $MONITOR_USER;
        GRANT CONNECT ON DATABASE $dbname TO $MONITOR_USER;
        RAISE NOTICE 'User $MONITOR_USER created successfully';
    ELSE
        RAISE NOTICE 'User $MONITOR_USER already exists';
    END IF;
END
\$\$;
SQL

  echo ""
done

echo "✅ Monitoring users created"
echo ""

echo "Step 2: Creating Kubernetes secret for monitoring credentials"
echo "========================================="
echo ""

# Dry-run + apply makes this create-or-update idempotent.
kubectl create secret generic database-monitor-secrets \
  -n "$NAMESPACE" \
  --from-literal=POSTGRES_MONITOR_USER="$MONITOR_USER" \
  --from-literal=POSTGRES_MONITOR_PASSWORD="$MONITOR_PASSWORD" \
  --dry-run=client -o yaml | kubectl apply -f -

echo "✅ Secret created: database-monitor-secrets"
echo ""

echo "Step 3: Deploying OpenTelemetry collector for database monitoring"
echo "========================================="
echo ""

# Manifest path resolved relative to this script, not the caller's CWD.
kubectl apply -f "$SCRIPT_DIR/base/monitoring/database-otel-collector.yaml"

echo "✅ Database monitoring collector deployed"
echo ""

echo "Step 4: Waiting for collector to be ready"
echo "========================================="
echo ""

kubectl wait --for=condition=available --timeout=60s \
  deployment/database-otel-collector -n "$NAMESPACE"

echo "✅ Collector is ready"
echo ""

echo "========================================="
echo "Database Monitoring Setup Complete!"
echo "========================================="
echo ""
echo "What's been configured:"
echo " ✅ Monitoring user created in all PostgreSQL databases"
echo " ✅ OpenTelemetry collector deployed for database metrics"
echo " ✅ Metrics exported to SigNoz"
echo ""
echo "Metrics being collected:"
echo " 📊 PostgreSQL: connections, commits, rollbacks, deadlocks, table sizes"
echo " 📊 Redis: memory usage, keyspace hits/misses, connected clients"
echo " 📊 RabbitMQ: queue depth, message rates, consumer count"
echo ""
echo "Next steps:"
echo " 1. Check collector logs:"
echo "    kubectl logs -n $NAMESPACE deployment/database-otel-collector"
echo ""
echo " 2. View metrics in SigNoz:"
echo "    - Go to https://monitoring.bakery-ia.local"
echo "    - Create dashboard with queries like:"
echo "      * postgresql.backends (connections)"
echo "      * postgresql.database.size (database size)"
echo "      * redis.memory.used (Redis memory)"
echo "      * rabbitmq.message.current (queue depth)"
echo ""
echo " 3. Create alerts for:"
echo "    - High connection count (approaching max_connections)"
echo "    - Slow query detection (via application traces)"
echo "    - High Redis memory usage"
echo "    - RabbitMQ queue buildup"
echo ""
|
||||
65
infrastructure/kubernetes/setup-dockerhub-secrets.sh
Executable file
65
infrastructure/kubernetes/setup-dockerhub-secrets.sh
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/bin/bash
#
# Setup Docker Hub image pull secrets for all namespaces.
# Creates docker-registry secrets so pods can pull images from Docker Hub.
#
# Credentials are read from the environment so that no secret material is
# ever committed to the repository:
#   DOCKER_USERNAME  (required) Docker Hub username
#   DOCKER_PASSWORD  (required) Docker Hub access token (preferred over the
#                    account password)
#   DOCKER_EMAIL     (required) account e-mail address
#   DOCKER_SERVER    (optional) registry host, defaults to docker.io

set -euo pipefail

# SECURITY NOTE: a Docker Hub access token was previously hardcoded in this
# file and committed to version control. Any such token must be revoked and
# rotated on Docker Hub; credentials now come exclusively from the env.
DOCKER_SERVER="${DOCKER_SERVER:-docker.io}"
: "${DOCKER_USERNAME:?DOCKER_USERNAME must be set}"
: "${DOCKER_PASSWORD:?DOCKER_PASSWORD must be set (use a Docker Hub access token)}"
: "${DOCKER_EMAIL:?DOCKER_EMAIL must be set}"
readonly SECRET_NAME="dockerhub-creds"

# List of namespaces used in the project
readonly NAMESPACES=(
  "bakery-ia"
  "bakery-ia-dev"
  "bakery-ia-prod"
  "default"
)

echo "Setting up Docker Hub image pull secrets..."
echo "==========================================="
echo ""

for namespace in "${NAMESPACES[@]}"; do
  echo "Processing namespace: $namespace"

  # Create namespace if it doesn't exist
  if ! kubectl get namespace "$namespace" >/dev/null 2>&1; then
    echo "  Creating namespace: $namespace"
    kubectl create namespace "$namespace"
  fi

  # Delete any existing secret first so stale credentials are replaced
  if kubectl get secret "$SECRET_NAME" -n "$namespace" >/dev/null 2>&1; then
    echo "  Deleting existing secret in namespace: $namespace"
    kubectl delete secret "$SECRET_NAME" -n "$namespace"
  fi

  # Create the docker-registry secret
  echo "  Creating Docker Hub secret in namespace: $namespace"
  kubectl create secret docker-registry "$SECRET_NAME" \
    --docker-server="$DOCKER_SERVER" \
    --docker-username="$DOCKER_USERNAME" \
    --docker-password="$DOCKER_PASSWORD" \
    --docker-email="$DOCKER_EMAIL" \
    -n "$namespace"

  echo "  ✓ Secret created successfully in namespace: $namespace"
  echo ""
done

echo "==========================================="
echo "Docker Hub secrets setup completed!"
echo ""
echo "The secret '$SECRET_NAME' has been created in all namespaces:"
for namespace in "${NAMESPACES[@]}"; do
  echo "  - $namespace"
done
echo ""
echo "Next steps:"
echo "1. Apply Kubernetes manifests with imagePullSecrets configured"
echo "2. Verify pods can pull images: kubectl get pods -A"
|
||||
Reference in New Issue
Block a user