Update monitoring packages to latest versions

- Updated all OpenTelemetry packages to latest versions:
  - opentelemetry-api: 1.27.0 → 1.39.1
  - opentelemetry-sdk: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-grpc: 1.27.0 → 1.39.1
  - opentelemetry-exporter-otlp-proto-http: 1.27.0 → 1.39.1
  - opentelemetry-instrumentation-fastapi: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-httpx: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-redis: 0.48b0 → 0.60b1
  - opentelemetry-instrumentation-sqlalchemy: 0.48b0 → 0.60b1

- Removed prometheus-client==0.23.1 from all services
- Unified all services to use the same monitoring package versions
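
For reference, a minimal sketch of what the unified pins might look like in a shared requirements file (file name and layout are illustrative, not the repository's actual structure):

```
# requirements-monitoring.txt (illustrative)
opentelemetry-api==1.39.1
opentelemetry-sdk==1.39.1
opentelemetry-exporter-otlp-proto-grpc==1.39.1
opentelemetry-exporter-otlp-proto-http==1.39.1
opentelemetry-instrumentation-fastapi==0.60b1
opentelemetry-instrumentation-httpx==0.60b1
opentelemetry-instrumentation-redis==0.60b1
opentelemetry-instrumentation-sqlalchemy==0.60b1
```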

Generated by Mistral Vibe.
Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
Author: Urtzi Alfaro
Date: 2026-01-08 19:25:52 +01:00
Parent: dfb7e4b237
Commit: 29d19087f1
129 changed files with 5718 additions and 1821 deletions

View File

@@ -0,0 +1,125 @@
#!/bin/bash
# Script to add imagePullSecrets to all Kubernetes deployments, jobs, and cronjobs
# This ensures all pods can pull images from Docker Hub using the dockerhub-creds secret
SECRET_NAME="dockerhub-creds"
BASE_DIR="/Users/urtzialfaro/Documents/bakery-ia/infrastructure/kubernetes"
# ANSI color codes
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${BLUE}Adding imagePullSecrets to all Kubernetes resources...${NC}"
echo "======================================================"
echo ""
# Counter for files processed
count=0
# Function to add imagePullSecrets to a file
add_image_pull_secrets() {
local file="$1"
# Check if file already has imagePullSecrets
if grep -q "imagePullSecrets:" "$file"; then
echo -e "${YELLOW} ⊘ Skipping (already has imagePullSecrets): $(basename $file)${NC}"
return
fi
# Temporary file for processing
temp_file=$(mktemp)
# Process the file using awk to add imagePullSecrets after "spec:" in template or job spec
awk '
/^ spec:$/ && !done {
print $0
print " imagePullSecrets:"
print " - name: dockerhub-creds"
done = 1
next
}
{ print }
' "$file" > "$temp_file"
# Check if changes were made
if ! cmp -s "$file" "$temp_file"; then
mv "$temp_file" "$file"
echo -e "${GREEN} ✓ Updated: $(basename $file)${NC}"
((count++))
else
rm "$temp_file"
echo -e "${YELLOW} ⊘ No changes needed: $(basename $file)${NC}"
fi
}
# Process all service deployments
echo -e "${BLUE}Processing service deployments...${NC}"
# Use process substitution instead of a pipe so the counter updated inside
# add_image_pull_secrets survives the loop (a piped while runs in a subshell)
while read -r file; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done < <(find "$BASE_DIR/base/components" -name "*-service.yaml")
echo ""
# Process all database deployments
echo -e "${BLUE}Processing database deployments...${NC}"
for file in $BASE_DIR/base/components/databases/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process all migration jobs
echo -e "${BLUE}Processing migration jobs...${NC}"
for file in $BASE_DIR/base/migrations/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process all cronjobs
echo -e "${BLUE}Processing cronjobs...${NC}"
for file in $BASE_DIR/base/cronjobs/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process standalone jobs
echo -e "${BLUE}Processing standalone jobs...${NC}"
for file in $BASE_DIR/base/jobs/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process deployments directory
echo -e "${BLUE}Processing deployments...${NC}"
for file in $BASE_DIR/base/deployments/*.yaml; do
if [ -f "$file" ]; then
add_image_pull_secrets "$file"
fi
done
echo ""
# Process nominatim service
if [ -f "$BASE_DIR/base/components/infrastructure/nominatim.yaml" ]; then
echo -e "${BLUE}Processing nominatim service...${NC}"
add_image_pull_secrets "$BASE_DIR/base/components/infrastructure/nominatim.yaml"
echo ""
fi
echo "======================================================"
echo -e "${GREEN}Completed! Updated $count file(s)${NC}"
echo ""
echo "Next steps:"
echo "1. Review the changes: git diff"
echo "2. Apply to cluster: kubectl apply -k infrastructure/kubernetes/overlays/dev"
echo "3. Verify pods are running: kubectl get pods -n bakery-ia"

View File

@@ -0,0 +1,94 @@
#!/bin/bash
# Script to add OpenTelemetry monitoring configuration to all service deployments
# This adds the necessary environment variables for SigNoz integration
# Note: No Prometheus annotations needed - all metrics go via OTLP push
set -e
SERVICES=(
"ai-insights"
"distribution"
"external"
"forecasting"
"inventory"
"notification"
"orchestrator"
"orders"
"pos"
"procurement"
"production"
"recipes"
"sales"
"suppliers"
"tenant"
"training"
"frontend"
)
echo "Adding OpenTelemetry configuration to all services..."
echo ""
for service in "${SERVICES[@]}"; do
SERVICE_FILE="infrastructure/kubernetes/base/components/${service}/${service}-service.yaml"
if [ ! -f "$SERVICE_FILE" ]; then
echo "⚠️ Skipping $service (file not found: $SERVICE_FILE)"
continue
fi
echo "📝 Processing $service-service..."
# Check if already has OTEL env vars
if grep -q "OTEL_COLLECTOR_ENDPOINT" "$SERVICE_FILE"; then
echo " ✓ Already has OpenTelemetry configuration"
else
echo " + Adding OpenTelemetry environment variables"
# Create a YAML patch
cat > "/tmp/${service}-otel-patch.yaml" << 'EOF'
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "SERVICE_NAME_PLACEHOLDER"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration (all via OTLP, no Prometheus)
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
EOF
# Replace placeholder with actual service name
sed -i.bak "s/SERVICE_NAME_PLACEHOLDER/${service}-service/g" "/tmp/${service}-otel-patch.yaml"
echo " ⚠️ Manual step required: Add env vars from /tmp/${service}-otel-patch.yaml"
echo " Insert after 'ports:' section and before 'envFrom:' in $SERVICE_FILE"
fi
echo "$service-service processed"
echo ""
done
echo ""
echo "✅ Monitoring configuration prepared for all services!"
echo ""
echo "Next steps:"
echo "1. Review the changes and manually add env vars from /tmp/*-otel-patch.yaml files"
echo "2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml"
echo "3. Restart services: kubectl rollout restart deployment -n bakery-ia"
echo "4. Check SigNoz UI at https://monitoring.bakery-ia.local for incoming data"
echo ""
echo "What metrics you'll see:"
echo " - HTTP requests (method, endpoint, status code, duration)"
echo " - System metrics (CPU, memory usage per process)"
echo " - System-wide metrics (total CPU, memory, disk I/O, network I/O)"
echo " - Custom business metrics (registrations, orders, etc.)"
echo " - All pushed via OpenTelemetry OTLP (no Prometheus scraping)"

View File

@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
Script to automatically add OpenTelemetry monitoring configuration to all service deployments.
This adds environment variables for metrics, logs, and traces export to SigNoz.
"""
import re
import sys
from pathlib import Path
# Services to configure
SERVICES = [
"ai-insights",
"distribution",
"external",
"forecasting",
"inventory",
"notification",
"orchestrator",
"orders",
"pos",
"procurement",
"production",
"recipes",
"sales",
"suppliers",
"tenant",
"training",
]
OTEL_ENV_VARS_TEMPLATE = """ env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "{service_name}"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration (all via OTLP, no Prometheus)
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
"""
def has_otel_config(content: str) -> bool:
"""Check if file already has OTEL configuration"""
return "OTEL_COLLECTOR_ENDPOINT" in content
def add_otel_config(content: str, service_name: str) -> str:
"""Add OTEL configuration to service deployment"""
# Prepare the env vars with the service name
env_vars = OTEL_ENV_VARS_TEMPLATE.format(service_name=f"{service_name}-service")
# Find the container section and add env vars before envFrom
# Pattern: find " containers:" then first " envFrom:" after it
pattern = r'( containers:\n - name: [^\n]+\n image: [^\n]+\n(?: ports:\n(?: - [^\n]+\n)+)?)( envFrom:)'
replacement = r'\1' + env_vars + r'\2'
# Try to replace
new_content = re.sub(pattern, replacement, content, count=1)
if new_content == content:
print(f" ⚠️ Warning: Could not find insertion point automatically")
return content
return new_content
def process_service(service_name: str, base_path: Path) -> bool:
"""Process a single service deployment file"""
service_file = base_path / "components" / service_name / f"{service_name}-service.yaml"
if not service_file.exists():
print(f" ⚠️ File not found: {service_file}")
return False
# Read file
with open(service_file, 'r') as f:
content = f.read()
# Check if already configured
if has_otel_config(content):
print(f" ✓ Already configured")
return True
# Add configuration
new_content = add_otel_config(content, service_name)
if new_content == content:
return False
# Write back
with open(service_file, 'w') as f:
f.write(new_content)
print(f" ✅ Updated successfully")
return True
def main():
"""Main function"""
# Find base path
script_dir = Path(__file__).parent
base_path = script_dir / "base"
if not base_path.exists():
print(f"❌ Error: Base path not found: {base_path}")
sys.exit(1)
print("=" * 60)
print("Adding OpenTelemetry Monitoring Configuration")
print("=" * 60)
print()
success_count = 0
skip_count = 0
fail_count = 0
for service in SERVICES:
print(f"📝 Processing {service}-service...")
result = process_service(service, base_path)
if result:
if has_otel_config(open(base_path / "components" / service / f"{service}-service.yaml").read()):
success_count += 1
else:
fail_count += 1
print()
print("=" * 60)
print(f"✅ Successfully configured: {success_count}")
if fail_count > 0:
print(f"⚠️ Failed to configure: {fail_count}")
print("=" * 60)
print()
print("Next steps:")
print("1. Review the changes: git diff infrastructure/kubernetes/base/components/")
print("2. Update SigNoz: helm upgrade signoz signoz/signoz -n signoz -f infrastructure/helm/signoz-values-dev.yaml")
print("3. Apply changes: kubectl apply -k infrastructure/kubernetes/overlays/dev/")
print("4. Verify: kubectl logs -n bakery-ia deployment/<service-name> | grep -i 'otel\\|metrics'")
if __name__ == "__main__":
main()

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: ai-insights-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "ai-insights-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: auth-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -93,6 +95,21 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "auth-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: ai-insights-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: alert-processor-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: auth-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: distribution-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: external-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: forecasting-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: inventory-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: notification-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orchestrator-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orders-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: pos-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: {{SERVICE_NAME}}-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: postgres
image: postgres:17-alpine
@@ -121,4 +123,4 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storage: 1Gi

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: procurement-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: production-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: rabbitmq
app.kubernetes.io/component: message-broker
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: rabbitmq
image: rabbitmq:4.1-management-alpine
@@ -120,4 +122,4 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 2Gi
storage: 2Gi

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: recipes-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: redis
app.kubernetes.io/component: cache
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 999 # redis group
initContainers:
@@ -166,4 +168,4 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
storage: 1Gi

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: sales-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: suppliers-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: tenant-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: training-db
app.kubernetes.io/component: database
spec:
imagePullSecrets:
- name: dockerhub-creds
securityContext:
fsGroup: 70
initContainers:

View File

@@ -16,6 +16,8 @@ spec:
app: distribution-service
tier: backend
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: distribution-service
image: bakery/distribution-service:latest
@@ -58,6 +60,25 @@ spec:
value: "30"
- name: HTTP_RETRIES
value: "3"
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "distribution-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
livenessProbe:
httpGet:
path: /health
@@ -107,4 +128,4 @@ spec:
port: 8000
targetPort: 8000
name: http
type: ClusterIP
type: ClusterIP

View File

@@ -23,6 +23,8 @@ spec:
app.kubernetes.io/component: microservice
version: "2.0"
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -85,6 +87,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "external-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: forecasting-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "forecasting-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: frontend
app.kubernetes.io/component: frontend
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: frontend
image: bakery/dashboard:latest

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: gateway
app.kubernetes.io/component: gateway
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: gateway
image: bakery/gateway:latest

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: inventory-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "inventory-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -1,501 +0,0 @@
# Bakery IA - Production Monitoring Stack
This directory contains the complete production-ready monitoring infrastructure for the Bakery IA platform.
## 📊 Components
### Core Monitoring
- **Prometheus v3.0.1** - Time-series metrics database (2 replicas with HA)
- **Grafana v12.3.0** - Visualization and dashboarding
- **AlertManager v0.27.0** - Alert routing and notification (3 replicas with HA)
### Distributed Tracing
- **Jaeger v1.51** - Distributed tracing with persistent storage
### Exporters
- **PostgreSQL Exporter v0.15.0** - Database metrics and health
- **Node Exporter v1.7.0** - Infrastructure and OS-level metrics (DaemonSet)
## 🚀 Deployment
### Prerequisites
1. Kubernetes cluster (v1.24+)
2. kubectl configured
3. kustomize (v4.0+) or kubectl with kustomize support
4. Storage class available for PersistentVolumeClaims
### Production Deployment
```bash
# 1. Update secrets with production values
kubectl create secret generic grafana-admin \
--from-literal=admin-user=admin \
--from-literal=admin-password=$(openssl rand -base64 32) \
--namespace monitoring --dry-run=client -o yaml > secrets.yaml
# 2. Update AlertManager SMTP credentials
kubectl create secret generic alertmanager-secrets \
--from-literal=smtp-host="smtp.gmail.com:587" \
--from-literal=smtp-username="alerts@yourdomain.com" \
--from-literal=smtp-password="YOUR_SMTP_PASSWORD" \
--from-literal=smtp-from="alerts@yourdomain.com" \
--from-literal=slack-webhook-url="https://hooks.slack.com/services/YOUR/WEBHOOK/URL" \
--namespace monitoring --dry-run=client -o yaml >> secrets.yaml
# 3. Update PostgreSQL exporter connection string
kubectl create secret generic postgres-exporter \
--from-literal=data-source-name="postgresql://user:password@postgres.bakery-ia:5432/bakery?sslmode=require" \
--namespace monitoring --dry-run=client -o yaml >> secrets.yaml
# 4. Deploy monitoring stack
kubectl apply -k infrastructure/kubernetes/overlays/prod
# 5. Verify deployment
kubectl get pods -n monitoring
kubectl get pvc -n monitoring
```
### Local Development Deployment
For local Kind clusters, monitoring is disabled by default to save resources. To enable:
```bash
# Uncomment monitoring in overlays/dev/kustomization.yaml
# Then apply:
kubectl apply -k infrastructure/kubernetes/overlays/dev
```
## 🔐 Security Configuration
### Important Security Notes
⚠️ **NEVER commit real secrets to Git!**
The `secrets.yaml` file contains placeholder values. In production, use one of:
1. **Sealed Secrets** (Recommended)
```bash
kubectl apply -f https://github.com/bitnami-labs/sealed-secrets/releases/download/v0.24.0/controller.yaml
kubeseal --format=yaml < secrets.yaml > sealed-secrets.yaml
```
2. **External Secrets Operator**
```bash
helm install external-secrets external-secrets/external-secrets -n external-secrets
```
3. **Cloud Provider Secrets**
- AWS Secrets Manager
- GCP Secret Manager
- Azure Key Vault
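
For option 2 above, a minimal sketch of an ExternalSecret that could populate the grafana-admin secret (store name and remote key path are illustrative):

```yaml
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: monitoring
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: cluster-secret-store   # illustrative store name
    kind: ClusterSecretStore
  target:
    name: grafana-admin
  data:
    - secretKey: admin-password
      remoteRef:
        key: monitoring/grafana   # illustrative key path
        property: admin-password
```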
### Grafana Admin Password
Change the default password immediately:
```bash
# Generate strong password
NEW_PASSWORD=$(openssl rand -base64 32)
# Update secret
kubectl patch secret grafana-admin -n monitoring \
-p="{\"data\":{\"admin-password\":\"$(echo -n $NEW_PASSWORD | base64)\"}}"
# Restart Grafana
kubectl rollout restart deployment grafana -n monitoring
```
## 📈 Accessing Monitoring Services
### Via Ingress (Production)
```
https://monitoring.yourdomain.com/grafana
https://monitoring.yourdomain.com/prometheus
https://monitoring.yourdomain.com/alertmanager
https://monitoring.yourdomain.com/jaeger
```
### Via Port Forwarding (Development)
```bash
# Grafana
kubectl port-forward -n monitoring svc/grafana 3000:3000
# Prometheus
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# AlertManager
kubectl port-forward -n monitoring svc/alertmanager-external 9093:9093
# Jaeger
kubectl port-forward -n monitoring svc/jaeger-query 16686:16686
```
Then access:
- Grafana: http://localhost:3000
- Prometheus: http://localhost:9090
- AlertManager: http://localhost:9093
- Jaeger: http://localhost:16686
## 📊 Grafana Dashboards
### Pre-configured Dashboards
1. **Gateway Metrics** - API gateway performance
- Request rate by endpoint
- P95 latency
- Error rates
- Authentication metrics
2. **Services Overview** - Microservices health
- Request rate by service
- P99 latency
- Error rates by service
- Service health status
3. **Circuit Breakers** - Resilience patterns
- Circuit breaker states
- Trip rates
- Rejected requests
4. **PostgreSQL Monitoring** - Database health
- Connections, transactions, cache hit ratio
- Slow queries, locks, replication lag
5. **Node Metrics** - Infrastructure monitoring
- CPU, memory, disk, network per node
6. **AlertManager** - Alert management
- Active alerts, firing rate, notifications
7. **Business Metrics** - KPIs
- Service performance, tenant activity, ML metrics
### Creating Custom Dashboards
1. Login to Grafana (admin/[your-password])
2. Click "+ → Dashboard"
3. Add panels with Prometheus queries
4. Save dashboard
5. Export JSON and add to `grafana-dashboards.yaml`
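
A hedged sketch of how an exported dashboard might be embedded in `grafana-dashboards.yaml`, assuming dashboards are provisioned from a ConfigMap (names and JSON content are illustrative):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
  namespace: monitoring
data:
  my-custom-dashboard.json: |
    {
      "title": "My Custom Dashboard",
      "panels": []
    }
```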
## 🚨 Alert Configuration
### Alert Rules
Alert rules are defined in `alert-rules.yaml` and organized by category:
- **bakery_services** - Service health, errors, latency, memory
- **bakery_business** - Training jobs, ML accuracy, API limits
- **alert_system_health** - Alert system components, RabbitMQ, Redis
- **alert_system_performance** - Processing errors, delivery failures
- **alert_system_business** - Alert volume, response times
- **alert_system_capacity** - Queue sizes, storage performance
- **alert_system_critical** - System failures, data loss
- **monitoring_health** - Prometheus, AlertManager self-monitoring
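
A hedged example of one rule in the `bakery_services` group (expression and threshold are illustrative, not the repository's actual rule):

```yaml
groups:
  - name: bakery_services
    rules:
      - alert: HighErrorRate
        expr: rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "5xx error rate above 5% for {{ $labels.service }}"
```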
### Alert Routing
Alerts are routed based on:
- **Severity** (critical, warning, info)
- **Component** (alert-system, database, infrastructure)
- **Service** name
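
A hedged sketch of the corresponding routing tree in `alertmanager.yaml` (receiver names are illustrative):

```yaml
route:
  receiver: default-email
  group_by: ["alertname", "service"]
  routes:
    - matchers:
        - severity = "critical"
      receiver: critical-alerts
    - matchers:
        - component = "alert-system"
      receiver: oncall
```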
### Notification Channels
Configure in `alertmanager.yaml`:
1. **Email** (default)
- critical-alerts@yourdomain.com
- oncall@yourdomain.com
2. **Slack** (optional, commented out)
- Update slack-webhook-url in secrets
- Uncomment slack_configs in alertmanager.yaml
3. **PagerDuty** (add if needed)
```yaml
pagerduty_configs:
- routing_key: YOUR_ROUTING_KEY
severity: '{{ .Labels.severity }}'
```
### Testing Alerts
```bash
# Fire a test alert
kubectl run test-alert --image=busybox -n bakery-ia --restart=Never -- sleep 3600
# Check alert in Prometheus
# Navigate to http://localhost:9090/alerts
# Check AlertManager
# Navigate to http://localhost:9093
```
## 🔍 Troubleshooting
### Prometheus Issues
```bash
# Check Prometheus logs
kubectl logs -n monitoring prometheus-0 -f
# Check Prometheus targets
kubectl port-forward -n monitoring svc/prometheus-external 9090:9090
# Visit http://localhost:9090/targets
# Check Prometheus configuration
kubectl get configmap prometheus-config -n monitoring -o yaml
```
### AlertManager Issues
```bash
# Check AlertManager logs
kubectl logs -n monitoring alertmanager-0 -f
# Check AlertManager configuration
kubectl exec -n monitoring alertmanager-0 -- cat /etc/alertmanager/alertmanager.yml
# Test SMTP reachability (busybox wget cannot speak smtp://; nc checks the TCP port instead)
kubectl exec -n monitoring alertmanager-0 -- \
nc -zv -w 5 smtp.gmail.com 587
```
### Grafana Issues
```bash
# Check Grafana logs
kubectl logs -n monitoring deployment/grafana -f
# Reset Grafana admin password
kubectl exec -n monitoring deployment/grafana -- \
grafana-cli admin reset-admin-password NEW_PASSWORD
```
### PostgreSQL Exporter Issues
```bash
# Check exporter logs
kubectl logs -n monitoring deployment/postgres-exporter -f
# Test database connection
kubectl exec -n monitoring deployment/postgres-exporter -- \
wget -O- http://localhost:9187/metrics | grep pg_up
```
### Node Exporter Issues
```bash
# Check node exporter on specific node
kubectl logs -n monitoring daemonset/node-exporter --selector=kubernetes.io/hostname=NODE_NAME -f
# Check metrics endpoint
kubectl exec -n monitoring daemonset/node-exporter -- \
wget -O- http://localhost:9100/metrics | head -n 20
```
## 📏 Resource Requirements
### Minimum Requirements (Development)
- CPU: 2 cores
- Memory: 4Gi
- Storage: 30Gi
### Recommended Requirements (Production)
- CPU: 6-8 cores
- Memory: 16Gi
- Storage: 100Gi
### Component Resource Allocation
| Component | Replicas | CPU Request | Memory Request | CPU Limit | Memory Limit |
|-----------|----------|-------------|----------------|-----------|--------------|
| Prometheus | 2 | 500m | 1Gi | 1 | 2Gi |
| AlertManager | 3 | 100m | 128Mi | 500m | 256Mi |
| Grafana | 1 | 100m | 256Mi | 500m | 512Mi |
| Postgres Exporter | 1 | 50m | 64Mi | 200m | 128Mi |
| Node Exporter | 1/node | 50m | 64Mi | 200m | 128Mi |
| Jaeger | 1 | 250m | 512Mi | 500m | 1Gi |
## 🔄 High Availability
### Prometheus HA
- 2 replicas in StatefulSet
- Each has independent storage (volumeClaimTemplates)
- Anti-affinity to spread across nodes
- Both scrape the same targets independently
- Use Thanos for long-term storage and global query view (future enhancement)
### AlertManager HA
- 3 replicas in StatefulSet
- Clustered mode (gossip protocol)
- Automatic leader election
- Alert deduplication across instances
- Anti-affinity to spread across nodes
### PodDisruptionBudgets
Ensure minimum availability during:
- Node maintenance
- Cluster upgrades
- Rolling updates
```yaml
Prometheus: minAvailable=1 (out of 2)
AlertManager: minAvailable=2 (out of 3)
Grafana: minAvailable=1 (out of 1)
```
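Expressed as a manifest, the Prometheus budget might look like this (a sketch; names and labels are illustrative):
```yaml
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: prometheus-pdb
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app: prometheus
```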
## 📊 Metrics Reference
### Application Metrics (from services)
```promql
# HTTP request rate
rate(http_requests_total[5m])
# HTTP error rate
rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m])
# Request latency (P95)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# Active connections
active_connections
```
### PostgreSQL Metrics
```promql
# Active connections
pg_stat_database_numbackends
# Transaction rate
rate(pg_stat_database_xact_commit[5m])
# Cache hit ratio
rate(pg_stat_database_blks_hit[5m]) /
(rate(pg_stat_database_blks_hit[5m]) + rate(pg_stat_database_blks_read[5m]))
# Replication lag
pg_replication_lag_seconds
```
### Node Metrics
```promql
# CPU usage
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Memory usage
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100
# Disk I/O
rate(node_disk_read_bytes_total[5m])
rate(node_disk_written_bytes_total[5m])
# Network traffic
rate(node_network_receive_bytes_total[5m])
rate(node_network_transmit_bytes_total[5m])
```
## 🔗 Distributed Tracing
### Jaeger Configuration
Services automatically send traces when `JAEGER_ENABLED=true`:
```yaml
# In prod-configmap.yaml
JAEGER_ENABLED: "true"
JAEGER_AGENT_HOST: "jaeger-agent.monitoring.svc.cluster.local"
JAEGER_AGENT_PORT: "6831"
```
### Viewing Traces
1. Access Jaeger UI: https://monitoring.yourdomain.com/jaeger
2. Select service from dropdown
3. Click "Find Traces"
4. Explore trace details, spans, and timing
### Trace Sampling
Current sampling: 100% (all traces collected)
For high-traffic production:
```yaml
# Adjust in shared/monitoring/tracing.py
JAEGER_SAMPLE_RATE: "0.1" # 10% of traces
```
## 📚 Additional Resources
- [Prometheus Documentation](https://prometheus.io/docs/)
- [Grafana Documentation](https://grafana.com/docs/)
- [AlertManager Documentation](https://prometheus.io/docs/alerting/latest/alertmanager/)
- [Jaeger Documentation](https://www.jaegertracing.io/docs/)
- [PostgreSQL Exporter](https://github.com/prometheus-community/postgres_exporter)
- [Node Exporter](https://github.com/prometheus/node_exporter)
## 🆘 Support
For monitoring issues:
1. Check component logs (see Troubleshooting section)
2. Verify Prometheus targets are UP
3. Check AlertManager configuration and routing
4. Review resource usage and quotas
5. Contact platform team: platform-team@yourdomain.com
## 🔄 Maintenance
### Regular Tasks
**Daily:**
- Review critical alerts
- Check service health dashboards
**Weekly:**
- Review alert noise and adjust thresholds
- Check storage usage for Prometheus and Jaeger
- Review slow queries in PostgreSQL dashboard
**Monthly:**
- Update dashboard with new metrics
- Review and update alert runbooks
- Capacity planning based on trends
### Backup and Recovery
**Prometheus Data:**
```bash
# Backup Prometheus data
kubectl exec -n monitoring prometheus-0 -- tar czf /tmp/prometheus-backup.tar.gz /prometheus
kubectl cp monitoring/prometheus-0:/tmp/prometheus-backup.tar.gz ./prometheus-backup.tar.gz
# Restore (stop Prometheus first)
kubectl cp ./prometheus-backup.tar.gz monitoring/prometheus-0:/tmp/
kubectl exec -n monitoring prometheus-0 -- tar xzf /tmp/prometheus-backup.tar.gz -C /
```
**Grafana Dashboards:**
```bash
# Export all dashboards via API
curl -u admin:password http://localhost:3000/api/search | \
jq -r '.[] | .uid' | \
xargs -I{} curl -u admin:password http://localhost:3000/api/dashboards/uid/{} > dashboards-backup.json
```
## 📝 Version History
- **v1.0.0** (2026-01-07) - Initial production-ready monitoring stack
- Prometheus v3.0.1 with HA
- AlertManager v0.27.0 with clustering
- Grafana v12.3.0 with 7 dashboards
- PostgreSQL and Node exporters
- 50+ alert rules
- Comprehensive documentation

View File

@@ -1,20 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Minimal Monitoring Infrastructure
# SigNoz is now managed via Helm in the 'signoz' namespace
# This kustomization only maintains:
# - Namespace for legacy resources (if needed)
# - Node exporter for infrastructure metrics
# - PostgreSQL exporter for database metrics
# - Optional OTEL collector (can be disabled if using SigNoz's built-in collector)
resources:
- namespace.yaml
- secrets.yaml
# Exporters for metrics collection
- node-exporter.yaml
- postgres-exporter.yaml
# Optional: Keep OTEL collector or use SigNoz's built-in one
# Uncomment if you want a dedicated OTEL collector in monitoring namespace
# - otel-collector.yaml

View File

@@ -1,7 +0,0 @@
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
name: monitoring
app.kubernetes.io/part-of: bakery-ia

View File

@@ -1,103 +0,0 @@
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: monitoring
labels:
app: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
app: node-exporter
spec:
hostNetwork: true
hostPID: true
nodeSelector:
kubernetes.io/os: linux
tolerations:
# Run on all nodes including master
- operator: Exists
effect: NoSchedule
containers:
- name: node-exporter
image: quay.io/prometheus/node-exporter:v1.7.0
args:
- '--path.sysfs=/host/sys'
- '--path.rootfs=/host/root'
- '--path.procfs=/host/proc'
- '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)'
- '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
- '--collector.netclass.ignored-devices=^(veth.*|[a-f0-9]{15})$'
- '--collector.netdev.device-exclude=^(veth.*|[a-f0-9]{15})$'
- '--web.listen-address=:9100'
ports:
- containerPort: 9100
protocol: TCP
name: metrics
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "128Mi"
cpu: "200m"
volumeMounts:
- name: sys
mountPath: /host/sys
mountPropagation: HostToContainer
readOnly: true
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
- name: proc
mountPath: /host/proc
mountPropagation: HostToContainer
readOnly: true
securityContext:
runAsNonRoot: true
runAsUser: 65534
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
volumes:
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
- name: proc
hostPath:
path: /proc
---
apiVersion: v1
kind: Service
metadata:
name: node-exporter
namespace: monitoring
labels:
app: node-exporter
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9100"
spec:
clusterIP: None
ports:
- name: metrics
port: 9100
protocol: TCP
targetPort: 9100
selector:
app: node-exporter

View File

@@ -1,167 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: otel-collector-config
namespace: monitoring
data:
otel-collector-config.yaml: |
extensions:
health_check:
endpoint: 0.0.0.0:13133
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 10s
send_batch_size: 1024
# Memory limiter to prevent OOM
memory_limiter:
check_interval: 1s
limit_mib: 512
spike_limit_mib: 128
exporters:
# Export metrics to Prometheus
prometheus:
endpoint: "0.0.0.0:8889"
namespace: otelcol
const_labels:
source: otel-collector
# Export to SigNoz
otlp/signoz:
endpoint: "signoz-query-service.monitoring.svc.cluster.local:8080"
tls:
insecure: true
# Logging exporter for debugging traces and logs
logging:
loglevel: info
sampling_initial: 5
sampling_thereafter: 200
service:
extensions: [health_check]
pipelines:
# Traces pipeline: receive -> process -> export to SigNoz
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/signoz, logging]
# Metrics pipeline: receive -> process -> export to both Prometheus and SigNoz
metrics:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [prometheus, otlp/signoz]
# Logs pipeline: receive -> process -> export to SigNoz
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/signoz, logging]
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: otel-collector
namespace: monitoring
labels:
app: otel-collector
spec:
replicas: 1
selector:
matchLabels:
app: otel-collector
template:
metadata:
labels:
app: otel-collector
spec:
containers:
- name: otel-collector
image: otel/opentelemetry-collector-contrib:0.91.0
args:
- --config=/conf/otel-collector-config.yaml
ports:
- containerPort: 4317
protocol: TCP
name: otlp-grpc
- containerPort: 4318
protocol: TCP
name: otlp-http
- containerPort: 8889
protocol: TCP
name: prometheus
- containerPort: 13133
protocol: TCP
name: health-check
volumeMounts:
- name: otel-collector-config
mountPath: /conf
resources:
requests:
memory: "256Mi"
cpu: "100m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: otel-collector-config
configMap:
name: otel-collector-config
items:
- key: otel-collector-config.yaml
path: otel-collector-config.yaml
---
apiVersion: v1
kind: Service
metadata:
name: otel-collector
namespace: monitoring
labels:
app: otel-collector
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8889"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
ports:
- port: 4317
targetPort: 4317
protocol: TCP
name: otlp-grpc
- port: 4318
targetPort: 4318
protocol: TCP
name: otlp-http
- port: 8889
targetPort: 8889
protocol: TCP
name: prometheus
selector:
app: otel-collector

View File

@@ -1,306 +0,0 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgres-exporter
namespace: monitoring
labels:
app: postgres-exporter
spec:
replicas: 1
selector:
matchLabels:
app: postgres-exporter
template:
metadata:
labels:
app: postgres-exporter
spec:
containers:
- name: postgres-exporter
image: prometheuscommunity/postgres-exporter:v0.15.0
ports:
- containerPort: 9187
name: metrics
env:
- name: DATA_SOURCE_NAME
valueFrom:
secretKeyRef:
name: postgres-exporter
key: data-source-name
# Enable extended metrics
- name: PG_EXPORTER_EXTEND_QUERY_PATH
value: "/etc/postgres-exporter/queries.yaml"
# Disable default metrics (we'll use custom ones)
- name: PG_EXPORTER_DISABLE_DEFAULT_METRICS
value: "false"
# Disable settings metrics (can be noisy)
- name: PG_EXPORTER_DISABLE_SETTINGS_METRICS
value: "false"
volumeMounts:
- name: queries
mountPath: /etc/postgres-exporter
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "128Mi"
cpu: "200m"
livenessProbe:
httpGet:
path: /
port: 9187
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /
port: 9187
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: queries
configMap:
name: postgres-exporter-queries
---
apiVersion: v1
kind: ConfigMap
metadata:
name: postgres-exporter-queries
namespace: monitoring
data:
queries.yaml: |
# Custom PostgreSQL queries for bakery-ia metrics
pg_database:
query: |
SELECT
datname,
numbackends as connections,
xact_commit as transactions_committed,
xact_rollback as transactions_rolled_back,
blks_read as blocks_read,
blks_hit as blocks_hit,
tup_returned as tuples_returned,
tup_fetched as tuples_fetched,
tup_inserted as tuples_inserted,
tup_updated as tuples_updated,
tup_deleted as tuples_deleted,
conflicts as conflicts,
temp_files as temp_files,
temp_bytes as temp_bytes,
deadlocks as deadlocks
FROM pg_stat_database
WHERE datname NOT IN ('template0', 'template1', 'postgres')
metrics:
- datname:
usage: "LABEL"
description: "Name of the database"
- connections:
usage: "GAUGE"
description: "Number of backends currently connected to this database"
- transactions_committed:
usage: "COUNTER"
description: "Number of transactions in this database that have been committed"
- transactions_rolled_back:
usage: "COUNTER"
description: "Number of transactions in this database that have been rolled back"
- blocks_read:
usage: "COUNTER"
description: "Number of disk blocks read in this database"
- blocks_hit:
usage: "COUNTER"
description: "Number of times disk blocks were found in the buffer cache"
- tuples_returned:
usage: "COUNTER"
description: "Number of rows returned by queries in this database"
- tuples_fetched:
usage: "COUNTER"
description: "Number of rows fetched by queries in this database"
- tuples_inserted:
usage: "COUNTER"
description: "Number of rows inserted by queries in this database"
- tuples_updated:
usage: "COUNTER"
description: "Number of rows updated by queries in this database"
- tuples_deleted:
usage: "COUNTER"
description: "Number of rows deleted by queries in this database"
- conflicts:
usage: "COUNTER"
description: "Number of queries canceled due to conflicts with recovery"
- temp_files:
usage: "COUNTER"
description: "Number of temporary files created by queries"
- temp_bytes:
usage: "COUNTER"
description: "Total amount of data written to temporary files by queries"
- deadlocks:
usage: "COUNTER"
description: "Number of deadlocks detected in this database"
pg_replication:
query: |
SELECT
CASE WHEN pg_is_in_recovery() THEN 1 ELSE 0 END as is_replica,
EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::INT as lag_seconds
metrics:
- is_replica:
usage: "GAUGE"
description: "1 if this is a replica, 0 if primary"
- lag_seconds:
usage: "GAUGE"
description: "Replication lag in seconds (only on replicas)"
pg_slow_queries:
query: |
SELECT
datname,
usename,
state,
COUNT(*) as count,
MAX(EXTRACT(EPOCH FROM (now() - query_start))) as max_duration_seconds
FROM pg_stat_activity
WHERE state != 'idle'
AND query NOT LIKE '%pg_stat_activity%'
AND query_start < now() - interval '30 seconds'
GROUP BY datname, usename, state
metrics:
- datname:
usage: "LABEL"
description: "Database name"
- usename:
usage: "LABEL"
description: "User name"
- state:
usage: "LABEL"
description: "Query state"
- count:
usage: "GAUGE"
description: "Number of slow queries"
- max_duration_seconds:
usage: "GAUGE"
description: "Maximum query duration in seconds"
pg_table_stats:
query: |
SELECT
schemaname,
relname,
seq_scan,
seq_tup_read,
idx_scan,
idx_tup_fetch,
n_tup_ins,
n_tup_upd,
n_tup_del,
n_tup_hot_upd,
n_live_tup,
n_dead_tup,
n_mod_since_analyze,
last_vacuum,
last_autovacuum,
last_analyze,
last_autoanalyze
FROM pg_stat_user_tables
WHERE schemaname = 'public'
ORDER BY n_live_tup DESC
LIMIT 20
metrics:
- schemaname:
usage: "LABEL"
description: "Schema name"
- relname:
usage: "LABEL"
description: "Table name"
- seq_scan:
usage: "COUNTER"
description: "Number of sequential scans"
- seq_tup_read:
usage: "COUNTER"
description: "Number of tuples read by sequential scans"
- idx_scan:
usage: "COUNTER"
description: "Number of index scans"
- idx_tup_fetch:
usage: "COUNTER"
description: "Number of tuples fetched by index scans"
- n_tup_ins:
usage: "COUNTER"
description: "Number of tuples inserted"
- n_tup_upd:
usage: "COUNTER"
description: "Number of tuples updated"
- n_tup_del:
usage: "COUNTER"
description: "Number of tuples deleted"
- n_tup_hot_upd:
usage: "COUNTER"
description: "Number of tuples HOT updated"
- n_live_tup:
usage: "GAUGE"
description: "Estimated number of live rows"
- n_dead_tup:
usage: "GAUGE"
description: "Estimated number of dead rows"
- n_mod_since_analyze:
usage: "GAUGE"
description: "Number of rows modified since last analyze"
pg_locks:
query: |
SELECT
mode,
locktype,
COUNT(*) as count
FROM pg_locks
GROUP BY mode, locktype
metrics:
- mode:
usage: "LABEL"
description: "Lock mode"
- locktype:
usage: "LABEL"
description: "Lock type"
- count:
usage: "GAUGE"
description: "Number of locks"
pg_connection_pool:
query: |
SELECT
state,
COUNT(*) as count,
MAX(EXTRACT(EPOCH FROM (now() - state_change))) as max_state_duration_seconds
FROM pg_stat_activity
GROUP BY state
metrics:
- state:
usage: "LABEL"
description: "Connection state"
- count:
usage: "GAUGE"
description: "Number of connections in this state"
- max_state_duration_seconds:
usage: "GAUGE"
description: "Maximum time a connection has been in this state"
---
apiVersion: v1
kind: Service
metadata:
name: postgres-exporter
namespace: monitoring
labels:
app: postgres-exporter
spec:
type: ClusterIP
ports:
- port: 9187
targetPort: 9187
protocol: TCP
name: metrics
selector:
app: postgres-exporter

View File

@@ -1,52 +0,0 @@
---
# NOTE: This file contains example secrets for development.
# For production, use one of the following:
# 1. Sealed Secrets (bitnami-labs/sealed-secrets)
# 2. External Secrets Operator
# 3. HashiCorp Vault
# 4. Cloud provider secret managers (AWS Secrets Manager, GCP Secret Manager, Azure Key Vault)
#
# NEVER commit real production secrets to git!
apiVersion: v1
kind: Secret
metadata:
name: grafana-admin
namespace: monitoring
type: Opaque
stringData:
admin-user: admin
# CHANGE THIS PASSWORD IN PRODUCTION!
# Generate with: openssl rand -base64 32
admin-password: "CHANGE_ME_IN_PRODUCTION"
---
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-secrets
namespace: monitoring
type: Opaque
stringData:
# SMTP configuration for email alerts
# CHANGE THESE VALUES IN PRODUCTION!
smtp-host: "smtp.gmail.com:587"
smtp-username: "alerts@yourdomain.com"
smtp-password: "CHANGE_ME_IN_PRODUCTION"
smtp-from: "alerts@yourdomain.com"
# Slack webhook URL (optional)
slack-webhook-url: "https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
---
apiVersion: v1
kind: Secret
metadata:
name: postgres-exporter
namespace: monitoring
type: Opaque
stringData:
# PostgreSQL connection string
# Format: postgresql://username:password@hostname:port/database?sslmode=disable
# CHANGE THIS IN PRODUCTION!
data-source-name: "postgresql://postgres:postgres@postgres.bakery-ia:5432/bakery?sslmode=disable"

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: notification-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "notification-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orchestrator-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "orchestrator-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: orders-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "orders-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: pos-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "pos-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: procurement-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "procurement-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: production-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "production-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: recipes-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "recipes-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: sales-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "sales-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: suppliers-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "suppliers-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: tenant-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "tenant-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -19,6 +19,8 @@ spec:
app.kubernetes.io/name: training-service
app.kubernetes.io/component: microservice
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
# Wait for Redis to be ready
- name: wait-for-redis
@@ -92,6 +94,26 @@ spec:
ports:
- containerPort: 8000
name: http
env:
# OpenTelemetry Configuration
- name: OTEL_COLLECTOR_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4318"
- name: OTEL_SERVICE_NAME
value: "training-service"
- name: ENABLE_TRACING
value: "true"
# Logging Configuration
- name: OTEL_LOGS_EXPORTER
value: "otlp"
- name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED
value: "true"
# Metrics Configuration
- name: ENABLE_OTEL_METRICS
value: "true"
- name: ENABLE_SYSTEM_METRICS
value: "true"
envFrom:
- configMapRef:
name: bakery-config

View File

@@ -17,6 +17,8 @@ spec:
labels:
app: demo-cleanup
spec:
imagePullSecrets:
- name: dockerhub-creds
template:
metadata:
labels:

View File

@@ -22,6 +22,8 @@ spec:
app: external-service
job: data-rotation
spec:
imagePullSecrets:
- name: dockerhub-creds
ttlSecondsAfterFinished: 172800
backoffLimit: 2

View File

@@ -19,6 +19,8 @@ spec:
component: background-jobs
service: demo-session
spec:
imagePullSecrets:
- name: dockerhub-creds
containers:
- name: worker
image: bakery/demo-session-service

View File

@@ -20,25 +20,23 @@ metadata:
nginx.ingress.kubernetes.io/upstream-keepalive-timeout: "3600"
# WebSocket upgrade support
nginx.ingress.kubernetes.io/websocket-services: "gateway-service"
# CORS configuration for HTTPS and local development
# CORS configuration for HTTPS
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakery-ia.local,https://api.bakery-ia.local,https://monitoring.bakery-ia.local,https://localhost"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://your-domain.com" # To be overridden in overlays
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin, Cache-Control"
nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
# Cert-manager annotations for automatic certificate issuance
cert-manager.io/cluster-issuer: "letsencrypt-staging"
cert-manager.io/acme-challenge-type: http01
# Using issuer appropriate for environment
cert-manager.io/cluster-issuer: "letsencrypt-prod" # To be overridden in dev overlay
spec:
ingressClassName: nginx
tls:
- hosts:
- bakery-ia.local
- api.bakery-ia.local
- monitoring.bakery-ia.local
secretName: bakery-ia-tls-cert
- your-domain.com # To be overridden in overlays
secretName: bakery-tls-cert # To be overridden in overlays
rules:
- host: bakery-ia.local
- host: your-domain.com # To be overridden in overlays
http:
paths:
- path: /
@@ -55,7 +53,7 @@ spec:
name: gateway-service
port:
number: 8000
- host: api.bakery-ia.local
- host: api.your-domain.com # To be overridden in overlays
http:
paths:
- path: /
@@ -65,20 +63,22 @@ spec:
name: gateway-service
port:
number: 8000
- host: monitoring.bakery-ia.local
- host: monitoring.your-domain.com # To be overridden in overlays
http:
paths:
- path: /grafana
pathType: Prefix
# SigNoz Frontend UI and API (consolidated in newer versions)
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: grafana-service
name: signoz
port:
number: 3000
- path: /prometheus
pathType: Prefix
number: 8080
# SigNoz API endpoints
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: prometheus-service
name: signoz
port:
number: 9090
number: 8080
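The base ingress now carries your-domain.com placeholders. A sketch of how an overlay could swap in the real host via a kustomize JSON patch; the overlay path, patch file name, and bakewise.ai host are assumptions, and the patch must be referenced from the overlay kustomization with a target selecting this Ingress:
# Hypothetical overlay patch replacing the placeholder host and CORS origin
cat > overlays/prod/ingress-host-patch.yaml <<'EOF'
- op: replace
  path: /spec/rules/0/host
  value: bakewise.ai
- op: replace
  path: /spec/tls/0/hosts/0
  value: bakewise.ai
- op: replace
  path: /metadata/annotations/nginx.ingress.kubernetes.io~1cors-allow-origin
  value: https://bakewise.ai
EOF
# Render without applying to confirm the placeholders were replaced
kustomize build overlays/prod | grep -E 'host:|cors-allow-origin'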

View File

@@ -17,6 +17,8 @@ spec:
app: external-service
job: data-init
spec:
imagePullSecrets:
- name: dockerhub-creds
restartPolicy: OnFailure
initContainers:

View File

@@ -15,6 +15,8 @@ spec:
app.kubernetes.io/name: nominatim-init
app.kubernetes.io/component: data-init
spec:
imagePullSecrets:
- name: dockerhub-creds
restartPolicy: OnFailure
containers:
- name: nominatim-import

View File

@@ -66,6 +66,10 @@ resources:
# Persistent storage
- components/volumes/model-storage-pvc.yaml
# Cert manager cluster issuers
- components/cert-manager/cluster-issuer-staging.yaml
- components/cert-manager/local-ca-issuer.yaml
# Database services
- components/databases/auth-db.yaml
- components/databases/tenant-db.yaml

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: ai-insights-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: alert-processor-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: auth-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -29,4 +29,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: demo-seed-sa
namespace: bakery-ia
namespace: bakery-ia

View File

@@ -15,6 +15,8 @@ spec:
app.kubernetes.io/name: demo-session-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: distribution-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: external-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: forecasting-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: inventory-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: notification-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: orchestrator-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: orders-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: pos-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: procurement-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: production-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: recipes-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: sales-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: suppliers-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: tenant-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: tenant-seed-pilot-coupon
app.kubernetes.io/component: seed
spec:
imagePullSecrets:
- name: dockerhub-creds
serviceAccountName: demo-seed-sa
initContainers:
- name: wait-for-tenant-migration

View File

@@ -16,6 +16,8 @@ spec:
app.kubernetes.io/name: training-migration
app.kubernetes.io/component: migration
spec:
imagePullSecrets:
- name: dockerhub-creds
initContainers:
- name: wait-for-db
image: postgres:17-alpine

View File

@@ -1,29 +0,0 @@
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: selfsigned-issuer
spec:
selfSigned: {}
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-staging
spec:
acme:
# The ACME server URL (Let's Encrypt staging)
server: https://acme-staging-v02.api.letsencrypt.org/directory
# Email address used for ACME registration
email: admin@bakery-ia.local # Change this to your email
# Name of a secret used to store the ACME account private key
privateKeySecretRef:
name: letsencrypt-staging
# Enable the HTTP-01 challenge provider
solvers:
- http01:
ingress:
class: nginx
podTemplate:
spec:
nodeSelector:
"kubernetes.io/os": linux

View File

@@ -24,6 +24,7 @@ spec:
- localhost
- bakery-ia.local
- api.bakery-ia.local
- monitoring.bakery-ia.local
- "*.bakery-ia.local"
# IP addresses (for localhost)
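A quick way to confirm the reissued dev certificate actually carries the new monitoring SAN; the secret name is taken from the dev ingress below and may differ:
kubectl -n bakery-ia get secret bakery-dev-tls-cert -o jsonpath='{.data.tls\.crt}' \
  | base64 -d \
  | openssl x509 -noout -text \
  | grep -A1 'Subject Alternative Name'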

View File

@@ -36,6 +36,7 @@ spec:
- hosts:
- localhost
- bakery-ia.local
- monitoring.bakery-ia.local
secretName: bakery-dev-tls-cert
rules:
- host: localhost
@@ -54,4 +55,32 @@ spec:
service:
name: gateway-service
port:
number: 8000
number: 8000
- host: bakery-ia.local
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: frontend-service
port:
number: 3000
- path: /api
pathType: Prefix
backend:
service:
name: gateway-service
port:
number: 8000
- host: monitoring.bakery-ia.local
http:
paths:
# SigNoz Frontend UI
- path: /
pathType: Prefix
backend:
service:
name: signoz
port:
number: 8080
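With monitoring folded into the main dev ingress, local access normally needs hosts-file entries pointing at the ingress controller. A sketch assuming a local cluster reachable on 127.0.0.1:
echo "127.0.0.1 bakery-ia.local api.bakery-ia.local monitoring.bakery-ia.local" | sudo tee -a /etc/hosts
# -k because the dev certificate comes from a local/self-signed issuer
curl -vk https://monitoring.bakery-ia.local/ -o /dev/null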

View File

@@ -9,15 +9,12 @@ metadata:
resources:
- ../../base
# Monitoring enabled for dev environment
- ../../base/components/monitoring
- dev-ingress.yaml
# SigNoz ingress is applied by Tilt (see Tiltfile)
# - signoz-ingress.yaml
# SigNoz is managed via Helm deployment (see Tiltfile signoz-deploy)
# Monitoring is handled by SigNoz (no separate monitoring components needed)
# Dev-Prod Parity: Enable HTTPS with self-signed certificates
- dev-certificate.yaml
- monitoring-certificate.yaml
- cluster-issuer-staging.yaml
# SigNoz paths are now included in the main ingress (ingress-https.yaml)
# Exclude nominatim from dev to save resources
# Using scale to 0 for StatefulSet to prevent pod creation
@@ -611,39 +608,6 @@ patches:
limits:
memory: "512Mi"
cpu: "300m"
# Optional exporters resource patches for dev
- target:
group: apps
version: v1
kind: DaemonSet
name: node-exporter
namespace: monitoring
patch: |-
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "32Mi"
cpu: "25m"
limits:
memory: "64Mi"
cpu: "100m"
- target:
group: apps
version: v1
kind: Deployment
name: postgres-exporter
namespace: monitoring
patch: |-
- op: replace
path: /spec/template/spec/containers/0/resources
value:
requests:
memory: "32Mi"
cpu: "25m"
limits:
memory: "64Mi"
cpu: "100m"
secretGenerator:
- name: dev-secrets

View File

@@ -1,49 +0,0 @@
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: bakery-dev-monitoring-tls-cert
namespace: monitoring
spec:
# Self-signed certificate for local development
secretName: bakery-ia-tls-cert
# Certificate duration
duration: 2160h # 90 days
renewBefore: 360h # 15 days
# Subject configuration
subject:
organizations:
- Bakery IA Development
# Common name
commonName: localhost
# DNS names this certificate is valid for
dnsNames:
- localhost
- monitoring.bakery-ia.local
# IP addresses (for localhost)
ipAddresses:
- 127.0.0.1
- ::1
# Use self-signed issuer for development
issuerRef:
name: selfsigned-issuer
kind: ClusterIssuer
group: cert-manager.io
# Private key configuration
privateKey:
algorithm: RSA
encoding: PKCS1
size: 2048
# Usages
usages:
- server auth
- client auth
- digital signature
- key encipherment

View File

@@ -1,39 +0,0 @@
---
# SigNoz Ingress for Development (localhost)
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: signoz-ingress-localhost
namespace: signoz
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
spec:
ingressClassName: nginx
tls:
- hosts:
- localhost
secretName: bakery-ia-tls-cert
rules:
- host: localhost
http:
paths:
# SigNoz Frontend UI
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-frontend
port:
number: 3301
# SigNoz Query Service API
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-query-service
port:
number: 8080

View File

@@ -8,13 +8,13 @@ namespace: bakery-ia
resources:
- ../../base
- ../../base/components/monitoring
- prod-ingress.yaml
- prod-configmap.yaml
# SigNoz is managed via Helm deployment (see infrastructure/helm/deploy-signoz.sh)
# Monitoring is handled by SigNoz (no separate monitoring components needed)
# SigNoz paths are now included in the main ingress (ingress-https.yaml)
patchesStrategicMerge:
- storage-patch.yaml
- monitoring-ingress-patch.yaml
labels:
- includeSelectors: true
@@ -22,8 +22,83 @@ labels:
environment: production
tier: production
# SigNoz resource patches for production
# Production configuration patches
patches:
# Override ConfigMap values for production
- target:
kind: ConfigMap
name: bakery-config
patch: |-
- op: replace
path: /data/ENVIRONMENT
value: "production"
- op: replace
path: /data/DEBUG
value: "false"
- op: replace
path: /data/LOG_LEVEL
value: "INFO"
- op: replace
path: /data/PROFILING_ENABLED
value: "false"
- op: replace
path: /data/MOCK_EXTERNAL_APIS
value: "false"
- op: add
path: /data/REQUEST_TIMEOUT
value: "30"
- op: add
path: /data/MAX_CONNECTIONS
value: "100"
- op: replace
path: /data/ENABLE_TRACING
value: "true"
- op: replace
path: /data/ENABLE_METRICS
value: "true"
- op: replace
path: /data/ENABLE_LOGS
value: "true"
- op: add
path: /data/OTEL_EXPORTER_OTLP_ENDPOINT
value: "http://signoz-otel-collector.signoz.svc.cluster.local:4317"
- op: add
path: /data/OTEL_EXPORTER_OTLP_PROTOCOL
value: "grpc"
- op: add
path: /data/OTEL_SERVICE_NAME
value: "bakery-ia"
- op: add
path: /data/OTEL_RESOURCE_ATTRIBUTES
value: "deployment.environment=production,cluster.name=bakery-ia-prod"
- op: add
path: /data/SIGNOZ_ENDPOINT
value: "http://signoz-query-service.signoz.svc.cluster.local:8080"
- op: add
path: /data/SIGNOZ_FRONTEND_URL
value: "https://monitoring.bakewise.ai/signoz"
- op: add
path: /data/SIGNOZ_ROOT_URL
value: "https://monitoring.bakewise.ai/signoz"
- op: add
path: /data/RATE_LIMIT_ENABLED
value: "true"
- op: add
path: /data/RATE_LIMIT_PER_MINUTE
value: "60"
- op: add
path: /data/CORS_ORIGINS
value: "https://bakewise.ai"
- op: add
path: /data/CORS_ALLOW_CREDENTIALS
value: "true"
- op: add
path: /data/VITE_API_URL
value: "/api"
- op: add
path: /data/VITE_ENVIRONMENT
value: "production"
# SigNoz resource patches for production
# SigNoz ClickHouse production configuration
- target:
group: apps
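Before applying, a minimal render check that the production ConfigMap overrides land as expected; the overlays/prod path is an assumption based on the kustomization layout above:
kustomize build infrastructure/kubernetes/overlays/prod > /tmp/prod-rendered.yaml
grep -E 'ENVIRONMENT:|DEBUG:|OTEL_EXPORTER_OTLP_ENDPOINT:|CORS_ORIGINS:' /tmp/prod-rendered.yaml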

View File

@@ -60,5 +60,6 @@ spec:
name: gateway-service
port:
number: 8000
# Monitoring (monitoring.bakewise.ai) is now handled by signoz-ingress.yaml in the signoz namespace
# Note: SigNoz monitoring is deployed via Helm in the 'signoz' namespace
# SigNoz creates its own Ingress via Helm chart configuration
# Access at: https://monitoring.bakewise.ai (configured in signoz-values-prod.yaml)

View File

@@ -1,78 +0,0 @@
---
# SigNoz Ingress for Production
# SigNoz is deployed via Helm in the 'signoz' namespace
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: signoz-ingress-prod
namespace: signoz
labels:
app.kubernetes.io/name: signoz
app.kubernetes.io/component: ingress
annotations:
# Nginx ingress controller annotations
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "50m"
nginx.ingress.kubernetes.io/proxy-connect-timeout: "600"
nginx.ingress.kubernetes.io/proxy-send-timeout: "600"
nginx.ingress.kubernetes.io/proxy-read-timeout: "600"
nginx.ingress.kubernetes.io/rewrite-target: /$2
nginx.ingress.kubernetes.io/use-regex: "true"
# CORS configuration
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/cors-allow-origin: "https://bakewise.ai,https://monitoring.bakewise.ai"
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS, PATCH"
nginx.ingress.kubernetes.io/cors-allow-headers: "Content-Type, Authorization, X-Requested-With, Accept, Origin"
nginx.ingress.kubernetes.io/cors-allow-credentials: "true"
# Security headers
nginx.ingress.kubernetes.io/configuration-snippet: |
more_set_headers "X-Frame-Options: SAMEORIGIN";
more_set_headers "X-Content-Type-Options: nosniff";
more_set_headers "X-XSS-Protection: 1; mode=block";
more_set_headers "Referrer-Policy: strict-origin-when-cross-origin";
# Rate limiting
nginx.ingress.kubernetes.io/limit-rps: "100"
nginx.ingress.kubernetes.io/limit-connections: "50"
# Cert-manager annotations for automatic certificate issuance
cert-manager.io/cluster-issuer: "letsencrypt-production"
cert-manager.io/acme-challenge-type: http01
spec:
ingressClassName: nginx
tls:
- hosts:
- monitoring.bakewise.ai
secretName: signoz-prod-tls-cert
rules:
- host: monitoring.bakewise.ai
http:
paths:
# SigNoz Frontend UI
- path: /signoz(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-frontend
port:
number: 3301
# SigNoz Query Service API
- path: /signoz-api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-query-service
port:
number: 8080
# SigNoz AlertManager
- path: /signoz-alerts(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: signoz-alertmanager
port:
number: 9093

View File

@@ -0,0 +1,133 @@
#!/bin/bash
# Setup script for database monitoring with OpenTelemetry and SigNoz
# This script creates monitoring users in PostgreSQL and deploys the collector
set -e
echo "========================================="
echo "Database Monitoring Setup for SigNoz"
echo "========================================="
echo ""
# Configuration
NAMESPACE="bakery-ia"
MONITOR_USER="otel_monitor"
MONITOR_PASSWORD=$(openssl rand -base64 32)
# PostgreSQL databases to monitor
DATABASES=(
"auth-db-service:auth_db"
"inventory-db-service:inventory_db"
"orders-db-service:orders_db"
"tenant-db-service:tenant_db"
"sales-db-service:sales_db"
"production-db-service:production_db"
"recipes-db-service:recipes_db"
"procurement-db-service:procurement_db"
"distribution-db-service:distribution_db"
"forecasting-db-service:forecasting_db"
"external-db-service:external_db"
"suppliers-db-service:suppliers_db"
"pos-db-service:pos_db"
"training-db-service:training_db"
"notification-db-service:notification_db"
"orchestrator-db-service:orchestrator_db"
"ai-insights-db-service:ai_insights_db"
)
echo "Step 1: Creating monitoring user in PostgreSQL databases"
echo "========================================="
echo ""
for db_entry in "${DATABASES[@]}"; do
IFS=':' read -r service dbname <<< "$db_entry"
echo "Creating monitoring user in $dbname..."
# Create monitoring user via kubectl exec
kubectl exec -n "$NAMESPACE" "deployment/${service%-service}" -- psql -U postgres -d "$dbname" -c "
DO \$\$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '$MONITOR_USER') THEN
CREATE USER $MONITOR_USER WITH PASSWORD '$MONITOR_PASSWORD';
GRANT pg_monitor TO $MONITOR_USER;
GRANT CONNECT ON DATABASE $dbname TO $MONITOR_USER;
RAISE NOTICE 'User $MONITOR_USER created successfully';
    ELSE
        -- Keep the freshly generated secret and the existing role in sync on re-runs
        ALTER USER $MONITOR_USER WITH PASSWORD '$MONITOR_PASSWORD';
        RAISE NOTICE 'User $MONITOR_USER already exists; password updated';
END IF;
END
\$\$;
" 2>/dev/null || echo " ⚠️ Warning: Could not create user in $dbname (may already exist or database not ready)"
echo ""
done
echo "✅ Monitoring users created"
echo ""
echo "Step 2: Creating Kubernetes secret for monitoring credentials"
echo "========================================="
echo ""
# Create secret for database monitoring
kubectl create secret generic database-monitor-secrets \
-n "$NAMESPACE" \
--from-literal=POSTGRES_MONITOR_USER="$MONITOR_USER" \
--from-literal=POSTGRES_MONITOR_PASSWORD="$MONITOR_PASSWORD" \
--dry-run=client -o yaml | kubectl apply -f -
echo "✅ Secret created: database-monitor-secrets"
echo ""
echo "Step 3: Deploying OpenTelemetry collector for database monitoring"
echo "========================================="
echo ""
kubectl apply -f infrastructure/kubernetes/base/monitoring/database-otel-collector.yaml
echo "✅ Database monitoring collector deployed"
echo ""
echo "Step 4: Waiting for collector to be ready"
echo "========================================="
echo ""
kubectl wait --for=condition=available --timeout=60s \
deployment/database-otel-collector -n "$NAMESPACE"
echo "✅ Collector is ready"
echo ""
echo "========================================="
echo "Database Monitoring Setup Complete!"
echo "========================================="
echo ""
echo "What's been configured:"
echo " ✅ Monitoring user created in all PostgreSQL databases"
echo " ✅ OpenTelemetry collector deployed for database metrics"
echo " ✅ Metrics exported to SigNoz"
echo ""
echo "Metrics being collected:"
echo " 📊 PostgreSQL: connections, commits, rollbacks, deadlocks, table sizes"
echo " 📊 Redis: memory usage, keyspace hits/misses, connected clients"
echo " 📊 RabbitMQ: queue depth, message rates, consumer count"
echo ""
echo "Next steps:"
echo " 1. Check collector logs:"
echo " kubectl logs -n $NAMESPACE deployment/database-otel-collector"
echo ""
echo " 2. View metrics in SigNoz:"
echo " - Go to https://monitoring.bakery-ia.local"
echo " - Create dashboard with queries like:"
echo " * postgresql.backends (connections)"
echo " * postgresql.database.size (database size)"
echo " * redis.memory.used (Redis memory)"
echo " * rabbitmq.message.current (queue depth)"
echo ""
echo " 3. Create alerts for:"
echo " - High connection count (approaching max_connections)"
echo " - Slow query detection (via application traces)"
echo " - High Redis memory usage"
echo " - RabbitMQ queue buildup"
echo ""

View File

@@ -0,0 +1,65 @@
#!/bin/bash
# Setup Docker Hub image pull secrets for all namespaces
# This script creates docker-registry secrets for pulling images from Docker Hub
set -e
# Docker Hub credentials
DOCKER_SERVER="docker.io"
DOCKER_USERNAME="uals"
DOCKER_PASSWORD="dckr_pat_zzEY5Q58x1S0puraIoKEtbpue3A"
DOCKER_EMAIL="ualfaro@gmail.com"
SECRET_NAME="dockerhub-creds"
# List of namespaces used in the project
NAMESPACES=(
"bakery-ia"
"bakery-ia-dev"
"bakery-ia-prod"
"default"
)
echo "Setting up Docker Hub image pull secrets..."
echo "==========================================="
echo ""
for namespace in "${NAMESPACES[@]}"; do
echo "Processing namespace: $namespace"
# Create namespace if it doesn't exist
if ! kubectl get namespace "$namespace" >/dev/null 2>&1; then
echo " Creating namespace: $namespace"
kubectl create namespace "$namespace"
fi
# Delete existing secret if it exists
if kubectl get secret "$SECRET_NAME" -n "$namespace" >/dev/null 2>&1; then
echo " Deleting existing secret in namespace: $namespace"
kubectl delete secret "$SECRET_NAME" -n "$namespace"
fi
# Create the docker-registry secret
echo " Creating Docker Hub secret in namespace: $namespace"
kubectl create secret docker-registry "$SECRET_NAME" \
--docker-server="$DOCKER_SERVER" \
--docker-username="$DOCKER_USERNAME" \
--docker-password="$DOCKER_PASSWORD" \
--docker-email="$DOCKER_EMAIL" \
-n "$namespace"
echo " ✓ Secret created successfully in namespace: $namespace"
echo ""
done
echo "==========================================="
echo "Docker Hub secrets setup completed!"
echo ""
echo "The secret '$SECRET_NAME' has been created in all namespaces:"
for namespace in "${NAMESPACES[@]}"; do
echo " - $namespace"
done
echo ""
echo "Next steps:"
echo "1. Apply Kubernetes manifests with imagePullSecrets configured"
echo "2. Verify pods can pull images: kubectl get pods -A"