diff --git a/PRODUCTION_DEPLOYMENT_GUIDE.md b/PRODUCTION_DEPLOYMENT_GUIDE.md index 915a0a32..7200cbdf 100644 --- a/PRODUCTION_DEPLOYMENT_GUIDE.md +++ b/PRODUCTION_DEPLOYMENT_GUIDE.md @@ -1047,8 +1047,15 @@ kubectl exec -n bakery-ia deployment/gateway -- curl -s http://localhost:8000/he ### Step 7.1: Deploy Unbound DNS (Required for Mailu) +> **Why Unbound?** Mailu requires DNSSEC validation for email security (DKIM/SPF/DMARC via rspamd). +> CoreDNS does NOT support DNSSEC natively, so Unbound provides this capability. + ```bash -# Deploy Unbound DNS resolver +# Clean up any stuck Unbound deployments from previous attempts +kubectl delete deployment -n bakery-ia -l app.kubernetes.io/name=unbound --ignore-not-found + +# Deploy Unbound DNS resolver with minimal resources +# Note: prod/values.yaml uses 50m CPU, 64Mi memory - very lightweight helm upgrade --install unbound infrastructure/platform/networking/dns/unbound-helm \ -n bakery-ia \ -f infrastructure/platform/networking/dns/unbound-helm/values.yaml \ @@ -1056,26 +1063,88 @@ helm upgrade --install unbound infrastructure/platform/networking/dns/unbound-he --timeout 5m \ --wait -# Get Unbound service IP +# Verify Unbound pod is running +kubectl get pods -n bakery-ia -l app.kubernetes.io/name=unbound +# Expected: 1/1 Running + +# Get Unbound service IP (will be used in subsequent steps) UNBOUND_IP=$(kubectl get svc unbound-dns -n bakery-ia -o jsonpath='{.spec.clusterIP}') echo "Unbound DNS IP: $UNBOUND_IP" +# Save this IP - you'll need it for Step 7.2 and 7.3 + +# Test Unbound is working (from inside the cluster) +kubectl run -it --rm dns-test --image=busybox --restart=Never -- \ + nslookup google.com $UNBOUND_IP +# Expected: Should resolve google.com successfully ``` -### Step 7.2: Configure CoreDNS for DNSSEC +**Troubleshooting Unbound:** ```bash -# Patch CoreDNS to forward to Unbound +# If pod is Pending, check resources +kubectl describe pod -n bakery-ia -l app.kubernetes.io/name=unbound | grep -A 5 Events + +# 
Check node resource availability +kubectl describe node | grep -A 10 "Allocated resources" + +# If resources are exhausted, scale down non-critical services temporarily +kubectl scale deployment signoz-frontend -n bakery-ia --replicas=0 +``` + +### Step 7.2: Configure CoreDNS (Choose ONE Option) + +> **Architecture Decision:** You have two options for DNS configuration. +> Choose based on your cluster size and requirements. + +#### Option A: Mailu-Only DNSSEC (Recommended for Single-Node) + +Only Mailu pods use Unbound for DNSSEC. CoreDNS uses public DNS for everything else. +This is simpler and avoids making Unbound a single point of failure for the entire cluster. + +```bash +# Ensure CoreDNS uses public DNS (8.8.8.8, 1.1.1.1) +# This is likely already the default, but verify: +kubectl get configmap coredns -n kube-system -o yaml | grep forward + +# If it shows forwarding to Unbound IP, restore to public DNS: +kubectl patch configmap coredns -n kube-system --type merge -p '{ + "data": { + "Corefile": ".:53 {\n errors\n health {\n lameduck 5s\n }\n ready\n kubernetes cluster.local in-addr.arpa ip6.arpa {\n pods insecure\n fallthrough in-addr.arpa ip6.arpa\n ttl 30\n }\n prometheus :9153\n forward . 8.8.8.8 1.1.1.1 {\n max_concurrent 1000\n }\n cache 30\n loop\n reload\n loadbalance\n}\n" + } +}' + +kubectl rollout restart deployment coredns -n kube-system +kubectl rollout status deployment coredns -n kube-system --timeout=60s +``` + +#### Option B: Cluster-Wide DNSSEC (For Multi-Node HA) + +All cluster DNS queries go through Unbound. Provides DNSSEC for all pods. +Only use this if you have multiple Unbound replicas for high availability. 
+ +```bash +# Get Unbound IP +UNBOUND_IP=$(kubectl get svc unbound-dns -n bakery-ia -o jsonpath='{.spec.clusterIP}') + +# Patch CoreDNS to forward ALL external queries to Unbound kubectl patch configmap coredns -n kube-system --type merge -p "{ \"data\": { \"Corefile\": \".:53 {\\n errors\\n health {\\n lameduck 5s\\n }\\n ready\\n kubernetes cluster.local in-addr.arpa ip6.arpa {\\n pods insecure\\n fallthrough in-addr.arpa ip6.arpa\\n ttl 30\\n }\\n prometheus :9153\\n forward . $UNBOUND_IP {\\n max_concurrent 1000\\n }\\n cache 30\\n loop\\n reload\\n loadbalance\\n}\\n\" } }" -# Restart CoreDNS kubectl rollout restart deployment coredns -n kube-system kubectl rollout status deployment coredns -n kube-system --timeout=60s ``` +**Verify DNS is working:** + +```bash +# Test DNS resolution from a pod +kubectl run -it --rm dns-test --image=busybox --restart=Never -- nslookup google.com +# Expected: Should resolve successfully +``` + ### Step 7.3: Deploy Mailu Email Server ```bash @@ -1084,27 +1153,34 @@ helm repo add mailu https://mailu.github.io/helm-charts helm repo update # Apply Mailu configuration secrets -# These are pre-configured with secure defaults kubectl apply -f infrastructure/platform/mail/mailu-helm/configs/mailu-admin-credentials-secret.yaml -n bakery-ia kubectl apply -f infrastructure/platform/mail/mailu-helm/configs/mailu-certificates-secret.yaml -n bakery-ia +# Get Unbound DNS IP dynamically +UNBOUND_IP=$(kubectl get svc unbound-dns -n bakery-ia -o jsonpath='{.spec.clusterIP}') +echo "Using Unbound DNS IP: $UNBOUND_IP" + # Install Mailu with production configuration -# The Helm chart uses the pre-configured secrets for admin credentials and TLS certificates +# The --set flag dynamically passes the Unbound IP for DNSSEC validation helm upgrade --install mailu mailu/mailu \ -n bakery-ia \ -f infrastructure/platform/mail/mailu-helm/values.yaml \ -f infrastructure/platform/mail/mailu-helm/prod/values.yaml \ + --set 
global.custom_dns_servers="$UNBOUND_IP" \ + --set "admin.dnsConfig.nameservers[0]=$UNBOUND_IP" \ + --timeout 10m -# Wait for Mailu to be ready +# Wait for Mailu to be ready (may take 5-10 minutes) kubectl wait --for=condition=available --timeout=600s deployment/mailu-front -n bakery-ia # Verify Mailu pods are running kubectl get pods -n bakery-ia | grep mailu -# Get the admin password from the pre-configured secret +# Get the admin password MAILU_ADMIN_PASSWORD=$(kubectl get secret mailu-admin-credentials -n bakery-ia -o jsonpath='{.data.password}' | base64 -d) +echo "============================================" echo "Mailu Admin Password: $MAILU_ADMIN_PASSWORD" +echo "============================================" echo "⚠️ SAVE THIS PASSWORD SECURELY!" # Check Mailu initialization status @@ -1113,53 +1189,46 @@ kubectl logs -n bakery-ia deployment/mailu-front --tail=10 > **Important Notes about Mailu Deployment:** > -> 1. **Pre-Configured Secrets:** Mailu uses pre-configured secrets for admin credentials and TLS certificates. These are defined in the configuration files. +> 1. **Pre-Configured Secrets:** Mailu uses pre-configured secrets for admin credentials and TLS certificates. > -> 2. **Password Management:** The admin password is stored in `mailu-admin-credentials-secret.yaml`. For production, you should update this with a secure password before deployment. +> 2. **Password Management:** Update `mailu-admin-credentials-secret.yaml` with a secure password before deployment. > -> 3. **TLS Certificates:** The self-signed certificates in `mailu-certificates-secret.yaml` are for initial setup. For production, replace these with proper certificates from cert-manager (see Step 7.3.1). +> 3. **TLS Certificates:** Self-signed certificates are used internally. External traffic uses Let's Encrypt via Ingress. > -> 4. **Initialization Time:** Mailu may take 5-10 minutes to fully initialize. During this time, some pods may restart as the system configures itself. +> 4. 
**Initialization Time:** Mailu may take 5-10 minutes to fully initialize. Pods may restart during setup. > > 5. **Accessing Mailu:** > - Webmail: `https://mail.bakewise.ai/webmail` > - Admin Interface: `https://mail.bakewise.ai/admin` > - Username: `admin@bakewise.ai` -> - Password: (from `mailu-admin-credentials-secret.yaml`) +> - Password: (from secret above) > -> 6. **Mailgun Relay:** The production configuration includes Mailgun SMTP relay. Configure your Mailgun credentials in `mailu-mailgun-credentials-secret.yaml` before deployment. +> 6. **Mailgun Relay:** Configure credentials in `mailu-mailgun-credentials-secret.yaml` before deployment. ### Step 7.3.1: Mailu Configuration Notes -> **Important Information about Mailu Certificates:** +> **Certificate Architecture:** > -> 1. **Dual Certificate Architecture:** -> - **Internal Communication:** Uses self-signed certificates (`mailu-certificates-secret.yaml`) -> - **External Communication:** Uses Let's Encrypt certificates via NGINX Ingress (`bakery-ia-prod-tls-cert`) +> ``` +> External Client → NGINX Ingress (Let's Encrypt) → Internal Network → Mailu Services (Self-signed) +> ``` > -> 2. **No Certificate Replacement Needed:** The self-signed certificates are only used for internal communication between Mailu services. External clients connect through the NGINX Ingress Controller which uses the publicly trusted Let's Encrypt certificates. -> -> 3. **Certificate Flow:** -> ``` -> External Client → NGINX Ingress (Let's Encrypt) → Internal Network → Mailu Services (Self-signed) -> ``` -> -> 4. **Security:** This architecture is secure because: -> - External connections use publicly trusted certificates -> - Internal connections are still encrypted (even if self-signed) -> - Ingress terminates TLS, reducing load on Mailu services -> -> 5. 
**Mailgun Relay Configuration:** For outbound email delivery, configure your Mailgun credentials: -> ```bash -> # Edit the Mailgun credentials secret -> nano infrastructure/platform/mail/mailu-helm/configs/mailu-mailgun-credentials-secret.yaml -> -> # Apply the secret -> kubectl apply -f infrastructure/platform/mail/mailu-helm/configs/mailu-mailgun-credentials-secret.yaml -n bakery-ia -> -> # Restart Mailu to pick up the new relay configuration -> kubectl rollout restart deployment -n bakery-ia -l app.kubernetes.io/instance=mailu -> ``` +> - **External:** Uses publicly trusted Let's Encrypt certificates via NGINX Ingress +> - **Internal:** Uses self-signed certificates for inter-service communication +> - **No replacement needed:** This dual-certificate architecture is intentional and secure + +**Configure Mailgun Relay (for outbound email):** + +```bash +# Edit the Mailgun credentials secret +nano infrastructure/platform/mail/mailu-helm/configs/mailu-mailgun-credentials-secret.yaml + +# Apply the secret +kubectl apply -f infrastructure/platform/mail/mailu-helm/configs/mailu-mailgun-credentials-secret.yaml -n bakery-ia + +# Restart Mailu to pick up the new relay configuration +kubectl rollout restart deployment -n bakery-ia -l app.kubernetes.io/instance=mailu +``` ### Step 7.4: Deploy SigNoz Monitoring diff --git a/infrastructure/environments/prod/k8s-manifests/kustomization.yaml b/infrastructure/environments/prod/k8s-manifests/kustomization.yaml index 66279a60..09fa03d7 100644 --- a/infrastructure/environments/prod/k8s-manifests/kustomization.yaml +++ b/infrastructure/environments/prod/k8s-manifests/kustomization.yaml @@ -204,6 +204,115 @@ patches: memory: "1Gi" cpu: "500m" + # ============================================================================= + # CPU Request Optimization for Production + # Reduce CPU requests to match actual usage (was 100m, actual ~5-10m) + # This prevents scheduler rejections due to overcommitted requests + # 
============================================================================= + + # Database deployments - reduce CPU request from 100m to 25m + - target: + group: apps + version: v1 + kind: Deployment + name: ".*-db$" + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "25m" + + # Microservice deployments - reduce CPU request from 100m to 25m + - target: + group: apps + version: v1 + kind: Deployment + name: ".*-service$" + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "25m" + + # Other core services + - target: + group: apps + version: v1 + kind: Deployment + name: gateway + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "25m" + + - target: + group: apps + version: v1 + kind: Deployment + name: alert-processor + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "25m" + + - target: + group: apps + version: v1 + kind: Deployment + name: frontend + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "50m" + + - target: + group: apps + version: v1 + kind: Deployment + name: redis + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "25m" + + - target: + group: apps + version: v1 + kind: Deployment + name: rabbitmq + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "50m" + + - target: + group: apps + version: v1 + kind: Deployment + name: minio + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "50m" + + # Migration jobs - reduce CPU request from 100m to 25m + - target: + group: batch + version: 
v1 + kind: Job + name: ".*-migration$" + namespace: bakery-ia + patch: |- + - op: replace + path: /spec/template/spec/containers/0/resources/requests/cpu + value: "25m" + images: # Application services - name: bakery/auth-service diff --git a/infrastructure/platform/mail/mailu-helm/dev/values.yaml b/infrastructure/platform/mail/mailu-helm/dev/values.yaml index 955e8349..b1e690cc 100644 --- a/infrastructure/platform/mail/mailu-helm/dev/values.yaml +++ b/infrastructure/platform/mail/mailu-helm/dev/values.yaml @@ -1,9 +1,11 @@ # Development-tuned Mailu configuration global: # Using Unbound DNS for DNSSEC validation (required by Mailu admin) - # Unbound service is available at unbound-dns.bakery-ia.svc.cluster.local - # Static ClusterIP configured in unbound-helm/values.yaml - custom_dns_servers: "10.96.53.53" # Unbound DNS static ClusterIP + # This value is dynamically set via --set during helm install: + # UNBOUND_IP=$(kubectl get svc unbound-dns -n bakery-ia -o jsonpath='{.spec.clusterIP}') + # helm upgrade --install mailu ... 
--set global.custom_dns_servers="$UNBOUND_IP" + # Placeholder pointing at cluster DNS, which cannot satisfy the DNSSEC requirement - always override via --set + custom_dns_servers: "10.96.0.10" # Override with Unbound IP via --set # Redis configuration - use built-in Mailu Redis (no authentication needed) externalRedis: @@ -11,11 +13,12 @@ externalRedis: # Component-specific DNS configuration # Admin requires DNSSEC validation - use Unbound DNS (forwards cluster.local to kube-dns) +# NOTE: dnsConfig.nameservers is dynamically set via --set during helm install admin: dnsPolicy: "None" dnsConfig: nameservers: - - "10.96.53.53" # Unbound DNS static ClusterIP (forwards cluster.local to kube-dns) + - "10.96.0.10" # Override with Unbound IP via --set admin.dnsConfig.nameservers[0] searches: - "bakery-ia.svc.cluster.local" - "svc.cluster.local" diff --git a/infrastructure/platform/networking/dns/unbound-helm/prod/values.yaml b/infrastructure/platform/networking/dns/unbound-helm/prod/values.yaml index b5d78041..ee065faa 100644 --- a/infrastructure/platform/networking/dns/unbound-helm/prod/values.yaml +++ b/infrastructure/platform/networking/dns/unbound-helm/prod/values.yaml @@ -1,5 +1,18 @@ # Production-specific values for unbound DNS resolver # Overrides for the production environment +# +# ARCHITECTURE NOTE: +# Unbound provides DNSSEC validation required by Mailu (rspamd for DKIM/SPF/DMARC). +# CoreDNS does NOT support DNSSEC, so we need Unbound as a dedicated resolver. +# +# Two deployment options: +# 1. Mailu-only: Only Mailu pods use Unbound (via dnsPolicy: None) +# - CoreDNS forwards to public DNS (8.8.8.8, 1.1.1.1) +# - Lower resource usage, simpler architecture +# +# 2. 
Cluster-wide: CoreDNS forwards ALL external queries to Unbound +# - All pods get DNSSEC validation +# - Higher resource usage, single point of failure for DNS # Use official image for production image: @@ -7,44 +20,47 @@ image: tag: "latest" pullPolicy: "IfNotPresent" -# Production resource settings (higher limits for reliability) +# Production resource settings - MINIMAL for single-node clusters +# Unbound is very lightweight - DNS queries use minimal CPU resources: requests: + cpu: "50m" + memory: "64Mi" + limits: cpu: "200m" memory: "256Mi" - limits: - cpu: "500m" - memory: "512Mi" -# Production-specific settings -replicaCount: 2 +# Single replica for single-node clusters (saves resources) +# Increase to 2 for multi-node HA deployments +replicaCount: 1 # Production annotations podAnnotations: environment: "production" critical: "true" -# Anti-affinity for high availability in production -affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - unbound - topologyKey: "kubernetes.io/hostname" +# Anti-affinity disabled for single-node clusters +# Uncomment for multi-node HA deployments +# affinity: +# podAntiAffinity: +# preferredDuringSchedulingIgnoredDuringExecution: +# - weight: 100 +# podAffinityTerm: +# labelSelector: +# matchExpressions: +# - key: app.kubernetes.io/name +# operator: In +# values: +# - unbound +# topologyKey: "kubernetes.io/hostname" # Production probe settings (more conservative) probes: readiness: - initialDelaySeconds: 20 + initialDelaySeconds: 10 periodSeconds: 30 command: "sh -c 'echo \"\" | nc -w 3 127.0.0.1 53 || exit 1'" liveness: - initialDelaySeconds: 60 + initialDelaySeconds: 30 periodSeconds: 60 command: "sh -c 'echo \"\" | nc -w 3 127.0.0.1 53 || exit 1'" \ No newline at end of file diff --git a/infrastructure/platform/networking/dns/unbound-helm/values.yaml 
b/infrastructure/platform/networking/dns/unbound-helm/values.yaml index 6b9855a8..b11d4619 100644 --- a/infrastructure/platform/networking/dns/unbound-helm/values.yaml +++ b/infrastructure/platform/networking/dns/unbound-helm/values.yaml @@ -1,6 +1,10 @@ # Default values for unbound DNS resolver # This is a YAML-formatted file. # Declare variables to be passed into your templates. +# +# PURPOSE: Provides DNSSEC validation for Mailu email server +# CoreDNS does NOT perform DNSSEC validation, so Unbound fills this gap. +# Mailu's rspamd requires DNSSEC for DKIM/SPF/DMARC validation. # Global settings global: @@ -18,13 +22,14 @@ image: replicaCount: 1 # Resource limits and requests +# Unbound is very lightweight - these minimal resources are sufficient resources: requests: + cpu: "25m" + memory: "32Mi" + limits: cpu: "100m" memory: "128Mi" - limits: - cpu: "300m" - memory: "384Mi" # Security context securityContext: