Add new infra architecture

Urtzi Alfaro
2026-01-19 11:55:17 +01:00
parent 21d35ea92b
commit 35f164f0cd
311 changed files with 13241 additions and 3700 deletions


@@ -1,5 +1,6 @@
#!/bin/bash
# Improved Kubernetes restart script with better error handling and resource management
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -7,6 +8,18 @@ YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration variables
COLIMA_PROFILE="k8s-local"
KIND_CLUSTER="bakery-ia-local"
REGISTRY_NAME="kind-registry"
REGISTRY_PORT="5000"
NAMESPACE="bakery-ia"
# Resource configuration (adjustable)
COLIMA_CPU=12
COLIMA_MEMORY=24
COLIMA_DISK=120
# Function to print colored output
print_status() {
echo -e "${BLUE}[INFO]${NC} $1"
@@ -24,6 +37,15 @@ print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
# Function to check command availability
check_command() {
if ! command -v "$1" &> /dev/null; then
print_error "Required command '$1' not found. Please install it first."
return 1
fi
return 0
}
# Function to wait for pods with retry logic
wait_for_pods() {
local namespace=$1
@@ -56,53 +78,97 @@ wait_for_pods() {
return 1
}
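Only the tail of wait_for_pods appears in this hunk; its signature is a namespace, a label selector, and a timeout in seconds, which is how setup() calls it further down, e.g.:
wait_for_pods "ingress-nginx" "app.kubernetes.io/component=controller" 300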
# Function to check if Colima is running
is_colima_running() {
colima list | grep -q "$COLIMA_PROFILE" && colima status --profile "$COLIMA_PROFILE" | grep -q "Running"
}
# Function to check if Kind cluster exists
is_kind_cluster_running() {
kind get clusters | grep -q "$KIND_CLUSTER"
}
# Function to check if registry is running
is_registry_running() {
docker inspect -f '{{.State.Running}}' "$REGISTRY_NAME" 2>/dev/null | grep -q "true"
}
# Function to handle cleanup with better error handling
cleanup() {
print_status "Starting cleanup process..."
# Delete Kubernetes namespace with timeout
print_status "Deleting namespace bakery-ia..."
if kubectl get namespace bakery-ia &>/dev/null; then
kubectl delete namespace bakery-ia 2>/dev/null &
PID=$!
sleep 2
if ps -p $PID &>/dev/null; then
print_warning "kubectl delete namespace command taking too long, forcing termination..."
kill $PID 2>/dev/null
print_status "Deleting namespace $NAMESPACE..."
if kubectl get namespace "$NAMESPACE" &>/dev/null; then
print_status "Found namespace $NAMESPACE, attempting to delete..."
# Try graceful deletion first
kubectl delete namespace "$NAMESPACE" --wait=false 2>/dev/null
# Wait a bit for deletion to start
sleep 5
# Check if namespace is still terminating
if kubectl get namespace "$NAMESPACE" --no-headers 2>/dev/null | grep -q "Terminating"; then
print_warning "Namespace $NAMESPACE is stuck in Terminating state"
print_status "Attempting to force delete..."
# Get the namespace JSON and remove finalizers
kubectl get namespace "$NAMESPACE" -o json > /tmp/namespace.json 2>/dev/null
if [ $? -eq 0 ]; then
# Remove finalizers
jq 'del(.spec.finalizers)' /tmp/namespace.json > /tmp/namespace-fixed.json 2>/dev/null
if [ $? -eq 0 ]; then
kubectl replace --raw "/api/v1/namespaces/$NAMESPACE/finalize" -f /tmp/namespace-fixed.json 2>/dev/null
print_success "Namespace $NAMESPACE force deleted"
else
print_error "Failed to remove finalizers from namespace $NAMESPACE"
fi
rm -f /tmp/namespace.json /tmp/namespace-fixed.json
fi
else
print_success "Namespace $NAMESPACE deletion initiated"
fi
print_success "Namespace deletion attempted"
else
print_status "Namespace bakery-ia not found"
print_status "Namespace $NAMESPACE not found"
fi
# Delete Kind cluster
print_status "Deleting Kind cluster..."
if kind get clusters | grep -q "bakery-ia-local"; then
kind delete cluster --name bakery-ia-local
print_success "Kind cluster deleted"
print_status "Deleting Kind cluster $KIND_CLUSTER..."
if is_kind_cluster_running; then
kind delete cluster --name "$KIND_CLUSTER"
if [ $? -eq 0 ]; then
print_success "Kind cluster $KIND_CLUSTER deleted"
else
print_error "Failed to delete Kind cluster $KIND_CLUSTER"
fi
else
print_status "Kind cluster bakery-ia-local not found"
print_status "Kind cluster $KIND_CLUSTER not found"
fi
# Stop local registry
print_status "Stopping local registry..."
if docker ps -a | grep -q "kind-registry"; then
docker stop kind-registry 2>/dev/null || true
docker rm kind-registry 2>/dev/null || true
print_success "Local registry removed"
print_status "Stopping local registry $REGISTRY_NAME..."
if is_registry_running; then
docker stop "$REGISTRY_NAME" 2>/dev/null || true
docker rm "$REGISTRY_NAME" 2>/dev/null || true
print_success "Local registry $REGISTRY_NAME removed"
else
print_status "Local registry not found"
print_status "Local registry $REGISTRY_NAME not found"
fi
# Stop Colima
print_status "Stopping Colima..."
if colima list | grep -q "k8s-local"; then
colima stop --profile k8s-local
print_success "Colima stopped"
print_status "Stopping Colima profile $COLIMA_PROFILE..."
if is_colima_running; then
colima stop --profile "$COLIMA_PROFILE"
if [ $? -eq 0 ]; then
print_success "Colima profile $COLIMA_PROFILE stopped"
else
print_error "Failed to stop Colima profile $COLIMA_PROFILE"
fi
else
print_status "Colima profile k8s-local not found"
print_status "Colima profile $COLIMA_PROFILE not found or not running"
fi
print_success "Cleanup completed!"
echo "----------------------------------------"
}
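The stuck-namespace branch above is the standard finalizer-removal workaround. The same technique as a standalone one-liner (assuming kubectl and jq are available and $NAMESPACE is set) would look like:
# Strip finalizers and call the namespace's finalize subresource directly
kubectl get namespace "$NAMESPACE" -o json \
| jq 'del(.spec.finalizers)' \
| kubectl replace --raw "/api/v1/namespaces/$NAMESPACE/finalize" -f -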
@@ -119,57 +185,126 @@ check_config_files() {
fi
# Check for encryption directory if referenced in config
if grep -q "infrastructure/kubernetes/encryption" kind-config.yaml; then
if [ ! -d "./infrastructure/kubernetes/encryption" ]; then
print_warning "Encryption directory './infrastructure/kubernetes/encryption' not found"
print_warning "Some encryption configurations may not work properly"
if grep -q "infrastructure/platform/security/encryption" kind-config.yaml; then
if [ ! -d "./infrastructure/platform/security/encryption" ]; then
print_error "Encryption directory './infrastructure/platform/security/encryption' not found"
print_error "This directory is required for Kubernetes secrets encryption"
print_status "Attempting to create encryption configuration..."
# Create the directory
mkdir -p "./infrastructure/platform/security/encryption"
# Generate a new encryption key
ENCRYPTION_KEY=$(openssl rand -base64 32)
# Create the encryption configuration file
cat > "./infrastructure/platform/security/encryption/encryption-config.yaml" <<EOF
# Kubernetes Secrets Encryption Configuration
apiVersion: apiserver.config.k8s.io/v1
kind: EncryptionConfiguration
resources:
- resources:
- secrets
providers:
- aescbc:
keys:
- name: key1
secret: ${ENCRYPTION_KEY}
- identity: {}
EOF
if [ $? -eq 0 ]; then
print_success "Created encryption configuration with new key"
print_warning "Please protect this encryption key - it's used to encrypt all Kubernetes secrets"
else
print_error "Failed to create encryption configuration"
exit 1
fi
else
# Check if encryption config file exists
if [ ! -f "./infrastructure/platform/security/encryption/encryption-config.yaml" ]; then
print_error "Encryption directory exists but encryption-config.yaml is missing"
print_error "Please ensure the encryption configuration file is present"
exit 1
fi
print_success "Encryption configuration found"
fi
fi
print_success "Configuration files check completed"
}
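check_config_files can generate encryption-config.yaml, but the key only takes effect if kind-config.yaml mounts that file into the control-plane node and points the API server at it. A minimal sketch of what that wiring might look like (illustrative paths and patch layout; the repository's actual kind-config.yaml may differ):
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
  extraMounts:
  - hostPath: ./infrastructure/platform/security/encryption
    containerPath: /etc/kubernetes/encryption
  kubeadmConfigPatches:
  - |
    kind: ClusterConfiguration
    apiServer:
      extraArgs:
        encryption-provider-config: /etc/kubernetes/encryption/encryption-config.yaml
      extraVolumes:
      - name: encryption-config
        hostPath: /etc/kubernetes/encryption
        mountPath: /etc/kubernetes/encryption
        readOnly: true
        pathType: DirectoryOrCreate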
# Function to create local registry with better error handling
create_local_registry() {
print_status "Setting up local Docker registry..."
# Create registry container unless it already exists
if ! is_registry_running; then
print_status "Creating registry container on port ${REGISTRY_PORT}..."
# Check if container exists but is stopped
if docker ps -a | grep -q "$REGISTRY_NAME"; then
print_status "Registry container exists but is stopped, removing it..."
docker rm "$REGISTRY_NAME" 2>/dev/null || true
fi
docker run \
-d --restart=always \
-p "127.0.0.1:${reg_port}:5000" \
--name "${reg_name}" \
-p "127.0.0.1:${REGISTRY_PORT}:5000" \
--name "${REGISTRY_NAME}" \
registry:2
if [ $? -eq 0 ]; then
print_success "Local registry created at localhost:${reg_port}"
print_success "Local registry created at localhost:${REGISTRY_PORT}"
else
print_error "Failed to create local registry"
print_status "Attempting to pull registry image..."
docker pull registry:2
if [ $? -eq 0 ]; then
print_status "Registry image pulled, trying to create container again..."
docker run \
-d --restart=always \
-p "127.0.0.1:${REGISTRY_PORT}:5000" \
--name "${REGISTRY_NAME}" \
registry:2
if [ $? -eq 0 ]; then
print_success "Local registry created at localhost:${REGISTRY_PORT}"
else
print_error "Failed to create local registry after pulling image"
exit 1
fi
else
print_error "Failed to pull registry image"
exit 1
fi
fi
else
print_success "Local registry already running at localhost:${reg_port}"
print_success "Local registry already running at localhost:${REGISTRY_PORT}"
fi
# Store registry info for later use
echo "${reg_name}:${reg_port}"
echo "${REGISTRY_NAME}:${REGISTRY_PORT}"
}
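As a quick manual check (not something the script runs), the registry's HTTP API can be queried from the host once the container is up:
# Should return {"repositories":[]} on a fresh registry
curl -s "http://localhost:${REGISTRY_PORT}/v2/_catalog"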
# Function to connect registry to Kind with better error handling
connect_registry_to_kind() {
print_status "Connecting registry to Kind network..."
# Check if Kind network exists
if ! docker network ls | grep -q "kind"; then
print_error "Kind network not found. Please create Kind cluster first."
return 1
fi
# Connect the registry to the cluster network if not already connected
if [ "$(docker inspect -f='{{json .NetworkSettings.Networks.kind}}' "${reg_name}")" = 'null' ]; then
docker network connect "kind" "${reg_name}"
print_success "Registry connected to Kind network"
if [ "$(docker inspect -f='{{json .NetworkSettings.Networks.kind}}' "${REGISTRY_NAME}")" = 'null' ]; then
docker network connect "kind" "${REGISTRY_NAME}"
if [ $? -eq 0 ]; then
print_success "Registry connected to Kind network"
else
print_error "Failed to connect registry to Kind network"
return 1
fi
else
print_success "Registry already connected to Kind network"
fi
@@ -177,22 +312,60 @@ connect_registry_to_kind() {
# Configure containerd in the Kind node to use the registry
print_status "Configuring containerd to use local registry..."
# Check if control plane container exists
if ! docker ps | grep -q "${KIND_CLUSTER}-control-plane"; then
print_error "Control plane container not found. Kind cluster may not be running."
return 1
fi
# Create registry config directories for all registry names
print_status "Creating registry configuration directories..."
docker exec "${KIND_CLUSTER}-control-plane" mkdir -p "/etc/containerd/certs.d/localhost:${REGISTRY_PORT}"
docker exec "${KIND_CLUSTER}-control-plane" mkdir -p "/etc/containerd/certs.d/localhost:5001"
docker exec "${KIND_CLUSTER}-control-plane" mkdir -p "/etc/containerd/certs.d/${REGISTRY_NAME}:5000"
[host.\"http://${reg_name}:5000\"]
# Add registry configuration for localhost:5000
docker exec "${KIND_CLUSTER}-control-plane" sh -c "cat > /etc/containerd/certs.d/localhost:${REGISTRY_PORT}/hosts.toml <<EOF
server = \"http://localhost:${REGISTRY_PORT}\"
[host.\"http://${REGISTRY_NAME}:5000\"]
capabilities = [\"pull\", \"resolve\", \"push\"]
skip_verify = true
EOF"
# Add registry configuration for localhost:5001 (Tilt uses this)
docker exec "${KIND_CLUSTER}-control-plane" sh -c "cat > /etc/containerd/certs.d/localhost:5001/hosts.toml <<EOF
server = \"http://localhost:5001\"
print_success "Containerd configured for local registry"
[host.\"http://${REGISTRY_NAME}:5000\"]
capabilities = [\"pull\", \"resolve\", \"push\"]
skip_verify = true
EOF"
# Add registry configuration for kind-registry:5000 (used by migration jobs)
docker exec "${KIND_CLUSTER}-control-plane" sh -c "cat > /etc/containerd/certs.d/${REGISTRY_NAME}:5000/hosts.toml <<EOF
server = \"http://${REGISTRY_NAME}:5000\"
[host.\"http://${REGISTRY_NAME}:5000\"]
capabilities = [\"pull\", \"resolve\", \"push\"]
skip_verify = true
EOF"
print_success "Registry host configurations created"
# Restart containerd to pick up new configuration
print_status "Restarting containerd..."
docker exec "${KIND_CLUSTER}-control-plane" systemctl restart containerd
if [ $? -eq 0 ]; then
print_success "Containerd configured for local registry"
else
print_error "Failed to restart containerd"
return 1
fi
# Wait for containerd to be ready
sleep 3
# Document the local registry
print_status "Documenting local registry in cluster..."
@@ -204,7 +377,9 @@ metadata:
namespace: kube-public
data:
localRegistryHosting.v1: |
host: "localhost:${reg_port}"
host: "localhost:${REGISTRY_PORT}"
hostFromContainerRuntime: "${REGISTRY_NAME}:5000"
hostFromClusterNetwork: "${REGISTRY_NAME}:5000"
help: "https://kind.sigs.k8s.io/docs/user/local-registry/"
EOF
@@ -215,75 +390,82 @@ EOF
fi
}
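Two manual checks are useful after this function runs (illustrative commands, assuming the variables defined at the top of the script): confirm the node actually received a mirror config, and confirm the advisory ConfigMap (named local-registry-hosting in the upstream kind recipe) is in place:
docker exec "${KIND_CLUSTER}-control-plane" cat "/etc/containerd/certs.d/localhost:${REGISTRY_PORT}/hosts.toml"
kubectl get configmap local-registry-hosting -n kube-public -o yaml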
# Function to handle setup with better error handling
setup() {
print_status "Starting setup process..."
# Check for required config files
check_config_files
# 1. Start Colima with adequate resources
print_status "Starting Colima with ${COLIMA_CPU} CPU, ${COLIMA_MEMORY}GB memory, ${COLIMA_DISK}GB disk..."
if ! is_colima_running; then
colima start --cpu "$COLIMA_CPU" --memory "$COLIMA_MEMORY" --disk "$COLIMA_DISK" --runtime docker --profile "$COLIMA_PROFILE"
if [ $? -eq 0 ]; then
# Increase inotify limits for Colima to prevent "too many open files" errors
print_status "Increasing inotify limits in Colima VM..."
colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_watches=524288"
colima ssh --profile k8s-local "sudo sysctl -w fs.inotify.max_user_instances=512"
print_success "Inotify limits increased"
print_success "Colima started successfully"
else
print_error "Failed to start Colima"
print_status "Checking Colima status..."
colima status --profile "$COLIMA_PROFILE"
exit 1
fi
else
print_success "Colima is already running"
fi
# 2. Create local registry before Kind cluster
create_local_registry
# 3. Create Kind cluster using existing configuration with registry support
print_status "Creating Kind cluster with registry configuration..."
if ! is_kind_cluster_running; then
if [ -f kind-config.yaml ]; then
print_status "Using kind-config.yaml with local registry support"
# Extract cluster name from config for verification
CLUSTER_NAME=$(grep -E "name:\s*" kind-config.yaml | head -1 | sed 's/name:\s*//' | tr -d '[:space:]' || echo "$KIND_CLUSTER")
print_status "Creating cluster: $CLUSTER_NAME"
kind create cluster --config kind-config.yaml
if [ $? -eq 0 ]; then
print_success "Kind cluster created successfully"
else
print_error "Failed to create Kind cluster"
print_status "Checking for existing clusters..."
kind get clusters
exit 1
fi
else
print_error "kind-config.yaml file not found!"
exit 1
fi
else
print_success "Kind cluster $KIND_CLUSTER is already running"
fi
# 4. Connect registry to Kind network
connect_registry_to_kind
# 5. Install NGINX Ingress Controller
print_status "Installing NGINX Ingress Controller..."
# Apply the ingress-nginx manifest
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
if [ $? -eq 0 ]; then
print_success "NGINX Ingress Controller manifest applied"
else
print_error "Failed to apply NGINX Ingress Controller manifest"
exit 1
fi
# Wait for ingress-nginx pods to be ready with retry logic
wait_for_pods "ingress-nginx" "app.kubernetes.io/component=controller" 300
if [ $? -ne 0 ]; then
print_error "NGINX Ingress Controller failed to become ready"
print_status "Checking pod status for debugging..."
@@ -291,10 +473,26 @@ setup() {
kubectl describe pods -n ingress-nginx
exit 1
fi
print_success "NGINX Ingress Controller ready (using Kind's built-in NodePort configuration)"
# 6. Install cert-manager
print_status "Installing cert-manager..."
# Check if cert-manager is already installed
if kubectl get namespace cert-manager &>/dev/null; then
print_status "cert-manager namespace already exists, checking if it's properly installed..."
if kubectl get deployment -n cert-manager cert-manager-webhook &>/dev/null; then
print_success "cert-manager is already installed and running"
else
print_status "cert-manager namespace exists but components are not running, reinstalling..."
install_cert_manager
fi
else
install_cert_manager
fi
# 7. Verify port mappings from kind-config.yaml
print_status "Verifying port mappings from configuration..."
# Extract ports from kind-config.yaml
@@ -306,9 +504,9 @@ setup() {
print_success "Setup completed successfully!"
echo "----------------------------------------"
print_status "Cluster Information:"
echo " - Colima profile: k8s-local"
echo " - Colima profile: $COLIMA_PROFILE"
echo " - Kind cluster: $CLUSTER_NAME"
echo " - Local registry: localhost:5001"
echo " - Local registry: localhost:${REGISTRY_PORT}"
echo ""
print_status "Port Mappings (configured in kind-config.yaml):"
echo " - HTTP Ingress: localhost:${HTTP_HOST_PORT} -> Kind NodePort 30080"
@@ -324,9 +522,9 @@ setup() {
echo " - Tilt UI: http://localhost:10350"
echo "----------------------------------------"
print_status "Local Registry Information:"
echo " - Registry URL: localhost:5001"
echo " - Images pushed to: localhost:5001/bakery/<service>"
echo " - Tiltfile already configured: default_registry('localhost:5001')"
echo " - Registry URL: localhost:${REGISTRY_PORT}"
echo " - Images pushed to: localhost:${REGISTRY_PORT}/bakery/<service>"
echo " - Tiltfile already configured: default_registry('localhost:${REGISTRY_PORT}')"
echo "----------------------------------------"
}
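To make the dual naming concrete (the image name here is illustrative, not from the repo): images are pushed from the host through the localhost port, while in-cluster manifests reference the registry by its container name, which the hosts.toml entries written earlier resolve:
docker build -t "localhost:${REGISTRY_PORT}/bakery/example-service:dev" .
docker push "localhost:${REGISTRY_PORT}/bakery/example-service:dev"
# In-cluster manifests then use: image: kind-registry:5000/bakery/example-service:dev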
@@ -342,28 +540,117 @@ usage() {
echo ""
echo "Requirements:"
echo " - kind-config.yaml must exist in current directory"
echo " - For encryption: ./infrastructure/kubernetes/encryption directory"
echo " - For encryption: ./infrastructure/platform/security/encryption directory"
echo " - Docker, Colima, Kind, kubectl must be installed"
}
# Function to install cert-manager
install_cert_manager() {
print_status "Installing cert-manager..."
# Create cert-manager namespace
kubectl create namespace cert-manager --dry-run=client -o yaml | kubectl apply -f -
# Install cert-manager CRDs and components
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
if [ $? -eq 0 ]; then
print_success "cert-manager manifests applied"
else
print_error "Failed to apply cert-manager manifests"
exit 1
fi
# Wait for cert-manager pods to be ready with retry logic
print_status "Waiting for cert-manager pods to be ready..."
local max_retries=30
local retry_count=0
while [ $retry_count -lt $max_retries ]; do
# Check if all cert-manager pods are ready
local ready_pods=$(kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -c "Running" || echo "0")
local total_pods=$(kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -v "NAME" | wc -l)
total_pods=$(echo "$total_pods" | tr -d ' ')
if [ "$ready_pods" -eq "$total_pods" ] && [ "$total_pods" -gt 0 ]; then
# Double-check that all pods are actually ready (1/1, 2/2, etc.)
local all_ready=true
while IFS= read -r line; do
local ready_status=$(echo "$line" | awk '{print $2}')
local desired_ready=($(echo "$ready_status" | tr '/' ' '))
if [ "${desired_ready[0]}" -ne "${desired_ready[1]}" ]; then
all_ready=false
break
fi
done <<< "$(kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -v "NAME")"
if [ "$all_ready" = true ]; then
print_success "cert-manager is ready with $ready_pods/$total_pods pods running"
return 0
fi
fi
retry_count=$((retry_count + 1))
print_status "Waiting for cert-manager pods to be ready... (attempt $retry_count/$max_retries)"
sleep 10
done
print_error "Timed out waiting for cert-manager pods after $((max_retries * 10)) seconds"
print_status "Checking cert-manager pod status for debugging..."
kubectl get pods -n cert-manager
kubectl describe pods -n cert-manager
exit 1
}
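install_cert_manager only deploys the controllers; certificates also need an issuer, which the new manifests presumably define elsewhere in this commit. For a purely local setup, a minimal self-signed ClusterIssuer sketch (the name is illustrative) would be applied like this:
kubectl apply -f - <<EOF
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: selfsigned-issuer
spec:
  selfSigned: {}
EOF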
# Function to check prerequisites
check_prerequisites() {
print_status "Checking prerequisites..."
local missing_commands=()
for cmd in docker colima kind kubectl jq; do
if ! check_command "$cmd"; then
missing_commands+=("$cmd")
fi
done
if [ ${#missing_commands[@]} -gt 0 ]; then
print_error "Missing required commands: ${missing_commands[*]}"
print_error "Please install them before running this script."
exit 1
fi
print_success "All prerequisites are met"
}
# Main script logic
main() {
# Check prerequisites first
check_prerequisites
case "${1:-full}" in
"cleanup")
cleanup
;;
"setup")
setup
;;
"full")
cleanup
setup
;;
"help"|"-h"|"--help")
usage
;;
*)
print_warning "Unknown option: $1"
echo ""
usage
exit 1
;;
esac
}
# Run main function
main "$@"