#!/bin/bash
# Improved Kubernetes restart script with better error handling and resource management

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration variables
COLIMA_PROFILE="k8s-local"
KIND_CLUSTER="bakery-ia-local"
REGISTRY_NAME="kind-registry"
REGISTRY_PORT="5000"
NAMESPACE="bakery-ia"

# Resource configuration (adjustable)
COLIMA_CPU=12
COLIMA_MEMORY=24
COLIMA_DISK=120

# Functions to print colored output
print_status() {
    echo -e "${BLUE}[INFO]${NC} $1"
}

print_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}

print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Function to check command availability
check_command() {
    if ! command -v "$1" &> /dev/null; then
        print_error "Required command '$1' not found. Please install it first."
        return 1
    fi
    return 0
}

# Function to wait for pods with retry logic
wait_for_pods() {
    local namespace=$1
    local selector=$2
    local timeout=$3
    local max_retries=30
    local retry_count=0

    print_status "Waiting for pods with selector '$selector' in namespace '$namespace'..."

    while [ $retry_count -lt $max_retries ]; do
        # Check if any pods exist first
        if kubectl get pods -n "$namespace" --selector="$selector" 2>/dev/null | grep -v "No resources found" | grep -v "NAME" > /dev/null; then
            # Pods exist, now wait for them to be ready
            if kubectl wait --namespace "$namespace" \
                --for=condition=ready pod \
                --selector="$selector" \
                --timeout="${timeout}s" 2>/dev/null; then
                print_success "Pods are ready"
                return 0
            fi
        fi

        retry_count=$((retry_count + 1))
        print_status "Waiting for pods to be created... (attempt $retry_count/$max_retries)"
        sleep 5
    done

    print_error "Timed out waiting for pods after $((max_retries * 5)) seconds"
    return 1
}

# Function to check if Colima is running
is_colima_running() {
    colima list | grep -q "$COLIMA_PROFILE" && colima status --profile "$COLIMA_PROFILE" | grep -q "Running"
}

# Function to check if Kind cluster exists
is_kind_cluster_running() {
    kind get clusters | grep -q "$KIND_CLUSTER"
}

# Function to check if registry is running
is_registry_running() {
    docker inspect -f '{{.State.Running}}' "$REGISTRY_NAME" 2>/dev/null | grep -q "true"
}

# Function to handle cleanup with better error handling
cleanup() {
    print_status "Starting cleanup process..."

    # Ensure correct Docker context is used
    ensure_docker_context

    # Delete Kubernetes namespace with timeout
    print_status "Deleting namespace $NAMESPACE..."
    if kubectl get namespace "$NAMESPACE" &>/dev/null; then
        print_status "Found namespace $NAMESPACE, attempting to delete..."

        # Try graceful deletion first
        kubectl delete namespace "$NAMESPACE" --wait=false 2>/dev/null

        # Wait a bit for deletion to start
        sleep 5

        # Check if namespace is still terminating
        if kubectl get namespace "$NAMESPACE" --no-headers 2>/dev/null | grep -q "Terminating"; then
            print_warning "Namespace $NAMESPACE is stuck in Terminating state"
            print_status "Attempting to force delete..."

            # Get the namespace JSON and remove finalizers
            kubectl get namespace "$NAMESPACE" -o json > /tmp/namespace.json 2>/dev/null
            if [ $? -eq 0 ]; then
                # Remove finalizers
                jq 'del(.spec.finalizers)' /tmp/namespace.json > /tmp/namespace-fixed.json 2>/dev/null
                if [ $? -eq 0 ]; then
                    kubectl replace --raw "/api/v1/namespaces/$NAMESPACE/finalize" -f /tmp/namespace-fixed.json 2>/dev/null
                    print_success "Namespace $NAMESPACE force deleted"
                else
                    print_error "Failed to remove finalizers from namespace $NAMESPACE"
                fi
                rm -f /tmp/namespace.json /tmp/namespace-fixed.json
            fi
        else
            print_success "Namespace $NAMESPACE deletion initiated"
        fi
    else
        print_status "Namespace $NAMESPACE not found"
    fi

    # Delete Kind cluster
    print_status "Deleting Kind cluster $KIND_CLUSTER..."
    if is_kind_cluster_running; then
        kind delete cluster --name "$KIND_CLUSTER"
        if [ $? -eq 0 ]; then
            print_success "Kind cluster $KIND_CLUSTER deleted"
        else
            print_error "Failed to delete Kind cluster $KIND_CLUSTER"
        fi
    else
        print_status "Kind cluster $KIND_CLUSTER not found"
    fi

    # Stop local registry
    print_status "Stopping local registry $REGISTRY_NAME..."
    if is_registry_running; then
        docker stop "$REGISTRY_NAME" 2>/dev/null || true
        docker rm "$REGISTRY_NAME" 2>/dev/null || true
        print_success "Local registry $REGISTRY_NAME removed"
    else
        print_status "Local registry $REGISTRY_NAME not found"
    fi

    # Stop Colima
    print_status "Stopping Colima profile $COLIMA_PROFILE..."
    if is_colima_running; then
        colima stop --profile "$COLIMA_PROFILE"
        if [ $? -eq 0 ]; then
            print_success "Colima profile $COLIMA_PROFILE stopped"
        else
            print_error "Failed to stop Colima profile $COLIMA_PROFILE"
        fi
    else
        print_status "Colima profile $COLIMA_PROFILE not found or not running"
    fi

    print_success "Cleanup completed!"
    echo "----------------------------------------"
}

# Function to check for required configuration files
check_config_files() {
    print_status "Checking for required configuration files..."

    # Check for kind-config.yaml
    if [ ! -f kind-config.yaml ]; then
        print_error "kind-config.yaml not found in current directory!"
        print_error "Please ensure kind-config.yaml exists with your cluster configuration."
        exit 1
    fi

    # Check for encryption directory if referenced in config
    if grep -q "infrastructure/platform/security/encryption" kind-config.yaml; then
        if [ ! -d "./infrastructure/platform/security/encryption" ]; then
            print_error "Encryption directory './infrastructure/platform/security/encryption' not found"
            print_error "This directory is required for Kubernetes secrets encryption"
            print_status "Attempting to create encryption configuration..."

            # Create the directory
            mkdir -p "./infrastructure/platform/security/encryption"

            # Generate a new encryption key
            ENCRYPTION_KEY=$(openssl rand -base64 32)

            # Create the encryption configuration file
            # (standard EncryptionConfiguration for secrets using the generated aescbc key)
            cat > "./infrastructure/platform/security/encryption/encryption-config.yaml" <<EOF
apiVersion: apiserver.config.k8s.io/v1
kind: EncryptionConfiguration
resources:
  - resources:
      - secrets
    providers:
      - aescbc:
          keys:
            - name: key1
              secret: ${ENCRYPTION_KEY}
      - identity: {}
EOF
            print_success "Encryption configuration created"
        fi
    fi
}

# Function to create the local Docker registry
create_local_registry() {
    print_status "Creating local registry ${REGISTRY_NAME}..."

    if ! is_registry_running; then
        # Remove any stopped container left over from a previous run
        if docker ps -a --format '{{.Names}}' | grep -q "^${REGISTRY_NAME}$"; then
            docker rm -f "${REGISTRY_NAME}" 2>/dev/null || true
        fi

        docker run \
            -d --restart=always \
            -p "127.0.0.1:${REGISTRY_PORT}:5000" \
            --name "${REGISTRY_NAME}" \
            registry:2
        if [ $? -eq 0 ]; then
-eq 0 ]; then print_success "Local registry created at localhost:${REGISTRY_PORT}" else print_error "Failed to create local registry after pulling image" exit 1 fi else print_error "Failed to pull registry image" exit 1 fi fi else print_success "Local registry already running at localhost:${REGISTRY_PORT}" fi # Store registry info for later use echo "${REGISTRY_NAME}:${REGISTRY_PORT}" } # Function to connect registry to Kind with better error handling connect_registry_to_kind() { print_status "Connecting registry to Kind network..." # Check if Kind network exists if ! docker network ls | grep -q "kind"; then print_error "Kind network not found. Please create Kind cluster first." return 1 fi # Connect the registry to the cluster network if not already connected if [ "$(docker inspect -f='{{json .NetworkSettings.Networks.kind}}' "${REGISTRY_NAME}")" = 'null' ]; then docker network connect "kind" "${REGISTRY_NAME}" if [ $? -eq 0 ]; then print_success "Registry connected to Kind network" else print_error "Failed to connect registry to Kind network" return 1 fi else print_success "Registry already connected to Kind network" fi # Configure containerd in the Kind node to use the registry print_status "Configuring containerd to use local registry..." # Check if control plane container exists if ! docker ps | grep -q "${KIND_CLUSTER}-control-plane"; then print_error "Control plane container not found. Kind cluster may not be running." return 1 fi # Create registry config directories for all registry names print_status "Creating registry configuration directories..." docker exec "${KIND_CLUSTER}-control-plane" mkdir -p "/etc/containerd/certs.d/localhost:${REGISTRY_PORT}" docker exec "${KIND_CLUSTER}-control-plane" mkdir -p "/etc/containerd/certs.d/localhost:5001" docker exec "${KIND_CLUSTER}-control-plane" mkdir -p "/etc/containerd/certs.d/${REGISTRY_NAME}:5000" # Add registry configuration for localhost:5000 docker exec "${KIND_CLUSTER}-control-plane" sh -c "cat > /etc/containerd/certs.d/localhost:${REGISTRY_PORT}/hosts.toml < /etc/containerd/certs.d/localhost:5001/hosts.toml < /etc/containerd/certs.d/${REGISTRY_NAME}:5000/hosts.toml < "$temp_config" << 'DOCKERCONFIG' { "exec-opts": ["native.cgroupdriver=cgroupfs"], "features": { "buildkit": true, "containerd-snapshotter": true } } DOCKERCONFIG # Copy the configuration to Colima VM using stdin if cat "$temp_config" | colima --profile "$COLIMA_PROFILE" ssh -- sudo tee /etc/docker/daemon.json > /dev/null; then print_success "Docker daemon configuration written" rm -f "$temp_config" # Restart Docker service to apply the configuration print_status "Restarting Docker service to apply configuration..." if colima --profile "$COLIMA_PROFILE" ssh -- sudo systemctl restart docker; then print_success "Docker service restarted successfully" # Wait for Docker to be ready sleep 3 return 0 else print_error "Failed to restart Docker service" return 1 fi else print_error "Failed to write Docker daemon configuration" rm -f "$temp_config" return 1 fi } # Function to ensure correct Docker context is used ensure_docker_context() { if ! docker version >/dev/null 2>&1; then print_warning "Docker daemon is not accessible, attempting to set correct context..." 
        if is_colima_running; then
            # Look for the correct Colima Docker context
            COLIMA_CONTEXT=$(docker context ls --format='{{.Name}}' | grep -E "^colima($|-[[:alnum:]_-]+)" | head -1)

            if [ -n "$COLIMA_CONTEXT" ]; then
                print_status "Switching Docker context to $COLIMA_CONTEXT"
                docker context use "$COLIMA_CONTEXT" >/dev/null 2>&1
                if [ $? -eq 0 ]; then
                    print_success "Docker context switched to $COLIMA_CONTEXT"
                    sleep 2 # Give Docker a moment to establish connection
                else
                    print_error "Failed to switch Docker context to $COLIMA_CONTEXT"
                    exit 1
                fi
            else
                print_error "No Colima Docker context found. Please ensure Colima is properly configured."
                exit 1
            fi
        fi
    fi
}

# Function to handle setup with better error handling
setup() {
    print_status "Starting setup process..."

    # Ensure correct Docker context is used
    ensure_docker_context

    # Check for required config files
    check_config_files

    # 1. Start Colima with adequate resources
    print_status "Starting Colima with ${COLIMA_CPU} CPU, ${COLIMA_MEMORY}GB memory, ${COLIMA_DISK}GB disk..."
    if ! is_colima_running; then
        colima start --cpu "$COLIMA_CPU" --memory "$COLIMA_MEMORY" --disk "$COLIMA_DISK" --runtime docker --profile "$COLIMA_PROFILE"
        if [ $? -eq 0 ]; then
            print_success "Colima started successfully"
            # Configure Docker daemon with insecure registries
            configure_docker_daemon
        else
            print_error "Failed to start Colima"
            print_status "Checking Colima status..."
            colima status --profile "$COLIMA_PROFILE"
            exit 1
        fi
    else
        print_success "Colima is already running"
        # Configure Docker daemon with insecure registries even if Colima was already running
        configure_docker_daemon
    fi

    # 2. Create local registry before Kind cluster
    create_local_registry

    # 3. Create Kind cluster using existing configuration with registry support
    print_status "Creating Kind cluster with registry configuration..."
    if ! is_kind_cluster_running; then
        if [ -f kind-config.yaml ]; then
            print_status "Using kind-config.yaml with local registry support"

            # Extract cluster name from config for verification (fall back to the default name)
            CLUSTER_NAME=$(grep -E "^name:[[:space:]]*" kind-config.yaml | head -1 | sed 's/^name:[[:space:]]*//' | tr -d '[:space:]')
            CLUSTER_NAME=${CLUSTER_NAME:-$KIND_CLUSTER}
            print_status "Creating cluster: $CLUSTER_NAME"

            kind create cluster --config kind-config.yaml
            if [ $? -eq 0 ]; then
                print_success "Kind cluster created successfully"
            else
                print_error "Failed to create Kind cluster"
                print_status "Checking for existing clusters..."
                kind get clusters
                exit 1
            fi
        else
            print_error "kind-config.yaml file not found!"
            exit 1
        fi
    else
        print_success "Kind cluster $KIND_CLUSTER is already running"
    fi

    # 4. Connect registry to Kind network
    connect_registry_to_kind

    # 5. Install NGINX Ingress Controller
    print_status "Installing NGINX Ingress Controller..."

    # Apply the ingress-nginx manifest
    kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/main/deploy/static/provider/kind/deploy.yaml
    if [ $? -eq 0 ]; then
        print_success "NGINX Ingress Controller manifest applied"
    else
        print_error "Failed to apply NGINX Ingress Controller manifest"
        exit 1
    fi

    # Wait for ingress-nginx pods to be ready with retry logic
    wait_for_pods "ingress-nginx" "app.kubernetes.io/component=controller" 300
    if [ $? -ne 0 ]; then
        print_error "NGINX Ingress Controller failed to become ready"
        print_status "Checking pod status for debugging..."
        kubectl get pods -n ingress-nginx
        kubectl describe pods -n ingress-nginx
        exit 1
    fi

    print_success "NGINX Ingress Controller ready (using Kind's built-in NodePort configuration)"

    # 6. Install cert-manager
    print_status "Installing cert-manager..."
    # Check if cert-manager is already installed
    if kubectl get namespace cert-manager &>/dev/null; then
        print_status "cert-manager namespace already exists, checking if it's properly installed..."
        if kubectl get deployment -n cert-manager cert-manager-webhook &>/dev/null; then
            print_success "cert-manager is already installed and running"
        else
            print_status "cert-manager namespace exists but components are not running, reinstalling..."
            install_cert_manager
        fi
    else
        install_cert_manager
    fi

    # 7. Verify port mappings from kind-config.yaml
    print_status "Verifying port mappings from configuration..."

    # Extract ports from kind-config.yaml (fall back to 80/443 if not found)
    HTTP_HOST_PORT=$(grep -A1 "containerPort: 30080" kind-config.yaml | grep "hostPort:" | awk '{print $2}')
    HTTP_HOST_PORT=${HTTP_HOST_PORT:-80}
    HTTPS_HOST_PORT=$(grep -A1 "containerPort: 30443" kind-config.yaml | grep "hostPort:" | awk '{print $2}')
    HTTPS_HOST_PORT=${HTTPS_HOST_PORT:-443}

    # Print cluster info
    echo ""
    print_success "Setup completed successfully!"
    echo "----------------------------------------"
    print_status "Cluster Information:"
    echo "  - Colima profile: $COLIMA_PROFILE"
    echo "  - Kind cluster: ${CLUSTER_NAME:-$KIND_CLUSTER}"
    echo "  - Local registry: localhost:${REGISTRY_PORT}"
    echo ""
    print_status "Port Mappings (configured in kind-config.yaml):"
    echo "  - HTTP Ingress:    localhost:${HTTP_HOST_PORT} -> Kind NodePort 30080"
    echo "  - HTTPS Ingress:   localhost:${HTTPS_HOST_PORT} -> Kind NodePort 30443"
    echo "  - Frontend Direct: localhost:3000 -> container:30300"
    echo "  - Gateway Direct:  localhost:8000 -> container:30800"
    echo ""
    print_status "How to access your application:"
    echo "  1. Start Tilt: tilt up"
    echo "  2. Access via:"
    echo "     - Ingress: http://localhost (or https://localhost)"
    echo "     - Direct: http://localhost:3000 (frontend), http://localhost:8000 (gateway)"
    echo "     - Tilt UI: http://localhost:10350"
    echo "----------------------------------------"
    print_status "Local Registry Information:"
    echo "  - Registry URL: localhost:${REGISTRY_PORT}"
    echo "  - Images pushed to: localhost:${REGISTRY_PORT}/bakery/"
    echo "  - Tiltfile already configured: default_registry('localhost:${REGISTRY_PORT}')"
    echo "----------------------------------------"
}

# Function to show usage
usage() {
    echo "Usage: $0 [option]"
    echo ""
    echo "Options:"
    echo "  cleanup   Clean up all resources (namespace, cluster, colima)"
    echo "  setup     Set up the complete environment"
    echo "  full      Clean up first, then set up (default)"
    echo "  help      Show this help message"
    echo ""
    echo "Requirements:"
    echo "  - kind-config.yaml must exist in the current directory"
    echo "  - For encryption: ./infrastructure/platform/security/encryption directory"
    echo "  - Docker, Colima, Kind, kubectl, and jq must be installed"
}

# Function to install cert-manager
install_cert_manager() {
    print_status "Installing cert-manager..."

    # Create cert-manager namespace
    kubectl create namespace cert-manager --dry-run=client -o yaml | kubectl apply -f -

    # Install cert-manager CRDs and components
    kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.4/cert-manager.yaml
    if [ $? -eq 0 ]; then
        print_success "cert-manager manifests applied"
    else
        print_error "Failed to apply cert-manager manifests"
        exit 1
    fi

    # Wait for cert-manager pods to be ready with retry logic
    print_status "Waiting for cert-manager pods to be ready..."
    local max_retries=30
    local retry_count=0

    while [ $retry_count -lt $max_retries ]; do
        # Check if all cert-manager pods are ready
        local ready_pods=$(kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -c "Running")
        local total_pods=$(kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -v "NAME" | wc -l)
        total_pods=$(echo "$total_pods" | tr -d ' ')

        if [ "$ready_pods" -eq "$total_pods" ] && [ "$total_pods" -gt 0 ]; then
            # Double-check that all pods are actually ready (1/1, 2/2, etc.)
            local all_ready=true
            while IFS= read -r line; do
                local ready_status=$(echo "$line" | awk '{print $2}')
                local desired_ready=($(echo "$ready_status" | tr '/' ' '))
                if [ "${desired_ready[0]}" -ne "${desired_ready[1]}" ]; then
                    all_ready=false
                    break
                fi
            done <<< "$(kubectl get pods -n cert-manager --no-headers 2>/dev/null | grep -v "NAME")"

            if [ "$all_ready" = true ]; then
                print_success "cert-manager is ready with $ready_pods/$total_pods pods running"
                return 0
            fi
        fi

        retry_count=$((retry_count + 1))
        print_status "Waiting for cert-manager pods to be ready... (attempt $retry_count/$max_retries)"
        sleep 10
    done

    print_error "Timed out waiting for cert-manager pods after $((max_retries * 10)) seconds"
    print_status "Checking cert-manager pod status for debugging..."
    kubectl get pods -n cert-manager
    kubectl describe pods -n cert-manager
    exit 1
}

# Function to check prerequisites
check_prerequisites() {
    print_status "Checking prerequisites..."

    local missing_commands=()
    for cmd in docker colima kind kubectl jq; do
        if ! check_command "$cmd"; then
            missing_commands+=("$cmd")
        fi
    done

    if [ ${#missing_commands[@]} -gt 0 ]; then
        print_error "Missing required commands: ${missing_commands[*]}"
        print_error "Please install them before running this script."
        exit 1
    fi

    # Check if Docker daemon is accessible
    if ! docker version >/dev/null 2>&1; then
        print_warning "Docker daemon is not accessible with current context"

        # Check if Colima is running and try to set Docker context accordingly
        if is_colima_running; then
            print_status "Colima is running, checking for correct Docker context..."

            # Look for the correct Colima Docker context
            COLIMA_CONTEXT=$(docker context ls --format='{{.Name}}' | grep -E "^colima($|-[[:alnum:]_-]+)" | head -1)

            if [ -n "$COLIMA_CONTEXT" ]; then
                print_status "Switching Docker context to $COLIMA_CONTEXT"
                docker context use "$COLIMA_CONTEXT" >/dev/null 2>&1
                if [ $? -eq 0 ]; then
                    print_success "Docker context switched to $COLIMA_CONTEXT"
                else
                    print_error "Failed to switch Docker context to $COLIMA_CONTEXT"
                    exit 1
                fi
            else
                print_error "No Colima Docker context found. Please ensure Colima is properly configured."
                exit 1
            fi
        else
            print_warning "Docker daemon is not running and Colima is not running. Will start Colima during setup."
            # For setup operations, we can continue without Docker being accessible yet
            # since Colima will be started as part of the setup process
            if [[ "${1:-full}" == "setup" ]]; then
                print_status "Continuing with setup since this is a setup operation..."
            elif [[ "${1:-full}" == "full" ]]; then
                print_status "Continuing with full operation (cleanup + setup)..."
            else
                print_error "Docker daemon is not running and Colima is not running. Please start Docker or Colima first."
                exit 1
            fi
        fi
    fi

    print_success "All prerequisites are met"
}

# Main script logic
main() {
    # Check prerequisites first (pass the requested operation through so the
    # Docker-availability check knows whether Colima will be started later)
    check_prerequisites "$1"

    case "${1:-full}" in
        "cleanup")
            cleanup
            ;;
        "setup")
            setup
            ;;
        "full")
            cleanup
            setup
            ;;
        "help"|"-h"|"--help")
            usage
            ;;
        *)
            print_warning "Unknown option: $1"
            echo ""
            usage
            exit 1
            ;;
    esac
}

# Run main function
main "$@"
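
# Example invocations (a usage sketch; the file name "restart-k8s.sh" below is
# an assumed name for this script, not defined anywhere above):
#   ./restart-k8s.sh cleanup   # tear down namespace, Kind cluster, registry, and Colima
#   ./restart-k8s.sh setup     # build the local environment from scratch
#   ./restart-k8s.sh           # default "full": cleanup followed by setup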