|
#!/usr/bin/env bash
set -eo pipefail

# ==============================================
# Cluster Initialization Script - Production Grade
# ==============================================

# Global Configuration
CLUSTER_NAME="biconic-prod"
CLOUD_PROVIDER="aws" # aws|azure|gcp
ENVIRONMENT="prod"
REGION="us-west-2"
K8S_VERSION="1.28"
DEPLOY_NAMESPACE="biconic-system"
ADMIN_EMAIL="admin@biconic.ai"
ROOT_DIR="/opt/aelion"
BACKUP_DIR="${ROOT_DIR}/backups"
LOG_FILE="/var/log/cluster-init.log"

# Infrastructure Sizing
CONTROL_PLANE_NODES=3
WORKER_NODES=5
NODE_INSTANCE_TYPE="m6i.4xlarge"
STORAGE_SIZE="500Gi"

# Security Parameters
TLS_VALIDITY_DAYS=3650
# NOTE(review): regenerated on every run — a re-run produces a key that does
# not match previously provisioned infrastructure. Confirm this is intended,
# or persist the key on first generation.
ENCRYPTION_KEY="$(openssl rand -base64 32)"
ADMIN_CERT_DAYS=730

# Dependency Versions
HELM_VERSION="3.12.3"
TERRAFORM_VERSION="1.5.7"
KUSTOMIZE_VERSION="5.0.3"

# Ensure working directories exist before any phase writes into them.
mkdir -p "${ROOT_DIR}" "${BACKUP_DIR}" 2>/dev/null \
  || echo "WARN: could not create ${ROOT_DIR}; later phases may fail" >&2

# Initialize logging. This executes BEFORE the root check in preflight_checks,
# so fall back to a local log file when the privileged path is not writable —
# otherwise the redirect itself aborts the script and the error is lost.
if ! touch "${LOG_FILE}" 2>/dev/null; then
  LOG_FILE="./cluster-init.log"
fi
exec > >(tee -a "${LOG_FILE}") 2>&1
| 38 | + |
| 39 | +# Phase 1: Pre-flight Checks |
#######################################
# Phase 1: Pre-flight checks.
# Verifies root privileges, required CLI tools + versions, cloud credentials
# for the selected provider, and free disk space. Exits 1 on any failure.
# Globals: HELM_VERSION, TERRAFORM_VERSION, CLOUD_PROVIDER (read)
#######################################
function preflight_checks() {
  echo "=== STARTING PREFLIGHT CHECKS ==="

  # Verify execution context
  if [[ $(id -u) -ne 0 ]]; then
    echo "ERROR: Must be run as root" >&2
    exit 1
  fi

  # Required tools and the version substring each must report.
  declare -A REQUIRED_CMDS=(
    ["kubectl"]="1.28"
    ["helm"]="${HELM_VERSION}"
    ["terraform"]="${TERRAFORM_VERSION}"
    ["jq"]="1.6"
    ["openssl"]="3.0"
  )

  local cmd version
  for cmd in "${!REQUIRED_CMDS[@]}"; do
    if ! command -v "${cmd}" &> /dev/null; then
      echo "ERROR: ${cmd} not found" >&2
      exit 1
    fi

    # Not every tool understands `--version`: kubectl needs
    # `kubectl version --client` and openssl needs `openssl version`;
    # the generic invocation would fail for both and always trip the check.
    case "${cmd}" in
      kubectl) version=$(kubectl version --client 2>&1 | head -n1) ;;
      openssl) version=$(openssl version 2>&1 | head -n1) ;;
      *)       version=$(${cmd} --version 2>&1 | head -n1) ;;
    esac

    # Literal substring match (glob), not `=~`: as a regex, the dots in
    # "1.28" would match any character.
    if [[ "${version}" != *"${REQUIRED_CMDS[$cmd]}"* ]]; then
      echo "ERROR: ${cmd} version mismatch" >&2
      exit 1
    fi
  done

  # Validate cloud credentials. ${VAR:-} keeps these checks safe even if the
  # script is later tightened to `set -u`.
  case "${CLOUD_PROVIDER}" in
    aws)
      if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
        echo "ERROR: AWS credentials not configured" >&2
        exit 1
      fi
      ;;
    azure)
      if [[ -z "${AZURE_SUBSCRIPTION_ID:-}" || -z "${AZURE_TENANT_ID:-}" ]]; then
        echo "ERROR: Azure credentials not configured" >&2
        exit 1
      fi
      ;;
    gcp)
      if [[ -z "${GOOGLE_CREDENTIALS:-}" ]]; then
        echo "ERROR: GCP credentials not configured" >&2
        exit 1
      fi
      ;;
    *)
      echo "ERROR: Unsupported cloud provider" >&2
      exit 1
      ;;
  esac

  # Check storage availability: 52428800 KiB == 50 GiB free on / .
  # (GNU df only — `--output` is not portable to BSD/macOS.)
  if [[ $(df --output=avail / | tail -1) -lt 52428800 ]]; then
    echo "ERROR: Insufficient disk space" >&2
    exit 1
  fi

  echo "=== PREFLIGHT CHECKS PASSED ==="
}
| 105 | + |
| 106 | +# Phase 2: Infrastructure Provisioning |
#######################################
# Phase 2: Infrastructure provisioning.
# Renders a Terraform module config, applies it non-interactively, then
# wires up kubectl credentials for the chosen cloud and verifies access.
# Globals: CLOUD_PROVIDER, CLUSTER_NAME, REGION, K8S_VERSION, WORKER_NODES,
#          CONTROL_PLANE_NODES, NODE_INSTANCE_TYPE, STORAGE_SIZE,
#          ENCRYPTION_KEY, ENVIRONMENT (read)
#######################################
function provision_infrastructure() {
  echo "=== PROVISIONING CLUSTER INFRASTRUCTURE ==="

  # SECURITY: cluster.tf embeds ENCRYPTION_KEY in plaintext. Lock down the
  # file permissions before writing the secret into it; longer term this
  # belongs in a secrets backend, not on disk.
  rm -f cluster.tf
  touch cluster.tf
  chmod 600 cluster.tf

  # Generate Terraform configuration (heredoc is intentionally unquoted so
  # the shell variables expand into the rendered file).
  cat <<EOF > cluster.tf
module "aelion_cluster" {
  source  = "terraform-${CLOUD_PROVIDER}-modules/kubernetes-cluster/${CLOUD_PROVIDER}"
  version = "4.12.0"

  cluster_name        = "${CLUSTER_NAME}"
  region              = "${REGION}"
  k8s_version         = "${K8S_VERSION}"
  node_count          = ${WORKER_NODES}
  control_plane_count = ${CONTROL_PLANE_NODES}
  node_instance_type  = "${NODE_INSTANCE_TYPE}"
  storage_size        = "${STORAGE_SIZE}"

  enable_autoscaling = true
  min_nodes          = 3
  max_nodes          = 10

  enable_encryption = true
  encryption_key    = "${ENCRYPTION_KEY}"

  tags = {
    Environment = "${ENVIRONMENT}"
    ManagedBy   = "Aelion AI"
  }
}
EOF

  # Initialize and apply Terraform. -input=false prevents an unattended run
  # from hanging on an interactive prompt.
  terraform init -input=false
  terraform apply -auto-approve -input=false

  # Configure kubectl context
  case "${CLOUD_PROVIDER}" in
    aws)
      aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${REGION}"
      ;;
    azure)
      az aks get-credentials --resource-group "${CLUSTER_NAME}-rg" --name "${CLUSTER_NAME}"
      ;;
    gcp)
      gcloud container clusters get-credentials "${CLUSTER_NAME}" --region "${REGION}"
      ;;
  esac

  # Verify cluster access
  if ! kubectl cluster-info; then
    echo "ERROR: Cluster connection failed" >&2
    exit 1
  fi

  echo "=== INFRASTRUCTURE PROVISIONING COMPLETE ==="
}
| 163 | + |
| 164 | +# Phase 3: Cluster Bootstrapping |
# Phase 3 entry point: creates the deploy namespace, then installs core
# platform components in dependency order (networking first).
function bootstrap_cluster() {
  echo "=== BOOTSTRAPPING CLUSTER COMPONENTS ==="

  # Create namespace idempotently: client-side dry-run renders the manifest,
  # apply makes it a no-op if the namespace already exists.
  kubectl create namespace "${DEPLOY_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

  # Deploy core components.
  # NOTE(review): deploy_csi, deploy_ingress_controller, deploy_metrics_server,
  # deploy_efk_logging, deploy_vault, and deploy_backup_operator are NOT
  # defined in this file — confirm they are sourced from elsewhere, otherwise
  # execution aborts here with "command not found" under `set -e`.
  deploy_cni
  deploy_csi
  deploy_ingress_controller
  deploy_cert_manager
  deploy_metrics_server
  deploy_prometheus_stack
  deploy_efk_logging
  deploy_vault
  deploy_backup_operator
}
| 182 | + |
# Install Cilium as the cluster CNI (kube-proxy replacement, Hubble
# relay + UI enabled) into kube-system.
function deploy_cni() {
  echo "--- Deploying CNI (Cilium) ---"
  helm repo add cilium https://helm.cilium.io/

  local -a cilium_opts=(
    --namespace kube-system
    --set kubeProxyReplacement=strict
    --set "k8sServiceHost=api-server.${CLUSTER_NAME}.internal"
    --set hubble.relay.enabled=true
    --set hubble.ui.enabled=true
  )
  helm upgrade --install cilium cilium/cilium "${cilium_opts[@]}"
}
| 193 | + |
# Install cert-manager v1.12.0 and register a Let's Encrypt production
# ClusterIssuer (HTTP-01 via the nginx ingress class).
# Globals: ADMIN_EMAIL (read)
function deploy_cert_manager() {
  echo "--- Deploying Cert Manager ---"
  kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.12.0/cert-manager.yaml

  # Wait for ALL three cert-manager deployments, not just the controller:
  # applying a ClusterIssuer before cert-manager-webhook is serving fails the
  # admission webhook call — a well-known bootstrap race.
  local deploy
  for deploy in cert-manager cert-manager-webhook cert-manager-cainjector; do
    kubectl wait --for=condition=Available "deployment/${deploy}" \
      -n cert-manager --timeout=300s
  done

  # Create cluster issuer
  cat <<EOF | kubectl apply -f -
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    server: https://acme-v02.api.letsencrypt.org/directory
    email: ${ADMIN_EMAIL}
    privateKeySecretRef:
      name: letsencrypt-prod
    solvers:
    - http01:
        ingress:
          class: nginx
EOF
}
| 219 | + |
# Install the kube-prometheus-stack bundle (Prometheus, Alertmanager,
# Grafana) into its own `monitoring` namespace with 30-day retention.
function deploy_prometheus_stack() {
  echo "--- Deploying Monitoring Stack ---"
  helm repo add prometheus-community https://prometheus-community.github.io/helm-charts

  local -a chart_opts=(
    --namespace monitoring
    --create-namespace
    --set alertmanager.enabled=true
    --set grafana.enabled=true
    --set prometheus.prometheusSpec.retention=30d
  )
  helm upgrade --install kube-prometheus prometheus-community/kube-prometheus-stack "${chart_opts[@]}"
}
| 230 | + |
| 231 | +# Phase 4: Security Hardening |
#######################################
# Phase 4: Security hardening.
# Generates TLS material, creates the TLS secret, applies a default-deny
# NetworkPolicy, installs an admin ClusterRole, and mints an admin client
# certificate registered into the local kubeconfig.
# Globals: TLS_VALIDITY_DAYS, ADMIN_CERT_DAYS, ROOT_DIR, DEPLOY_NAMESPACE (read)
#######################################
function harden_security() {
  echo "=== APPLYING SECURITY CONFIGURATIONS ==="

  # Generate a self-signed TLS certificate and key.
  openssl req -x509 -newkey rsa:4096 -days "${TLS_VALIDITY_DAYS}" -nodes \
    -keyout "${ROOT_DIR}/tls.key" -out "${ROOT_DIR}/tls.crt" \
    -subj "/CN=aelion.ai/O=Aelion AI"
  # Private keys must not be world-readable (default umask leaves them 0644).
  chmod 600 "${ROOT_DIR}/tls.key"

  # Create the secret idempotently: a bare `kubectl create` fails when the
  # secret already exists, which aborts re-runs under `set -e`.
  kubectl create secret tls aelion-tls \
    --key="${ROOT_DIR}/tls.key" \
    --cert="${ROOT_DIR}/tls.crt" \
    --namespace="${DEPLOY_NAMESPACE}" \
    --dry-run=client -o yaml | kubectl apply -f -

  # Apply default-deny network policy for the namespace (all ingress+egress).
  cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny
  namespace: ${DEPLOY_NAMESPACE}
spec:
  podSelector: {}
  policyTypes:
  - Ingress
  - Egress
EOF

  # Configure RBAC.
  # NOTE(review): */*/* is equivalent to cluster-admin — scope this down to
  # the verbs/resources the platform actually needs if at all possible.
  kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: aelion-admin
rules:
- apiGroups: ["*"]
  resources: ["*"]
  verbs: ["*"]
EOF

  # Create an admin client certificate (O=system:masters) signed by the
  # self-signed cert generated above.
  # NOTE(review): the API server only accepts client certs signed by its own
  # client-CA bundle — confirm tls.crt is added to that bundle, otherwise
  # this credential cannot authenticate.
  openssl genrsa -out "${ROOT_DIR}/admin.key" 2048
  chmod 600 "${ROOT_DIR}/admin.key"
  openssl req -new -key "${ROOT_DIR}/admin.key" \
    -out "${ROOT_DIR}/admin.csr" \
    -subj "/CN=admin/O=system:masters"
  openssl x509 -req -in "${ROOT_DIR}/admin.csr" \
    -CA "${ROOT_DIR}/tls.crt" -CAkey "${ROOT_DIR}/tls.key" -CAcreateserial \
    -out "${ROOT_DIR}/admin.crt" -days "${ADMIN_CERT_DAYS}"

  # Register the admin credentials in the local kubeconfig.
  kubectl config set-credentials admin \
    --client-certificate="${ROOT_DIR}/admin.crt" \
    --client-key="${ROOT_DIR}/admin.key"
}
| 286 | + |
| 287 | +# Phase 5: Application Deployment |
#######################################
# Phase 5: Application deployment.
# Creates PVs, installs PostgreSQL-HA and Kafka from the bitnami charts,
# applies the environment kustomize overlay, and waits for core workloads.
# Globals: ROOT_DIR, DEPLOY_NAMESPACE, STORAGE_SIZE, ENVIRONMENT (read)
#######################################
function deploy_applications() {
  echo "=== DEPLOYING AELION COMPONENTS ==="

  # Create persistent volumes
  kubectl apply -f "${ROOT_DIR}/storage/"

  # The bitnami repo is referenced below but is never registered anywhere in
  # this script (unlike cilium / prometheus-community); without this line both
  # installs fail with "repo bitnami not found".
  helm repo add bitnami https://charts.bitnami.com/bitnami

  # Deploy databases
  helm upgrade --install postgresql bitnami/postgresql-ha \
    --namespace="${DEPLOY_NAMESPACE}" \
    --values "${ROOT_DIR}/postgresql-values.yaml"

  # Deploy Kafka
  helm upgrade --install kafka bitnami/kafka \
    --namespace="${DEPLOY_NAMESPACE}" \
    --set replicas=3 \
    --set "persistence.size=${STORAGE_SIZE}"

  # Deploy core services
  kubectl apply -k "${ROOT_DIR}/kustomize/overlays/${ENVIRONMENT}"

  # Wait for readiness
  kubectl rollout status deployment/aelion-orchestrator -n "${DEPLOY_NAMESPACE}" --timeout=600s
  kubectl rollout status statefulset/aelion-agents -n "${DEPLOY_NAMESPACE}" --timeout=600s
}
| 312 | + |
| 313 | +# Phase 6: Validation & Testing |
#######################################
# Phase 6: Validation & testing.
# Checks each core deployment reports the expected ready-replica count,
# smoke-tests the public API health endpoint, and runs the pipeline test job.
# Globals: DEPLOY_NAMESPACE, ROOT_DIR (read)
#######################################
function validate_deployment() {
  echo "=== VALIDATING DEPLOYMENT ==="

  # Expected ready replicas per deployment.
  declare -A DEPLOYMENTS=(
    ["aelion-orchestrator"]=3
    ["aelion-api-gateway"]=2
    ["aelion-metrics-collector"]=2
  )

  local dep replicas
  for dep in "${!DEPLOYMENTS[@]}"; do
    # .status.readyReplicas is ABSENT (jsonpath yields "") until at least one
    # pod is ready; default to 0 so the numeric comparison cannot error out.
    replicas=$(kubectl get "deployment/${dep}" -n "${DEPLOY_NAMESPACE}" \
      -o jsonpath='{.status.readyReplicas}')
    if [[ "${replicas:-0}" -ne "${DEPLOYMENTS[$dep]}" ]]; then
      echo "ERROR: ${dep} not ready" >&2
      exit 1
    fi
  done

  # Run smoke tests.
  # NOTE(review): -k disables TLS verification — tolerable while the endpoint
  # serves the bootstrap self-signed cert, but confirm before relying on this.
  API_ENDPOINT="https://api.aelion.ai/health"
  if ! curl -sk "${API_ENDPOINT}" | grep -q "OK"; then
    echo "ERROR: API health check failed" >&2
    exit 1
  fi

  # Validate data pipeline
  kubectl apply -f "${ROOT_DIR}/tests/pipeline-test.yaml"
  kubectl wait --for=condition=complete job/pipeline-test -n "${DEPLOY_NAMESPACE}" --timeout=300s
}
| 343 | + |
| 344 | +# Phase 7: Backup Configuration |
#######################################
# Phase 7: Backup configuration.
# Installs Velero with a daily 02:00 schedule (30-day TTL) and takes an
# initial backup of the deploy namespace.
# Globals: DEPLOY_NAMESPACE (read)
#######################################
function configure_backups() {
  echo "=== CONFIGURING BACKUP SYSTEMS ==="

  # The vmware-tanzu chart repo is never registered elsewhere in this script;
  # without this line the install below fails with "repo vmware-tanzu not found".
  helm repo add vmware-tanzu https://vmware-tanzu.github.io/helm-charts

  # Create backup schedule.
  # NOTE(review): the storage location is hard-coded to AWS even though the
  # script claims azure|gcp support — confirm, or branch on CLOUD_PROVIDER.
  helm upgrade --install velero vmware-tanzu/velero \
    --namespace velero \
    --create-namespace \
    --set configuration.backupStorageLocation[0].name=aws \
    --set configuration.backupStorageLocation[0].provider=aws \
    --set configuration.backupStorageLocation[0].bucket=aelion-backups \
    --set schedules.daily.schedule="0 2 * * *" \
    --set schedules.daily.ttl="720h"

  # Initial backup
  velero backup create initial-deployment --include-namespaces="${DEPLOY_NAMESPACE}"
}
| 361 | + |
| 362 | +# Main Execution |
# Orchestrates the seven initialization phases in strict order; `set -e`
# plus the ERR trap abort the run on the first failing phase.
function main() {
  local -a phases=(
    preflight_checks
    provision_infrastructure
    bootstrap_cluster
    harden_security
    deploy_applications
    validate_deployment
    configure_backups
  )

  local phase
  for phase in "${phases[@]}"; do
    "${phase}"
  done

  echo "=== CLUSTER INITIALIZATION COMPLETE ==="
  echo "Dashboard URL: https://dashboard.aelion.ai"
  echo "Admin credentials stored in: ${ROOT_DIR}/admin.crt"
}
| 376 | + |
# Execute with error trapping.
# `set -E` (errtrace) is required for the ERR trap to fire inside functions —
# without it the trap only covers top-level commands, and nearly all work here
# happens inside functions.
set -E
trap 'echo "ERROR at line ${LINENO}" >&2; exit 1' ERR
main "$@"
0 commit comments