Skip to content

Commit 1918c35

Browse files
authored
Create cluster-init.sh
1 parent 6aa7696 commit 1918c35

File tree

1 file changed

+379
-0
lines changed

1 file changed

+379
-0
lines changed

scripts/deploy/cluster-init.sh

Lines changed: 379 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,379 @@
1+
#!/usr/bin/env bash
#
# cluster-init.sh - provisions and bootstraps the production cluster.
#
# This preamble only defines configuration and sets up logging; the phases
# themselves are functions defined further down and driven by main().
#
# Optional environment:
#   ENCRYPTION_KEY  key handed to Terraform. Generated with
#                   `openssl rand -base64 32` when unset or empty.
set -eo pipefail

# ==============================================
# Cluster Initialization Script - Production Grade
# ==============================================

# Global Configuration
CLUSTER_NAME="biconic-prod"
CLOUD_PROVIDER="aws" # aws|azure|gcp
ENVIRONMENT="prod"
REGION="us-west-2"
K8S_VERSION="1.28"
DEPLOY_NAMESPACE="biconic-system"
ADMIN_EMAIL="admin@biconic.ai"
ROOT_DIR="/opt/aelion"
BACKUP_DIR="${ROOT_DIR}/backups"
LOG_FILE="/var/log/cluster-init.log"

# Infrastructure Sizing
CONTROL_PLANE_NODES=3
WORKER_NODES=5
NODE_INSTANCE_TYPE="m6i.4xlarge"
STORAGE_SIZE="500Gi"

# Security Parameters
TLS_VALIDITY_DAYS=3650
# A fresh random key on every invocation means a re-run passes Terraform a
# different key than the previous run did. Honor a caller-supplied value so
# the key can be pinned; fall back to generating one as before.
ENCRYPTION_KEY="${ENCRYPTION_KEY:-$(openssl rand -base64 32)}"
ADMIN_CERT_DAYS=730

# Dependency Versions
HELM_VERSION="3.12.3"
TERRAFORM_VERSION="1.5.7"
KUSTOMIZE_VERSION="5.0.3"

# Initialize logging: from here on stdout and stderr are both appended to
# LOG_FILE while still being shown to the caller.
exec > >(tee -a "${LOG_FILE}") 2>&1
38+
39+
# Phase 1: Pre-flight Checks
#
# Verifies, in order: root privileges, presence and version of the required
# CLI tools, cloud credentials for CLOUD_PROVIDER, and free space on /.
# Prints an "ERROR: ..." line to stderr and exits 1 on the first failure.
#
# Globals read: K8S_VERSION, HELM_VERSION, TERRAFORM_VERSION, CLOUD_PROVIDER,
#               and the provider-specific credential environment variables.
function preflight_checks() {
  echo "=== STARTING PREFLIGHT CHECKS ==="

  # Verify execution context
  if [[ $(id -u) -ne 0 ]]; then
    echo "ERROR: Must be run as root" >&2
    exit 1
  fi

  # Check dependencies: tool name -> version text that must appear in the
  # first line of the tool's version output.
  local -A required_cmds=(
    ["kubectl"]="${K8S_VERSION:-1.28}"
    ["helm"]="${HELM_VERSION}"
    ["terraform"]="${TERRAFORM_VERSION}"
    ["jq"]="1.6"
    ["openssl"]="3.0"
  )

  local cmd version_output version_line
  for cmd in "${!required_cmds[@]}"; do
    if ! command -v "${cmd}" &> /dev/null; then
      echo "ERROR: ${cmd} not found" >&2
      exit 1
    fi

    # The tools do not share a version flag: kubectl, helm and openssl use a
    # `version` subcommand, so a blanket `--version` fails for them. A failed
    # query leaves the output empty and is reported as a mismatch below.
    case "${cmd}" in
      kubectl) version_output=$(kubectl version --client 2>&1) || version_output="" ;;
      helm) version_output=$(helm version --short 2>&1) || version_output="" ;;
      terraform) version_output=$(terraform version 2>&1) || version_output="" ;;
      openssl) version_output=$(openssl version 2>&1) || version_output="" ;;
      *) version_output=$("${cmd}" --version 2>&1) || version_output="" ;;
    esac
    version_line=${version_output%%$'\n'*}

    # Literal substring match: as a regex the dots in "1.28" would match any
    # character.
    if [[ "${version_line}" != *"${required_cmds[${cmd}]}"* ]]; then
      echo "ERROR: ${cmd} version mismatch" >&2
      exit 1
    fi
  done

  # Validate cloud credentials (":-" keeps the checks safe under `set -u`)
  case "${CLOUD_PROVIDER}" in
    aws)
      if [[ -z "${AWS_ACCESS_KEY_ID:-}" || -z "${AWS_SECRET_ACCESS_KEY:-}" ]]; then
        echo "ERROR: AWS credentials not configured" >&2
        exit 1
      fi
      ;;
    azure)
      if [[ -z "${AZURE_SUBSCRIPTION_ID:-}" || -z "${AZURE_TENANT_ID:-}" ]]; then
        echo "ERROR: Azure credentials not configured" >&2
        exit 1
      fi
      ;;
    gcp)
      if [[ -z "${GOOGLE_CREDENTIALS:-}" ]]; then
        echo "ERROR: GCP credentials not configured" >&2
        exit 1
      fi
      ;;
    *)
      echo "ERROR: Unsupported cloud provider" >&2
      exit 1
      ;;
  esac

  # Check storage availability: df reports 1K blocks, so the threshold
  # 52428800 corresponds to 50 GiB free on /.
  local avail_kib
  avail_kib=$(df --output=avail / | tail -1)
  if (( avail_kib < 52428800 )); then
    echo "ERROR: Insufficient disk space" >&2
    exit 1
  fi

  echo "=== PREFLIGHT CHECKS PASSED ==="
}
105+
106+
# Phase 2: Infrastructure Provisioning
#
# Writes cluster.tf into the current directory, runs `terraform init` and
# `terraform apply`, points kubectl at the new cluster through the provider
# CLI, and verifies access with `kubectl cluster-info`.
# Exits 1 on an unsupported CLOUD_PROVIDER or when the cluster is unreachable.
#
# Globals read: CLOUD_PROVIDER, CLUSTER_NAME, REGION, K8S_VERSION,
#               WORKER_NODES, CONTROL_PLANE_NODES, NODE_INSTANCE_TYPE,
#               STORAGE_SIZE, ENCRYPTION_KEY, ENVIRONMENT.
function provision_infrastructure() {
  echo "=== PROVISIONING CLUSTER INFRASTRUCTURE ==="

  # Reject an unknown provider before anything is created; otherwise the
  # mistake is only noticed after `terraform apply` when no kubeconfig
  # branch matches.
  case "${CLOUD_PROVIDER}" in
    aws | azure | gcp) ;;
    *)
      echo "ERROR: Unsupported cloud provider" >&2
      exit 1
      ;;
  esac

  # Generate Terraform configuration.
  # The file embeds ENCRYPTION_KEY in plaintext, so restrict it to the owner
  # before any content is written (truncating via `>` keeps the mode).
  : > cluster.tf
  chmod 600 cluster.tf
  cat <<EOF > cluster.tf
module "aelion_cluster" {
  source  = "terraform-${CLOUD_PROVIDER}-modules/kubernetes-cluster/${CLOUD_PROVIDER}"
  version = "4.12.0"

  cluster_name        = "${CLUSTER_NAME}"
  region              = "${REGION}"
  k8s_version         = "${K8S_VERSION}"
  node_count          = ${WORKER_NODES}
  control_plane_count = ${CONTROL_PLANE_NODES}
  node_instance_type  = "${NODE_INSTANCE_TYPE}"
  storage_size        = "${STORAGE_SIZE}"

  enable_autoscaling = true
  min_nodes          = 3
  max_nodes          = 10

  enable_encryption = true
  encryption_key    = "${ENCRYPTION_KEY}"

  tags = {
    Environment = "${ENVIRONMENT}"
    ManagedBy   = "Aelion AI"
  }
}
EOF

  # Initialize and apply Terraform. -input=false makes a missing variable an
  # error instead of an interactive prompt that would hang an unattended run.
  terraform init -input=false
  terraform apply -input=false -auto-approve

  # Configure kubectl context
  case "${CLOUD_PROVIDER}" in
    aws)
      aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${REGION}"
      ;;
    azure)
      az aks get-credentials --resource-group "${CLUSTER_NAME}-rg" --name "${CLUSTER_NAME}"
      ;;
    gcp)
      gcloud container clusters get-credentials "${CLUSTER_NAME}" --region "${REGION}"
      ;;
  esac

  # Verify cluster access
  if ! kubectl cluster-info; then
    echo "ERROR: Cluster connection failed" >&2
    exit 1
  fi

  echo "=== INFRASTRUCTURE PROVISIONING COMPLETE ==="
}
163+
164+
# Phase 3: Cluster Bootstrapping
#
# Ensures DEPLOY_NAMESPACE exists, then runs the component installers in a
# fixed order. Every installer is resolved up front: if any of them is
# neither a function nor a command on PATH, the phase aborts with exit 1
# *before* touching the cluster, naming what is missing, rather than dying
# with "command not found" after some components are already installed.
#
# Globals read: DEPLOY_NAMESPACE.
function bootstrap_cluster() {
  echo "=== BOOTSTRAPPING CLUSTER COMPONENTS ==="

  # Core components, in deployment order.
  local -a components=(
    deploy_cni
    deploy_csi
    deploy_ingress_controller
    deploy_cert_manager
    deploy_metrics_server
    deploy_prometheus_stack
    deploy_efk_logging
    deploy_vault
    deploy_backup_operator
  )

  local component
  local -a missing=()
  for component in "${components[@]}"; do
    if ! command -v "${component}" > /dev/null 2>&1; then
      missing+=("${component}")
    fi
  done
  if (( ${#missing[@]} > 0 )); then
    echo "ERROR: Missing component installers: ${missing[*]}" >&2
    exit 1
  fi

  # Create namespace (client-side dry run piped into apply keeps this
  # idempotent on re-runs)
  kubectl create namespace "${DEPLOY_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

  # Deploy core components
  for component in "${components[@]}"; do
    "${component}"
  done
}
182+
183+
# Installs or upgrades the Cilium CNI release in kube-system via Helm, with
# kube-proxy replacement and Hubble relay/UI enabled.
#
# Globals read: CLUSTER_NAME (used to build the API server host name).
function deploy_cni() {
  echo "--- Deploying CNI (Cilium) ---"
  helm repo add cilium https://helm.cilium.io/
  # `repo add` leaves an already-registered repo's cached index untouched;
  # refresh it so the upgrade below does not resolve against a stale index.
  helm repo update cilium

  # NOTE(review): the chart version is not pinned and newer Cilium charts
  # spell kubeProxyReplacement as true/false; confirm "strict" is accepted
  # by the chart version actually being installed.
  helm upgrade --install cilium cilium/cilium \
    --namespace kube-system \
    --set kubeProxyReplacement=strict \
    --set "k8sServiceHost=api-server.${CLUSTER_NAME}.internal" \
    --set hubble.relay.enabled=true \
    --set hubble.ui.enabled=true
}
193+
194+
# Installs cert-manager v1.12.0 from the upstream release manifest, waits
# for it to become available, then creates the `letsencrypt-prod`
# ClusterIssuer (ACME HTTP-01 through the nginx ingress class).
#
# Globals read: ADMIN_EMAIL (ACME account contact).
function deploy_cert_manager() {
  echo "--- Deploying Cert Manager ---"
  kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.12.0/cert-manager.yaml

  # Wait for readiness. Creating the ClusterIssuer below goes through
  # cert-manager's admission webhook, so waiting on the controller alone is
  # not enough: the apply can be rejected while the webhook is still coming
  # up.
  local deployment
  for deployment in cert-manager cert-manager-cainjector cert-manager-webhook; do
    kubectl wait --for=condition=Available "deployment/${deployment}" \
      -n cert-manager --timeout=300s
  done

  # Create cluster issuer
  cat <<EOF | kubectl apply -f -
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    server: https://acme-v02.api.letsencrypt.org/directory
    email: ${ADMIN_EMAIL}
    privateKeySecretRef:
      name: letsencrypt-prod
    solvers:
    - http01:
        ingress:
          class: nginx
EOF
}
219+
220+
# Installs or upgrades the kube-prometheus-stack chart as release
# `kube-prometheus` in the `monitoring` namespace (created on demand), with
# Alertmanager and Grafana enabled and 30 days of Prometheus retention.
function deploy_prometheus_stack() {
  local -r repo_name="prometheus-community"
  local -r release_name="kube-prometheus"

  # Chart value overrides, passed through to helm unchanged.
  local -a chart_overrides=(
    --set alertmanager.enabled=true
    --set grafana.enabled=true
    --set prometheus.prometheusSpec.retention=30d
  )

  echo "--- Deploying Monitoring Stack ---"
  helm repo add "${repo_name}" https://prometheus-community.github.io/helm-charts
  helm upgrade --install "${release_name}" "${repo_name}/kube-prometheus-stack" \
    --namespace monitoring \
    --create-namespace \
    "${chart_overrides[@]}"
}
230+
231+
# Phase 4: Security Hardening
#
# Generates a self-signed TLS key pair under ROOT_DIR, publishes it as the
# `aelion-tls` secret, applies a default-deny NetworkPolicy and the
# `aelion-admin` ClusterRole, then issues an admin client certificate signed
# by the self-signed pair and registers it as the `admin` kubectl user.
#
# Key material is created owner-only (umask 077 for the duration of the
# function plus an explicit chmod for files that already existed); the
# caller's umask is restored before returning.
#
# Globals read: ROOT_DIR, TLS_VALIDITY_DAYS, ADMIN_CERT_DAYS, DEPLOY_NAMESPACE.
function harden_security() {
  echo "=== APPLYING SECURITY CONFIGURATIONS ==="

  mkdir -p "${ROOT_DIR}"

  local previous_umask
  previous_umask=$(umask)
  umask 077

  # Generate TLS certificates
  openssl req -x509 -newkey rsa:4096 -days "${TLS_VALIDITY_DAYS}" -nodes \
    -keyout "${ROOT_DIR}/tls.key" -out "${ROOT_DIR}/tls.crt" \
    -subj "/CN=aelion.ai/O=Aelion AI"
  # Overwriting an existing file keeps its old mode, so tighten explicitly.
  chmod 600 "${ROOT_DIR}/tls.key"

  # Create secret. Rendering client-side and applying keeps this idempotent;
  # a bare `kubectl create` fails once the secret exists.
  kubectl create secret tls aelion-tls \
    --key="${ROOT_DIR}/tls.key" \
    --cert="${ROOT_DIR}/tls.crt" \
    --namespace="${DEPLOY_NAMESPACE}" \
    --dry-run=client -o yaml | kubectl apply -f -

  # Apply network policies
  # NOTE(review): this denies all ingress and egress for every pod in the
  # namespace; presumably allow rules ship with the application manifests —
  # confirm.
  cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny
  namespace: ${DEPLOY_NAMESPACE}
spec:
  podSelector: {}
  policyTypes:
  - Ingress
  - Egress
EOF

  # Configure RBAC
  kubectl apply -f - <<EOF
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: aelion-admin
rules:
- apiGroups: ["*"]
  resources: ["*"]
  verbs: ["*"]
EOF

  # Create admin certificate
  # NOTE(review): the certificate is signed by the self-signed pair created
  # above, not by the cluster's own CA; verify the API server is configured
  # to trust it before relying on these credentials.
  openssl genrsa -out "${ROOT_DIR}/admin.key" 2048
  chmod 600 "${ROOT_DIR}/admin.key"
  openssl req -new -key "${ROOT_DIR}/admin.key" \
    -out "${ROOT_DIR}/admin.csr" \
    -subj "/CN=admin/O=system:masters"
  openssl x509 -req -in "${ROOT_DIR}/admin.csr" \
    -CA "${ROOT_DIR}/tls.crt" -CAkey "${ROOT_DIR}/tls.key" -CAcreateserial \
    -out "${ROOT_DIR}/admin.crt" -days "${ADMIN_CERT_DAYS}"

  # Configure kubectl context
  kubectl config set-credentials admin \
    --client-certificate="${ROOT_DIR}/admin.crt" \
    --client-key="${ROOT_DIR}/admin.key"

  umask "${previous_umask}"
}
286+
287+
# Phase 5: Application Deployment
#
# Applies the storage manifests, installs PostgreSQL-HA and Kafka from the
# Bitnami chart repository, applies the kustomize overlay for ENVIRONMENT,
# and waits (up to 600s each) for the orchestrator deployment and the agents
# statefulset to roll out.
#
# Globals read: ROOT_DIR, DEPLOY_NAMESPACE, STORAGE_SIZE, ENVIRONMENT.
function deploy_applications() {
  echo "=== DEPLOYING AELION COMPONENTS ==="

  # Create persistent volumes
  kubectl apply -f "${ROOT_DIR}/storage/"

  # The bitnami/* charts below resolve only if the repo is registered;
  # nothing else in this script adds it.
  helm repo add bitnami https://charts.bitnami.com/bitnami
  helm repo update bitnami

  # Deploy databases
  helm upgrade --install postgresql bitnami/postgresql-ha \
    --namespace="${DEPLOY_NAMESPACE}" \
    --values "${ROOT_DIR}/postgresql-values.yaml"

  # Deploy Kafka
  # NOTE(review): confirm `replicas` is a value the installed chart version
  # actually reads; an unknown key passed with --set is silently ignored.
  helm upgrade --install kafka bitnami/kafka \
    --namespace="${DEPLOY_NAMESPACE}" \
    --set replicas=3 \
    --set "persistence.size=${STORAGE_SIZE}"

  # Deploy core services
  kubectl apply -k "${ROOT_DIR}/kustomize/overlays/${ENVIRONMENT}"

  # Wait for readiness
  kubectl rollout status deployment/aelion-orchestrator -n "${DEPLOY_NAMESPACE}" --timeout=600s
  kubectl rollout status statefulset/aelion-agents -n "${DEPLOY_NAMESPACE}" --timeout=600s
}
312+
313+
# Phase 6: Validation & Testing
#
# Checks that each core deployment has rolled out with the expected number
# of ready replicas, probes the public health endpoint, and runs the
# pipeline-test Job to completion. Exits 1 with an "ERROR: ..." line on the
# first replica or health-check failure.
#
# Globals read:    DEPLOY_NAMESPACE, ROOT_DIR.
# Globals written: API_ENDPOINT.
function validate_deployment() {
  echo "=== VALIDATING DEPLOYMENT ==="

  # Verify component status: deployment name -> required ready replicas
  local -A expected_replicas=(
    ["aelion-orchestrator"]=3
    ["aelion-api-gateway"]=2
    ["aelion-metrics-collector"]=2
  )

  local dep ready
  for dep in "${!expected_replicas[@]}"; do
    # Let the rollout settle first; a single read of readyReplicas right
    # after deployment can observe a rollout that is still in progress.
    if ! kubectl rollout status "deployment/${dep}" -n "${DEPLOY_NAMESPACE}" --timeout=600s; then
      echo "ERROR: ${dep} not ready" >&2
      exit 1
    fi

    ready=$(kubectl get "deployment/${dep}" -n "${DEPLOY_NAMESPACE}" -o jsonpath='{.status.readyReplicas}')
    # The query prints nothing when the field is absent; count that as 0.
    if [[ "${ready:-0}" -ne "${expected_replicas[${dep}]}" ]]; then
      echo "ERROR: ${dep} not ready" >&2
      exit 1
    fi
  done

  # Run smoke tests
  # -f turns an HTTP error status into a failure (an error page that merely
  # contains "OK" no longer passes) and --max-time bounds a hung endpoint.
  # The body is captured rather than piped so that grep exiting early cannot
  # fail the pipeline under `pipefail`.
  # NOTE(review): -k still skips TLS verification; drop it once the endpoint
  # serves a certificate the host trusts.
  API_ENDPOINT="https://api.aelion.ai/health"
  local health_body
  if ! health_body=$(curl -fsSk --max-time 30 "${API_ENDPOINT}") \
    || ! grep -q "OK" <<< "${health_body}"; then
    echo "ERROR: API health check failed" >&2
    exit 1
  fi

  # Validate data pipeline
  # Remove any Job left by a previous run first: otherwise the wait below is
  # satisfied immediately by the old, already-complete Job.
  kubectl delete job pipeline-test -n "${DEPLOY_NAMESPACE}" --ignore-not-found
  kubectl apply -f "${ROOT_DIR}/tests/pipeline-test.yaml"
  kubectl wait --for=condition=complete job/pipeline-test -n "${DEPLOY_NAMESPACE}" --timeout=300s
}
343+
344+
# Phase 7: Backup Configuration
#
# Installs or upgrades Velero from the vmware-tanzu chart repository with a
# daily 02:00 schedule (720h TTL), then takes a one-off backup of
# DEPLOY_NAMESPACE named `initial-deployment` unless one already exists.
# Exits 1 when the velero CLI is not installed.
#
# Globals read: DEPLOY_NAMESPACE.
function configure_backups() {
  echo "=== CONFIGURING BACKUP SYSTEMS ==="

  # The velero CLI is needed for the initial backup below; fail before
  # installing anything rather than half-way through the phase.
  if ! command -v velero &> /dev/null; then
    echo "ERROR: velero not found" >&2
    exit 1
  fi

  # The vmware-tanzu/velero chart resolves only if the repo is registered;
  # nothing else in this script adds it.
  helm repo add vmware-tanzu https://vmware-tanzu.github.io/helm-charts
  helm repo update vmware-tanzu

  # Create backup schedule
  # The --set paths are quoted because an unquoted "[0]" is a glob pattern
  # and would be rewritten if a matching file name existed in the cwd.
  # NOTE(review): the storage location is fixed to aws regardless of
  # CLOUD_PROVIDER — confirm that is intended.
  helm upgrade --install velero vmware-tanzu/velero \
    --namespace velero \
    --create-namespace \
    --set 'configuration.backupStorageLocation[0].name=aws' \
    --set 'configuration.backupStorageLocation[0].provider=aws' \
    --set 'configuration.backupStorageLocation[0].bucket=aelion-backups' \
    --set schedules.daily.schedule="0 2 * * *" \
    --set schedules.daily.ttl="720h"

  # Initial backup. Creating a backup whose name is already taken fails, so
  # skip it on re-runs.
  if velero backup get initial-deployment &> /dev/null; then
    echo "Initial backup already exists; skipping"
  else
    velero backup create initial-deployment "--include-namespaces=${DEPLOY_NAMESPACE}"
  fi
}
361+
362+
# Main Execution
#
# Runs the seven phases in their fixed order, then prints the completion
# banner, the dashboard URL and the location of the admin certificate.
#
# Globals read: ROOT_DIR.
function main() {
  local -a phases=(
    preflight_checks
    provision_infrastructure
    bootstrap_cluster
    harden_security
    deploy_applications
    validate_deployment
    configure_backups
  )

  local phase
  for phase in "${phases[@]}"; do
    "${phase}"
  done

  printf '%s\n' \
    "=== CLUSTER INITIALIZATION COMPLETE ===" \
    "Dashboard URL: https://dashboard.aelion.ai" \
    "Admin credentials stored in: ${ROOT_DIR}/admin.crt"
}
376+
377+
# Execute with error trapping
#
# An ERR trap is not inherited by shell functions, command substitutions or
# subshells unless errtrace is on. Every phase runs inside a function, so
# without it `set -e` ends the script before this trap ever gets to report
# the failing line.
set -o errtrace
trap 'echo "ERROR at line ${LINENO}" >&2; exit 1' ERR
main "$@"

0 commit comments

Comments
 (0)