mirror of
https://github.com/edgelesssys/constellation.git
synced 2025-01-03 11:50:57 -05:00
f97d351ad2
In CI, most configs use prerelease images, which config validation rejects. Therefore, we need to use the force flag for now.
85 lines
3.1 KiB
YAML
85 lines
3.1 KiB
YAML
# Composite action that reboots cluster nodes and then drives
# `constellation recover` until the control plane is back.
name: Constellation recover
description: 'Recover a Constellation cluster with an unavailable control plane.'

inputs:
  # Compared against the number of "Pushed recovery key." messages printed by
  # `constellation recover` to decide when recovery is complete.
  controlNodesCount:
    description: 'The amount of control plane nodes in the cluster.'
    required: true
  # Exported as KUBECONFIG for the steps that call kubectl.
  kubeconfig:
    description: 'The kubeconfig for the cluster.'
    required: true
  # Forwarded to `constellation recover --master-secret=...`.
  masterSecret:
    description: 'The master-secret for the cluster.'
    required: true

runs:
  using: composite
  steps:
|
|
# Entry of `runs.steps`: reboots the first non-control-plane node, waits for
# it to report Ready=false, then waits until every node is Ready again.
- name: Restart worker node
  shell: bash
  run: |
    # Select the first node without the control-plane role label.
    # `// empty` makes jq print nothing (instead of "null") when no such node exists.
    WORKER_NODE=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json | jq -r '.items[0].metadata.name // empty')
    if [[ -z "${WORKER_NODE}" ]]; then
      echo "No worker node found to restart."
      exit 1
    fi
    # Writes "reboot" to a script file in the debug container and feeds it to a
    # shell chroot-ed into /host.
    kubectl debug "node/${WORKER_NODE}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
    kubectl wait --for=condition=Ready=false --timeout=10m "node/${WORKER_NODE}"
    kubectl wait --for=condition=Ready=true --timeout=10m --all nodes
  env:
    KUBECONFIG: ${{ inputs.kubeconfig }}
|
|
# Entry of `runs.steps`: issues the same chroot-based reboot as the worker
# step against every node carrying the control-plane role label. This step
# does not wait for the nodes afterwards.
- name: Restart all control plane nodes
  shell: bash
  run: |
    # Newline-separated node names; the unquoted expansion in the loop below
    # relies on word splitting to iterate over them.
    node_names=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json | jq -r '.items[].metadata.name')
    for node_name in ${node_names}; do
      kubectl debug "node/${node_name}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
    done
  env:
    KUBECONFIG: ${{ inputs.kubeconfig }}
|
|
# Entry of `runs.steps`: runs `constellation recover` every 5 seconds until
# the expected number of "Pushed recovery key." messages has been seen, and
# fails once more than 600 seconds have elapsed.
# The action inputs are handed over as environment variables instead of being
# templated into the script text, so their values are never parsed as shell
# syntax.
- name: Constellation recover
  shell: bash
  run: |
    timeout=600
    start_time=$(date +%s)
    recovered=0
    while true; do
      output=$(constellation recover --master-secret="${MASTER_SECRET}" --force)
      # -F: match the message literally; the trailing "." is not a wildcard.
      if grep -qF "Pushed recovery key." <<<"${output}"; then
        echo "$output"
        # Count how often the message occurs in this attempt's output.
        pushed=$(grep -oF "Pushed recovery key." <<<"${output}" | wc -l | tr -d '[:space:]')
        recovered=$((recovered + pushed))
        # -ge rather than -eq: the counter accumulates across attempts, so an
        # overshoot must end the loop instead of spinning until the timeout.
        if [[ "${recovered}" -ge "${CONTROL_NODES_COUNT}" ]]; then
          break
        fi
      fi

      current_time=$(date +%s)
      if ((current_time - start_time > timeout)); then
        echo "Control plane recovery timed out after $timeout seconds."
        exit 1
      fi

      echo "Did not recover all nodes yet, retrying in 5 seconds [$recovered/${CONTROL_NODES_COUNT}]"
      sleep 5
    done
  env:
    MASTER_SECRET: ${{ inputs.masterSecret }}
    CONTROL_NODES_COUNT: ${{ inputs.controlNodesCount }}
|
|
# Entry of `runs.steps`: polls `kubectl wait` every 10 seconds until all nodes
# report Ready, and fails once more than 600 seconds have elapsed.
- name: Wait for control plane to get back up
  shell: bash
  run: |
    timeout=600
    start_time=$(date +%s)
    while true; do
      # Success requires kubectl to exit with status 0 in addition to printing
      # "condition met"; matching the message alone could also accept output
      # that was produced for only some of the nodes before a failure.
      # Running the command as an `if` condition keeps a failed attempt from
      # aborting the script, so the loop can retry.
      if output=$(kubectl wait --for=condition=Ready --timeout=10m --all nodes) \
        && grep -q "condition met" <<<"${output}"; then
        echo "$output"
        exit 0
      fi

      current_time=$(date +%s)
      if ((current_time - start_time > timeout)); then
        echo "Waiting for control plane to get back up timed out after $timeout seconds."
        exit 1
      fi

      echo "Cannot reach control plane, retrying in 10 seconds"
      sleep 10
    done
  env:
    KUBECONFIG: ${{ inputs.kubeconfig }}
|