2023-01-19 04:41:07 -05:00
|
|
|
name: Constellation recover
|
|
|
|
description: "Recover a Constellation cluster with an unavailable control plane."
|
|
|
|
|
|
|
|
inputs:
|
|
|
|
controlNodesCount:
|
|
|
|
description: "The number of control plane nodes in the cluster."
|
|
|
|
required: true
|
|
|
|
kubeconfig:
|
|
|
|
description: "The kubeconfig for the cluster."
|
|
|
|
required: true
|
|
|
|
|
|
|
|
runs:
|
|
|
|
using: "composite"
|
|
|
|
steps:
|
|
|
|
- name: Restart worker node
  shell: bash
  run: |
    # Reboot one worker node and wait until the whole cluster reports Ready again.
    #
    # Select the first node that does NOT carry the control-plane role label.
    # "// empty" makes jq print nothing (rather than the literal string "null")
    # when the selector matches no node, so the guard below can catch it.
    worker_node=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json | jq -r '.items[0].metadata.name // empty')
    if [[ -z "${worker_node}" ]]; then
      echo "No worker node found to restart." >&2
      exit 1
    fi

    # Start a debug container on the node; it writes "reboot" into reboot.sh and
    # feeds that file to `chroot /host` on stdin.
    kubectl debug "node/${worker_node}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"

    # First wait for the node to report Ready=false, then for every node to be Ready.
    kubectl wait --for=condition=Ready=false --timeout=10m "node/${worker_node}"
    kubectl wait --for=condition=Ready=true --timeout=10m --all nodes
  env:
    KUBECONFIG: ${{ inputs.kubeconfig }}
|
|
|
|
- name: Restart all control plane nodes
  env:
    KUBECONFIG: ${{ inputs.kubeconfig }}
  shell: bash
  run: |
    # Reboot every node that carries the control-plane role label.
    #
    # jq prints one node name per line; the list is kept in a plain string.
    node_names=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json | jq -r '.items[].metadata.name')

    # The unquoted expansion is intentional: word splitting yields one
    # iteration per node name.
    for node_name in ${node_names}; do
      # The debug container writes "reboot" into reboot.sh and feeds that file
      # to `chroot /host` on stdin.
      kubectl debug node/${node_name} --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
    done
|
|
|
|
- name: Constellation recover
  shell: bash
  env:
    # Pass the input through the environment instead of interpolating the
    # expression into the script text, so its value is never parsed as shell code.
    CONTROL_NODES_COUNT: ${{ inputs.controlNodesCount }}
  run: |
    # Repeatedly run `constellation recover --force` until a recovery key has
    # been pushed to at least CONTROL_NODES_COUNT nodes, or fail after $timeout seconds.

    # The count is used in arithmetic below; reject anything that is not a
    # plain non-negative decimal integer.
    if ! [[ "${CONTROL_NODES_COUNT}" =~ ^(0|[1-9][0-9]*)$ ]]; then
      echo "controlNodesCount must be a non-negative integer, got '${CONTROL_NODES_COUNT}'." >&2
      exit 1
    fi

    timeout=600
    start_time=$(date +%s)
    recovered=0
    while true; do
      output=$(constellation recover --force)

      # Here-strings are used instead of `echo ... | grep -q` so an early grep
      # exit cannot surface as a pipeline failure under `pipefail`.
      if grep -qF "Pushed recovery key." <<< "$output"; then
        echo "$output"
        # Count every occurrence of the success message in this attempt.
        pushed=$(grep -oF "Pushed recovery key." <<< "$output" | wc -l)
        recovered=$((recovered + pushed))
        # ">=" rather than "==": the counter accumulates across attempts and
        # must not spin until the timeout if it overshoots the expected count.
        if (( recovered >= CONTROL_NODES_COUNT )); then
          break
        fi
      fi

      current_time=$(date +%s)
      if ((current_time - start_time > timeout)); then
        echo "Control plane recovery timed out after $timeout seconds."
        exit 1
      fi

      echo "Did not recover all nodes yet, retrying in 5 seconds [$recovered/${CONTROL_NODES_COUNT}]"
      sleep 5
    done
|
2023-01-27 09:53:53 -05:00
|
|
|
- name: Wait for control plane to get back up
  shell: bash
  run: |
    # Poll until `kubectl wait` reports every node Ready, or fail once $timeout
    # seconds have elapsed since the step started.
    timeout=600
    start_time=$(date +%s)
    while true; do
      # Decide on the exit status of `kubectl wait`, not on its output: the text
      # "condition met" is printed per node, so it can appear even when the
      # command failed for other nodes. Running it as the `if` condition also
      # keeps a failure (e.g. API server unreachable) from aborting the step.
      if output=$(kubectl wait --for=condition=Ready --timeout=10m --all nodes); then
        echo "$output"
        exit 0
      fi

      current_time=$(date +%s)
      if ((current_time - start_time > timeout)); then
        echo "Waiting for control plane to get back up timed out after $timeout seconds."
        exit 1
      fi

      echo "Cannot reach control plane, retrying in 10 seconds"
      sleep 10
    done
  env:
    KUBECONFIG: ${{ inputs.kubeconfig }}
|