2023-01-19 04:41:07 -05:00
# Display name and one-line summary of this action.
name: Constellation recover
description: 'Recover a Constellation cluster with an unavailable control plane.'
# Parameters supplied by the calling workflow. Each input is nested under
# `inputs` so that its `description`/`required` keys belong to that input
# instead of colliding as duplicate top-level keys.
inputs:
  # Node count that the recover step compares against the number of pushed
  # recovery keys to decide when a quorum has been reached.
  controlNodesCount:
    description: "The amount of control plane nodes in the cluster."
    required: true
  # Exported as KUBECONFIG for the steps that call kubectl.
  kubeconfig:
    description: "The kubeconfig for the cluster."
    required: true
# Composite action body. The steps run in order:
#   1. reboot one worker node while the join-service is disabled,
#   2. reboot every control plane node,
#   3. call `constellation recover` until a quorum of recovery keys is pushed,
#   4. wait until all nodes report Ready again.
runs:
  using: "composite"
  steps:
    - name: Restart worker node
      shell: bash
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      run: |
        # Pick the first node that does not carry the control-plane role label.
        WORKER_NODE=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json | jq '.items[0].metadata.name' -r)
        # `jq -r` prints the literal string "null" when the list is empty.
        # Abort before the join-service is patched so it is never left
        # disabled because of a bogus node name.
        if [[ -z "${WORKER_NODE}" || "${WORKER_NODE}" == "null" ]]; then
          echo "No worker node found to restart."
          exit 1
        fi
        echo "Disabling the join-service and waiting for the node to be unresponsive"
        # Adds a nodeSelector for "some-tag" to the DaemonSet; presumably no
        # node carries that label, so its pods are no longer scheduled.
        kubectl patch daemonset -n kube-system join-service -p '{"spec":{"template":{"spec":{"nodeSelector":{"some-tag":""}}}}}'
        # Reboot the host from a debug pod by chroot-ing into /host.
        kubectl debug "node/${WORKER_NODE}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        kubectl wait --for=condition=Ready=Unknown --timeout=10m "node/${WORKER_NODE}"
        echo "Re-enabling the join-service and waiting for the node to be back up"
        # Remove the selector added above so the DaemonSet can schedule again.
        kubectl patch daemonset -n kube-system join-service --type=json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/some-tag"}]'
        kubectl wait --for=condition=Ready=true --timeout=10m --all nodes

    - name: Restart all control plane nodes
      shell: bash
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      run: |
        # One node name per line; the unquoted expansion in the `for` list is
        # intentional so that each name becomes its own loop item.
        CONTROL_PLANE_NODES=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json | jq '.items[].metadata.name' -r)
        for CONTROL_PLANE_NODE in ${CONTROL_PLANE_NODES}; do
          kubectl debug "node/${CONTROL_PLANE_NODE}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        done

    - name: Constellation recover
      shell: bash
      env:
        # Passed through the environment instead of being interpolated into
        # the script text, so the input value is never parsed as shell code.
        CONTROL_NODES_COUNT: ${{ inputs.controlNodesCount }}
      run: |
        timeout=600
        start_time=$(date +%s)
        recovered=0
        while true; do
          # NOTE(review): under GitHub's default `bash -e`, a non-zero exit of
          # `constellation recover` aborts the step here - confirm intended.
          output=$(constellation recover)
          if echo "$output" | grep -q "Pushed recovery key."; then
            echo "$output"
            # Count every occurrence of the success message in this attempt
            # and add it to the running total; sed strips padding from wc.
            i=$(echo "$output" | grep -o "Pushed recovery key." | wc -l | sed 's/ //g')
            recovered=$((recovered+i))
            # Quorum: strictly more than half (integer division) of the
            # control plane nodes.
            if (( recovered > CONTROL_NODES_COUNT / 2 )); then
              break
            fi
          fi
          current_time=$(date +%s)
          if ((current_time - start_time > timeout)); then
            echo "Control plane recovery timed out after $timeout seconds."
            exit 1
          fi
          echo "Did not recover a quorum (>${CONTROL_NODES_COUNT}/2) of control-plane nodes yet, retrying in 5 seconds [$recovered/${CONTROL_NODES_COUNT}]"
          sleep 5
        done

    - name: Wait for control plane to get back up
      shell: bash
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      run: |
        timeout=600
        start_time=$(date +%s)
        while true; do
          # Decide on kubectl's exit status rather than grepping stdout:
          # "condition met" is printed per node, so a partial match could
          # report success while other nodes still fail the wait. A command
          # used as an `if` condition does not trigger `set -e`.
          if output=$(kubectl wait --for=condition=Ready --timeout=10m --all nodes); then
            echo "$output"
            exit 0
          fi
          current_time=$(date +%s)
          if ((current_time - start_time > timeout)); then
            echo "Waiting for control plane to get back up timed out after $timeout seconds."
            exit 1
          fi
          echo "Cannot reach control plane, retrying in 10 seconds"
          sleep 10
        done