# Composite action that simulates a control-plane outage and recovers from it:
#   1. reboot one worker node while the join-service is disabled,
#   2. reboot every control plane node,
#   3. run `constellation recover` until a quorum of control plane nodes
#      has received the recovery key,
#   4. wait for all nodes to report Ready again.
name: Constellation recover
description: "Recover a Constellation cluster with an unavailable control plane."

inputs:
  controlNodesCount:
    description: "The amount of control plane nodes in the cluster."
    required: true
  kubeconfig:
    description: "The kubeconfig for the cluster."
    required: true

runs:
  using: "composite"
  steps:
    - name: Restart worker node
      shell: bash
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      run: |
        # Pick the first node that does not carry the control-plane role label.
        WORKER_NODE=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json | jq '.items[0].metadata.name' -r)
        # `jq -r` prints "null" when there is no such node. Bail out before
        # touching the join-service so the daemonset is not left disabled.
        if [[ -z "${WORKER_NODE}" || "${WORKER_NODE}" == "null" ]]; then
          echo "No worker node found to restart."
          exit 1
        fi

        echo "Disabling the join-service and waiting for the node to be unresponsive"
        # A nodeSelector on a throwaway label ("some-tag") is added to the
        # join-service daemonset; it is removed again further down.
        kubectl patch daemonset -n kube-system join-service -p '{"spec":{"template":{"spec":{"nodeSelector":{"some-tag":""}}}}}'
        # Reboot the node from a debug pod by feeding `reboot` to a shell
        # chrooted into the host filesystem.
        kubectl debug "node/${WORKER_NODE}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        kubectl wait --for=condition=Ready=Unknown --timeout=10m "node/${WORKER_NODE}"

        echo "Re-enabling the join-service and waiting for the node to be back up"
        kubectl patch daemonset -n kube-system join-service --type=json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/some-tag"}]'
        kubectl wait --for=condition=Ready=true --timeout=10m --all nodes

    - name: Restart all control plane nodes
      shell: bash
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      run: |
        CONTROL_PLANE_NODES=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json | jq '.items[].metadata.name' -r)
        # Intentionally unquoted: the list is split on whitespace, one node
        # name per iteration.
        for CONTROL_PLANE_NODE in ${CONTROL_PLANE_NODES}; do
          kubectl debug "node/${CONTROL_PLANE_NODE}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        done

    - name: Constellation recover
      shell: bash
      env:
        # Passed through the environment instead of being interpolated into
        # the script text, so the input value cannot inject shell code.
        CONTROL_NODES_COUNT: ${{ inputs.controlNodesCount }}
      run: |
        # The value is used in shell arithmetic below; reject anything that
        # is not a plain non-negative integer.
        if ! [[ "${CONTROL_NODES_COUNT}" =~ ^[0-9]+$ ]]; then
          echo "controlNodesCount must be a non-negative integer, got '${CONTROL_NODES_COUNT}'."
          exit 1
        fi

        timeout=600
        start_time=$(date +%s)
        recovered=0
        while true; do
          # NOTE(review): if this step runs with bash's errexit enabled, a
          # non-zero exit of `constellation recover` aborts the step instead
          # of retrying - confirm that this is intended.
          output=$(constellation recover)
          if echo "$output" | grep -q "Pushed recovery key."; then
            echo "$output"
            # Count every occurrence of the success message in this attempt
            # (-F: match the text literally, including the trailing dot).
            i=$(echo "$output" | grep -oF "Pushed recovery key." | wc -l | sed 's/ //g')
            recovered=$((recovered+i))
            # Quorum: strictly more than half (integer division) of the
            # control plane nodes.
            if (( recovered > CONTROL_NODES_COUNT / 2 )); then
              break
            fi
          fi

          current_time=$(date +%s)
          if ((current_time - start_time > timeout)); then
            echo "Control plane recovery timed out after $timeout seconds."
            exit 1
          fi

          echo "Did not recover a quorum (>${CONTROL_NODES_COUNT}/2) of control-plane nodes yet, retrying in 5 seconds [$recovered/${CONTROL_NODES_COUNT}]"
          sleep 5
        done

    - name: Wait for control plane to get back up
      shell: bash
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      run: |
        timeout=600
        start_time=$(date +%s)
        while true; do
          # Use the exit status rather than grepping for "condition met":
          # that message is printed once per node, so a partial success
          # (some nodes Ready, others timed out) would otherwise pass.
          if output=$(kubectl wait --for=condition=Ready --timeout=10m --all nodes); then
            echo "$output"
            exit 0
          fi

          current_time=$(date +%s)
          if ((current_time - start_time > timeout)); then
            echo "Waiting for control plane to get back up timed out after $timeout seconds."
            exit 1
          fi

          echo "Cannot reach control plane, retrying in 10 seconds"
          sleep 10
        done