# Patch for the e2e_recover composite action. Lines before the first
# "diff --git" header are free-form commentary, not part of the patch.
#
# What this patch changes:
#   * Moves each step's `env: KUBECONFIG` mapping above its `run:` script
#     and adds a blank line between steps.
#   * "Constellation recover" stops retrying once the number of
#     "Pushed recovery key." messages exceeds controlNodesCount/2 instead
#     of requiring it to equal controlNodesCount. Inside bash `[[ ]]` the
#     operands of -gt are evaluated as arithmetic, so N/2 is integer
#     division (3/2 -> 1, i.e. 2 of 3 nodes satisfies the check).
#   * Updates the retry message to mention the quorum condition.
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy.
# Hunk line counts were checked against the @@ headers, but leading
# indentation and the position of the blank context line in the third
# hunk are reconstructed - confirm against the target file before applying.
diff --git a/.github/actions/e2e_recover/action.yml b/.github/actions/e2e_recover/action.yml
index cac7a9529..91bbfb852 100644
--- a/.github/actions/e2e_recover/action.yml
+++ b/.github/actions/e2e_recover/action.yml
@@ -14,22 +14,24 @@ runs:
   steps:
     - name: Restart worker node
       shell: bash
+      env:
+        KUBECONFIG: ${{ inputs.kubeconfig }}
       run: |
         WORKER_NODE=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json | jq '.items[0].metadata.name' -r)
         kubectl debug node/$WORKER_NODE --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
         kubectl wait --for=condition=Ready=false --timeout=10m node/$WORKER_NODE
         kubectl wait --for=condition=Ready=true --timeout=10m --all nodes
-      env:
-        KUBECONFIG: ${{ inputs.kubeconfig }}
+
     - name: Restart all control plane nodes
       shell: bash
+      env:
+        KUBECONFIG: ${{ inputs.kubeconfig }}
       run: |
         CONTROL_PLANE_NODES=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json | jq '.items[].metadata.name' -r)
         for CONTROL_PLANE_NODE in ${CONTROL_PLANE_NODES}; do
           kubectl debug node/$CONTROL_PLANE_NODE --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
         done
-      env:
-        KUBECONFIG: ${{ inputs.kubeconfig }}
+
     - name: Constellation recover
       shell: bash
       run: |
@@ -42,7 +44,7 @@ runs:
             echo "$output"
             i=$(echo "$output" | grep -o "Pushed recovery key." | wc -l | sed 's/ //g')
             recovered=$((recovered+i))
-            if [[ $recovered -eq ${{ inputs.controlNodesCount }} ]]; then
+            if [[ $recovered -gt ${{ inputs.controlNodesCount }}/2 ]]; then
               break
             fi
           fi
@@ -53,11 +55,14 @@ runs:
             exit 1
           fi
 
-          echo "Did not recover all nodes yet, retrying in 5 seconds [$recovered/${{ inputs.controlNodesCount }}]"
+          echo "Did not recover a quorum (>${{inputs.controlNodesCount}}/2) of control-plane nodes yet, retrying in 5 seconds [$recovered/${{ inputs.controlNodesCount }}]"
           sleep 5
         done
+
     - name: Wait for control plane to get back up
       shell: bash
+      env:
+        KUBECONFIG: ${{ inputs.kubeconfig }}
       run: |
         timeout=600
         start_time=$(date +%s)
@@ -77,5 +82,3 @@ runs:
           echo "Cannot reach control plane, retrying in 10 seconds"
           sleep 10
         done
-      env:
-        KUBECONFIG: ${{ inputs.kubeconfig }}