# constellation/.github/actions/e2e_recover/action.yml

# Metadata of the action: its display name and a one-line summary.
name: "Constellation recover"
description: >-
  Recover a Constellation cluster with an unavailable control plane.

# Values the caller has to provide; every input is required.
inputs:
  # Compared against the number of "Pushed recovery key." messages in the recover step.
  controlNodesCount:
    required: true
    description: "The amount of control plane nodes in the cluster."
  # Exported as KUBECONFIG for the steps that call kubectl.
  kubeconfig:
    required: true
    description: "The kubeconfig for the cluster."
  # Handed to `constellation recover` through its --master-secret flag.
  masterSecret:
    required: true
    description: "The master-secret for the cluster."

# Composite action: the steps below are executed in order.
runs:
  using: "composite"
  steps:
    # Reboot a single worker node, then wait until all nodes report Ready again.
    - name: Restart worker node
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      shell: bash
      run: |
        # Name of the first node that does not carry the control-plane role label.
        worker=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' --output=json | jq --raw-output '.items[0].metadata.name')
        # Debug pod on that node: writes "reboot" into a script and feeds it to a shell chrooted into /host.
        kubectl debug "node/${worker}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        # First wait for the node's Ready condition to become false, then for every node to be Ready.
        kubectl wait --for=condition=Ready=false --timeout=10m "node/${worker}"
        kubectl wait --for=condition=Ready=true --timeout=10m --all nodes
    # Issue a reboot on every control plane node; this step does not wait for them to come back.
    - name: Restart all control plane nodes
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
      shell: bash
      run: |
        # Names of all nodes carrying the control-plane role label, one per line.
        control_planes=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' --output=json | jq --raw-output '.items[].metadata.name')
        # Left unquoted on purpose: word splitting yields one loop iteration per node name.
        for node in ${control_planes}; do
          # Debug pod on the node: writes "reboot" into a script and feeds it to a shell chrooted into /host.
          kubectl debug "node/${node}" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        done
    # Run `constellation recover` in a loop until a recovery key has been pushed
    # to as many nodes as `controlNodesCount`, or fail after `timeout` seconds.
    - name: Constellation recover
      shell: bash
      run: |
        timeout=600
        start_time=$(date +%s)
        recovered=0
        while true; do
            # NOTE(review): if the runner starts bash with errexit, a non-zero exit of
            # `constellation recover` aborts the step here instead of retrying - confirm this is intended.
            output=$(constellation recover --master-secret="${MASTER_SECRET}" --force)
            # Here-strings instead of `echo | grep -q`: with pipefail, grep -q closing
            # the pipe early can make the pipeline fail although the text matched.
            if grep -q "Pushed recovery key." <<< "$output"; then
                echo "$output"
                # Count every occurrence; one invocation may push keys to several nodes.
                pushed=$(grep -o "Pushed recovery key." <<< "$output" | wc -l)
                recovered=$((recovered + pushed))
                # -ge rather than -eq: the counter can grow by more than one per round,
                # so it may step past the expected count without ever being equal to it.
                if [[ $recovered -ge $CONTROL_NODES_COUNT ]]; then
                    break
                fi
            fi

            current_time=$(date +%s)
            if ((current_time - start_time > timeout)); then
                echo "Control plane recovery timed out after $timeout seconds."
                exit 1
            fi

            echo "Did not recover all nodes yet, retrying in 5 seconds [$recovered/$CONTROL_NODES_COUNT]"
            sleep 5
        done
      env:
        # Inputs are passed through the environment rather than expanded into the
        # script text, so their content cannot be interpreted as shell code.
        MASTER_SECRET: ${{ inputs.masterSecret }}
        CONTROL_NODES_COUNT: ${{ inputs.controlNodesCount }}
    # Poll until `kubectl wait` reports that every node is Ready, or fail once
    # `timeout` seconds have passed since the step started.
    - name: Wait for control plane to get back up
      shell: bash
      run: |
        timeout=600
        start_time=$(date +%s)
        while true; do
          # Decide on the exit status of `kubectl wait`, not on its output: it prints
          # "condition met" per node, so matching that text would also succeed when
          # only some of the nodes became Ready before the wait failed.
          # Running the command as the `if` condition keeps a failure from aborting the script.
          if output=$(kubectl wait --for=condition=Ready --timeout=10m --all nodes); then
              echo "$output"
              exit 0
          fi

          # The deadline is only checked between attempts; a single attempt can
          # itself block for up to its own --timeout.
          current_time=$(date +%s)
          if ((current_time - start_time > timeout)); then
              echo "Waiting for control plane to get back up timed out after $timeout seconds."
              exit 1
          fi

          echo "Cannot reach control plane, retrying in 10 seconds"
          sleep 10
        done
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}