# constellation/.github/actions/e2e_recover/action.yml

name: Constellation recover
description: 'Recover a Constellation cluster with an unavailable control plane.'

inputs:
  # Compared against the number of "Pushed recovery key." messages counted by
  # the recover step to decide when recovery is complete.
  controlNodesCount:
    description: 'The amount of control plane nodes in the cluster.'
    required: true

  # Exported as KUBECONFIG for every step of this action.
  kubeconfig:
    description: 'The kubeconfig for the cluster.'
    required: true

  # Passed to `constellation recover` as its `--master-secret` argument.
  masterSecret:
    description: 'The master-secret for the cluster.'
    required: true

  # NOTE(review): not referenced by any step visible in this file.
  cloudProvider:
    description: 'Which cloud provider to use.'
    required: true

  # NOTE(review): not referenced by any step visible in this file.
  gcpProject:
    description: 'The GCP project Constellation is deployed in.'
    required: false

  # NOTE(review): not referenced by any step visible in this file.
  resourceGroup:
    description: 'The Azure resource group Constellation is deployed in.'
    required: false
runs:
  using: "composite"
  steps:
    # Step 1: reboot a single worker node and wait for the cluster to settle.
    - name: Restart worker node
      shell: bash
      run: |
        # Pick the first node without the control-plane role label.
        # `// empty` makes jq print nothing (rather than the string "null")
        # when there is no such node, so the guard below can catch it.
        WORKER_NODE=$(
          kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json |
            jq -r '.items[0].metadata.name // empty'
        )
        if [[ -z "${WORKER_NODE}" ]]; then
          echo "No worker node found to restart."
          exit 1
        fi
        # `kubectl debug node/...` mounts the node's root filesystem at /host;
        # running `reboot` inside that chroot reboots the node itself.
        kubectl debug "node/${WORKER_NODE}" --image=ubuntu -- \
          bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        # First wait for the node to drop out of Ready, then for every node to
        # report Ready again.
        kubectl wait --for=condition=Ready=false --timeout=10m "node/${WORKER_NODE}"
        kubectl wait --for=condition=Ready=true --timeout=10m --all nodes
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}

    # Step 2: reboot every control-plane node. There is no readiness wait here;
    # presumably the API server is unreachable once they are all down, and the
    # next step performs the recovery.
    - name: Restart all control plane nodes
      shell: bash
      run: |
        CONTROL_PLANE_NODES=$(
          kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json |
            jq -r '.items[].metadata.name'
        )
        # Fail fast instead of silently skipping the loop and letting the
        # recover step run into its timeout.
        if [[ -z "${CONTROL_PLANE_NODES}" ]]; then
          echo "No control plane nodes found to restart."
          exit 1
        fi
        # Node names contain no whitespace, so word splitting yields one name
        # per iteration.
        for CONTROL_PLANE_NODE in ${CONTROL_PLANE_NODES}; do
          kubectl debug "node/${CONTROL_PLANE_NODE}" --image=ubuntu -- \
            bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
        done
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}

    # Step 3: call `constellation recover` until a recovery key was pushed once
    # per control-plane node (or 600 seconds elapsed), then wait for all nodes
    # to become Ready.
    - name: Constellation recover
      shell: bash
      run: |
        # Inputs arrive through the environment instead of being interpolated
        # into the script text, so their content is never parsed as shell code.
        if ! [[ "${CONTROL_NODES_COUNT}" =~ ^[1-9][0-9]*$ ]]; then
          echo "controlNodesCount must be a positive integer, got '${CONTROL_NODES_COUNT}'."
          exit 1
        fi

        timeout=600
        start_time=$(date +%s)
        recovered=0
        while true; do
          # `shell: bash` runs with errexit; tolerate a failed attempt so that
          # the retry loop and its timeout decide when to give up.
          output=$(constellation recover --master-secret="${MASTER_SECRET}") ||
            echo "constellation recover exited with an error."

          # Count the keys pushed by this attempt. The here-string avoids a
          # SIGPIPE-induced pipefail false negative, and `|| true` keeps a
          # zero-match grep from aborting the step.
          pushed=$(grep -oF "Pushed recovery key." <<< "${output}" | wc -l || true)
          if ((pushed > 0)); then
            echo "${output}"
            recovered=$((recovered + pushed))
            # `>=` rather than `==`: an overshoot must not spin until timeout.
            if ((recovered >= CONTROL_NODES_COUNT)); then
              # Leave the loop (not the script) so the readiness check below
              # actually runs.
              break
            fi
          fi

          current_time=$(date +%s)
          if ((current_time - start_time > timeout)); then
            echo "Control plane recovery timed out after $timeout seconds."
            exit 1
          fi
          echo "Did not recover all nodes yet, retrying in 5 seconds [$recovered/${CONTROL_NODES_COUNT}]"
          sleep 5
        done

        kubectl wait --for=condition=Ready --timeout=10m --all nodes
      env:
        KUBECONFIG: ${{ inputs.kubeconfig }}
        MASTER_SECRET: ${{ inputs.masterSecret }}
        CONTROL_NODES_COUNT: ${{ inputs.controlNodesCount }}