diff --git a/.github/actions/constellation_create/action.yml b/.github/actions/constellation_create/action.yml
index a4c530c94..7e149fc18 100644
--- a/.github/actions/constellation_create/action.yml
+++ b/.github/actions/constellation_create/action.yml
@@ -217,31 +217,9 @@ runs:
       env:
         KUBECONFIG: "${{ steps.get-kubeconfig.outputs.KUBECONFIG }}"
         JOINTIMEOUT: "1200" # 20 minutes timeout for all nodes to join
-      run: |
-        echo "::group::Wait for nodes"
-        NODES_COUNT=$((${{ inputs.controlNodesCount }} + ${{ inputs.workerNodesCount }}))
-        JOINWAIT=0
-        until [[ "$(kubectl get nodes -o json | jq '.items | length')" == "${NODES_COUNT}" ]] || [[ $JOINWAIT -gt $JOINTIMEOUT ]];
-        do
-          echo "$(kubectl get nodes -o json | jq '.items | length')/"${NODES_COUNT}" nodes have joined.. waiting.."
-          JOINWAIT=$((JOINWAIT+30))
-          sleep 30
-        done
-        if [[ $JOINWAIT -gt $JOINTIMEOUT ]]; then
-          kubectl get nodes -o wide
-          echo "::error::Timed out waiting for nodes to join"
-          echo "::endgroup::"
-          exit 1
-        fi
-        echo "$(kubectl get nodes -o json | jq '.items | length')/"${NODES_COUNT}" nodes have joined"
-        if ! kubectl wait --for=condition=ready --all nodes --timeout=20m; then
-          kubectl get pods -n kube-system
-          kubectl get events -n kube-system
-          echo "::error::kubectl wait timed out before all nodes became ready"
-          echo "::endgroup::"
-          exit 1
-        fi
-        echo "::endgroup::"
+        CONTROL_NODES_COUNT: "${{ inputs.controlNodesCount }}"
+        WORKER_NODES_COUNT: "${{ inputs.workerNodesCount }}"
+      run: ./.github/actions/constellation_create/wait-for-nodes.sh
 
     - name: Download boot logs
       if: always()
diff --git a/.github/actions/constellation_create/wait-for-nodes.sh b/.github/actions/constellation_create/wait-for-nodes.sh
new file mode 100755
index 000000000..9fb9b36e4
--- /dev/null
+++ b/.github/actions/constellation_create/wait-for-nodes.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+# Waits until all expected nodes have joined the cluster and are Ready, and
+# until all API server pods are Ready.
+#
+# Environment:
+#   CONTROL_NODES_COUNT  number of control-plane nodes that are expected
+#   WORKER_NODES_COUNT   number of worker nodes that are expected
+#   JOINTIMEOUT          wait budget in seconds, counted in 30s polling steps
+#
+# Exits 0 once the cluster is ready and 1 if the budget runs out first.
+
+# We don't want to abort the script if there's a transient error in kubectl.
+set +e
+set -uo pipefail
+
+NODES_COUNT=$((CONTROL_NODES_COUNT + WORKER_NODES_COUNT))
+JOINWAIT=0
+
+# Counts the items of a Kubernetes list (JSON on stdin) whose Ready condition
+# is True. The trailing '?' skips items without conditions instead of making
+# jq fail and print nothing for the whole list.
+count_ready() {
+  jq '.items | map(select(.status.conditions[]? | .type == "Ready" and .status == "True")) | length'
+}
+
+# Reports how many nodes are registered and fulfill condition=ready.
+num_nodes_ready() {
+  kubectl get nodes -o json | count_ready
+}
+
+# Reports how many API server pods are ready.
+num_apiservers_ready() {
+  kubectl get pods -n kube-system -l component=kube-apiserver -o json | count_ready
+}
+
+# Prints node joining progress.
+report_join_progress() {
+  echo -n "nodes_joined=$(kubectl get nodes -o json | jq '.items | length')/${NODES_COUNT} "
+  echo -n "nodes_ready=$(num_nodes_ready)/${NODES_COUNT} "
+  echo "api_servers_ready=$(num_apiservers_ready)/${CONTROL_NODES_COUNT} ..."
+}
+
+# Indicates by exit code whether the cluster is ready, i.e. all nodes and API servers are ready.
+cluster_ready() {
+  [[ "$(num_nodes_ready)" == "${NODES_COUNT}" && "$(num_apiservers_ready)" == "${CONTROL_NODES_COUNT}" ]]
+}
+
+echo "::group::Wait for nodes"
+# Remember the outcome of the readiness check itself. Deriving failure from
+# JOINWAIT alone would report a timeout when the cluster turns ready on the
+# very last poll, after the counter has already passed JOINTIMEOUT.
+CLUSTER_READY=false
+while true; do
+  if cluster_ready; then
+    CLUSTER_READY=true
+    break
+  fi
+  if [[ ${JOINWAIT} -gt ${JOINTIMEOUT} ]]; then
+    break
+  fi
+  report_join_progress
+  JOINWAIT=$((JOINWAIT + 30))
+  sleep 30
+done
+report_join_progress
+if [[ ${CLUSTER_READY} != "true" ]]; then
+  set -x
+  kubectl get nodes -o wide
+  kubectl get pods -n kube-system -o wide
+  kubectl get events -n kube-system
+  set +x
+  echo "::error::timeout reached before all nodes became ready"
+  echo "::endgroup::"
+  exit 1
+fi
+echo "::endgroup::"