ci: make waiting for nodes more robust (#2981)

* ci: make waiting for nodes more robust

After initializing the cluster, a lot of things happen in parallel and
are potentially getting in each other's way: nodes are joining,
daemonsets are proliferating, the network is being set up. During this
period, it's not unusual that the Kubernetes API server is unavailable
for a short time, e.g. due to etcd losing quorum or load balancing
changes.

This period of instability has the potential to affect all kubectl
commands negatively, leading to problems especially for tests, where
command failures often lead to test failures. On the other hand, we'd
expect everything to be quite stable after the initial dust settles.

Therefore, this commit changes how we wait after initializing a cluster.
Until we have a reasonable expectation of readiness, we ignore command
failures and wait for things to stabilize. The cluster is considered
stable once all configured nodes and all API servers report ready.
This commit is contained in:
Markus Rudy 2024-03-13 09:42:18 +01:00 committed by GitHub
parent 3b8fa95648
commit 85b44f7f57
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 54 additions and 25 deletions

View File

@ -217,31 +217,9 @@ runs:
env: env:
KUBECONFIG: "${{ steps.get-kubeconfig.outputs.KUBECONFIG }}" KUBECONFIG: "${{ steps.get-kubeconfig.outputs.KUBECONFIG }}"
JOINTIMEOUT: "1200" # 20 minutes timeout for all nodes to join JOINTIMEOUT: "1200" # 20 minutes timeout for all nodes to join
run: | CONTROL_NODES_COUNT: "${{ inputs.controlNodesCount }}"
echo "::group::Wait for nodes" WORKER_NODES_COUNT: "${{ inputs.workerNodesCount }}"
NODES_COUNT=$((${{ inputs.controlNodesCount }} + ${{ inputs.workerNodesCount }})) run: ./.github/actions/constellation_create/wait-for-nodes.sh
JOINWAIT=0
until [[ "$(kubectl get nodes -o json | jq '.items | length')" == "${NODES_COUNT}" ]] || [[ $JOINWAIT -gt $JOINTIMEOUT ]];
do
echo "$(kubectl get nodes -o json | jq '.items | length')/"${NODES_COUNT}" nodes have joined.. waiting.."
JOINWAIT=$((JOINWAIT+30))
sleep 30
done
if [[ $JOINWAIT -gt $JOINTIMEOUT ]]; then
kubectl get nodes -o wide
echo "::error::Timed out waiting for nodes to join"
echo "::endgroup::"
exit 1
fi
echo "$(kubectl get nodes -o json | jq '.items | length')/"${NODES_COUNT}" nodes have joined"
if ! kubectl wait --for=condition=ready --all nodes --timeout=20m; then
kubectl get pods -n kube-system
kubectl get events -n kube-system
echo "::error::kubectl wait timed out before all nodes became ready"
echo "::endgroup::"
exit 1
fi
echo "::endgroup::"
- name: Download boot logs - name: Download boot logs
if: always() if: always()

View File

@ -0,0 +1,51 @@
#!/bin/bash
#
# Waits until every expected node and every API server reports ready.
#
# Required environment variables (all non-negative integers):
#   CONTROL_NODES_COUNT  number of control-plane nodes expected in the cluster
#   WORKER_NODES_COUNT   number of worker nodes expected in the cluster
#   JOINTIMEOUT          upper bound for JOINWAIT, the accumulated sleep time
#                        in seconds

# We don't want to abort the script if there's a transient error in kubectl.
set +e
set -uo pipefail

# Fail fast on missing or malformed configuration. An empty value would
# otherwise evaluate to 0 in arithmetic context, which makes the script wait
# for a node count that can never match (or time out after a single poll)
# without any hint about the real cause.
uint_pattern='^(0|[1-9][0-9]*)$'
for required_var in CONTROL_NODES_COUNT WORKER_NODES_COUNT JOINTIMEOUT; do
  if [[ ! ${!required_var-} =~ ${uint_pattern} ]]; then
    echo "::error::${required_var} must be a non-negative integer, got '${!required_var-}'"
    exit 1
  fi
done
unset required_var uint_pattern

# Total number of nodes that have to become ready.
NODES_COUNT=$((CONTROL_NODES_COUNT + WORKER_NODES_COUNT))
# Seconds spent sleeping between readiness polls so far.
JOINWAIT=0
# Reports how many nodes are registered and fulfill condition=ready.
#
# Outputs: the count on stdout. Prints nothing when kubectl fails or jq cannot
#          process its output; callers compare the result as a string, so an
#          empty result never equals an expected count.
# Returns: the pipeline status (pipefail is enabled at the top of the script).
num_nodes_ready() {
  # `[]?` tolerates nodes that do not carry any status conditions yet; without
  # it a single such node makes jq abort and the count for all nodes is lost.
  kubectl get nodes -o json |
    jq '.items | map(select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))) | length'
}
# Reports how many API server pods are ready.
#
# Looks at pods in the kube-system namespace labelled component=kube-apiserver.
#
# Outputs: the count on stdout. Prints nothing when kubectl fails or jq cannot
#          process its output; callers compare the result as a string, so an
#          empty result never equals an expected count.
# Returns: the pipeline status (pipefail is enabled at the top of the script).
num_apiservers_ready() {
  # `[]?` tolerates pods that do not carry any status conditions yet; without
  # it a single such pod makes jq abort and the count for all pods is lost.
  kubectl get pods -n kube-system -l component=kube-apiserver -o json |
    jq '.items | map(select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))) | length'
}
# Writes one progress line to stdout: registered nodes, ready nodes and ready
# API servers, each shown against the expected count.
report_join_progress() {
  # The three fields are produced one after the other so that the underlying
  # kubectl queries run in the same order as the fields appear.
  printf 'nodes_joined=%s/%s ' "$(kubectl get nodes -o json | jq '.items | length')" "${NODES_COUNT}"
  printf 'nodes_ready=%s/%s ' "$(num_nodes_ready)" "${NODES_COUNT}"
  printf 'api_servers_ready=%s/%s ...\n' "$(num_apiservers_ready)" "${CONTROL_NODES_COUNT}"
}
# Succeeds (exit code 0) only when the number of ready nodes equals NODES_COUNT
# and the number of ready API servers equals CONTROL_NODES_COUNT.
cluster_ready() {
  local ready_nodes
  ready_nodes="$(num_nodes_ready)"
  # No point in querying the API servers while nodes are still missing.
  if [[ ${ready_nodes} != "${NODES_COUNT}" ]]; then
    return 1
  fi

  local ready_apiservers
  ready_apiservers="$(num_apiservers_ready)"
  [[ ${ready_apiservers} == "${CONTROL_NODES_COUNT}" ]]
}
echo "::group::Wait for nodes"

# Poll until the cluster is ready or the sleep budget is used up.
#
# The outcome is recorded explicitly rather than re-derived from JOINWAIT after
# the loop: a cluster that becomes ready during the last sleep interval leaves
# the loop with JOINWAIT already above JOINTIMEOUT and must not be reported as
# a timeout.
cluster_became_ready=false
while true; do
  if cluster_ready; then
    cluster_became_ready=true
    break
  fi
  # JOINWAIT only accounts for the time spent sleeping below.
  if [[ ${JOINWAIT} -gt ${JOINTIMEOUT} ]]; then
    break
  fi
  report_join_progress
  JOINWAIT=$((JOINWAIT + 30))
  sleep 30
done
report_join_progress

if [[ ${cluster_became_ready} != true ]]; then
  # Trace the diagnostic commands so each dump in the log is preceded by the
  # command that produced it.
  set -x
  kubectl get nodes -o wide
  kubectl get pods -n kube-system -o wide
  kubectl get events -n kube-system
  set +x
  echo "::error::timeout reached before all nodes became ready"
  echo "::endgroup::"
  exit 1
fi
echo "::endgroup::"