mirror of
https://github.com/edgelesssys/constellation.git
synced 2025-02-02 18:44:49 -05:00
ci: make waiting for nodes more robust (#2981)
* ci: make waiting for nodes more robust After initializing the cluster, a lot of things happen in parallel and are potentially getting in each other's way: nodes are joining, daemonsets are proliferating, the network is being set up. During this period, it's not unusual that the Kubernetes API server is unavailable for a short time, e.g. due to etcd losing quorum or load balancing changes. This period of instability has the potential to affect all kubectl commands negatively, leading to problems especially for tests, where command failures often lead to test failures. On the other hand, we'd expect everything to be quite stable after the initial dust settles. Therefore, this commit changes how we wait after initializing a cluster. Until we have a reasonable expectation of readiness, we ignore command failures and wait for things to stabilize. The cluster is considered stable once all configured nodes and all API servers report ready.
This commit is contained in:
parent
3b8fa95648
commit
85b44f7f57
28
.github/actions/constellation_create/action.yml
vendored
28
.github/actions/constellation_create/action.yml
vendored
@ -217,31 +217,9 @@ runs:
|
||||
env:
|
||||
KUBECONFIG: "${{ steps.get-kubeconfig.outputs.KUBECONFIG }}"
|
||||
JOINTIMEOUT: "1200" # 20 minutes timeout for all nodes to join
|
||||
run: |
|
||||
echo "::group::Wait for nodes"
|
||||
NODES_COUNT=$((${{ inputs.controlNodesCount }} + ${{ inputs.workerNodesCount }}))
|
||||
JOINWAIT=0
|
||||
until [[ "$(kubectl get nodes -o json | jq '.items | length')" == "${NODES_COUNT}" ]] || [[ $JOINWAIT -gt $JOINTIMEOUT ]];
|
||||
do
|
||||
echo "$(kubectl get nodes -o json | jq '.items | length')/"${NODES_COUNT}" nodes have joined.. waiting.."
|
||||
JOINWAIT=$((JOINWAIT+30))
|
||||
sleep 30
|
||||
done
|
||||
if [[ $JOINWAIT -gt $JOINTIMEOUT ]]; then
|
||||
kubectl get nodes -o wide
|
||||
echo "::error::Timed out waiting for nodes to join"
|
||||
echo "::endgroup::"
|
||||
exit 1
|
||||
fi
|
||||
echo "$(kubectl get nodes -o json | jq '.items | length')/"${NODES_COUNT}" nodes have joined"
|
||||
if ! kubectl wait --for=condition=ready --all nodes --timeout=20m; then
|
||||
kubectl get pods -n kube-system
|
||||
kubectl get events -n kube-system
|
||||
echo "::error::kubectl wait timed out before all nodes became ready"
|
||||
echo "::endgroup::"
|
||||
exit 1
|
||||
fi
|
||||
echo "::endgroup::"
|
||||
CONTROL_NODES_COUNT: "${{ inputs.controlNodesCount }}"
|
||||
WORKER_NODES_COUNT: "${{ inputs.workerNodesCount }}"
|
||||
run: ./.github/actions/constellation_create/wait-for-nodes.sh
|
||||
|
||||
- name: Download boot logs
|
||||
if: always()
|
||||
|
51
.github/actions/constellation_create/wait-for-nodes.sh
vendored
Executable file
51
.github/actions/constellation_create/wait-for-nodes.sh
vendored
Executable file
@ -0,0 +1,51 @@
|
||||
#!/bin/bash

# Shell options:
#   errexit stays OFF on purpose - kubectl is expected to fail transiently
#   while the cluster settles, and such failures must not abort the script.
#   nounset and pipefail stay ON so that missing inputs and failing pipeline
#   stages are still detected.
set +o errexit
set -o nounset
set -o pipefail

# Seconds spent sleeping so far; compared against JOINTIMEOUT by the wait loop.
JOINWAIT=0

# Total number of nodes expected in the cluster. CONTROL_NODES_COUNT and
# WORKER_NODES_COUNT are read from the environment.
NODES_COUNT=$((CONTROL_NODES_COUNT + WORKER_NODES_COUNT))
|
||||
|
||||
#######################################
# Reports how many nodes are registered and fulfill condition=ready.
# Outputs:
#   The number of ready nodes on stdout. Nothing is printed if kubectl
#   produces no output (e.g. the API server is unreachable).
# Returns:
#   The exit status of the kubectl | jq pipeline.
#######################################
num_nodes_ready() {
  # any(...) counts each node at most once, and the `[]?` forms tolerate
  # nodes that have not reported any conditions yet. Without them, a single
  # freshly registered node would make jq fail for the whole list.
  kubectl get nodes -o json |
    jq '[.items[]? | select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))] | length'
}
|
||||
|
||||
#######################################
# Reports how many API server pods are ready.
# Outputs:
#   The number of ready kube-apiserver pods (label component=kube-apiserver
#   in namespace kube-system) on stdout. Nothing is printed if kubectl
#   produces no output (e.g. the API server is unreachable).
# Returns:
#   The exit status of the kubectl | jq pipeline.
#######################################
num_apiservers_ready() {
  # any(...) counts each pod at most once, and the `[]?` forms tolerate pods
  # whose status carries no conditions yet. Without them, one such pod would
  # make jq fail for the whole list.
  kubectl get pods -n kube-system -l component=kube-apiserver -o json |
    jq '[.items[]? | select(any(.status.conditions[]?; .type == "Ready" and .status == "True"))] | length'
}
|
||||
|
||||
#######################################
# Prints node joining progress as a single line.
# Globals:
#   NODES_COUNT (read), CONTROL_NODES_COUNT (read)
# Outputs:
#   One line on stdout with the joined/ready node counts and the ready API
#   server count, each relative to its expected total.
#######################################
report_join_progress() {
  local joined ready apiservers
  joined="$(kubectl get nodes -o json | jq '.items | length')"
  ready="$(num_nodes_ready)"
  apiservers="$(num_apiservers_ready)"
  printf '%s' "nodes_joined=${joined}/${NODES_COUNT} "
  printf '%s' "nodes_ready=${ready}/${NODES_COUNT} "
  printf '%s\n' "api_servers_ready=${apiservers}/${CONTROL_NODES_COUNT} ..."
}
|
||||
|
||||
#######################################
# Indicates by exit code whether the cluster is ready, i.e. all nodes and
# all API servers are ready.
# Globals:
#   NODES_COUNT (read), CONTROL_NODES_COUNT (read)
# Returns:
#   0 if both counts match their expected totals, 1 otherwise.
#######################################
cluster_ready() {
  local ready_nodes ready_apiservers

  ready_nodes="$(num_nodes_ready)"
  # The API servers are only queried once every node reports ready.
  [[ "${ready_nodes}" == "${NODES_COUNT}" ]] || return 1

  ready_apiservers="$(num_apiservers_ready)"
  [[ "${ready_apiservers}" == "${CONTROL_NODES_COUNT}" ]]
}
|
||||
|
||||
# Main wait loop: poll every 30 seconds until the cluster is ready or the
# accumulated sleep time (JOINWAIT) exceeds JOINTIMEOUT (seconds, read from
# the environment).
echo "::group::Wait for nodes"

# Record explicitly why the loop ended. Deciding success afterwards from
# JOINWAIT alone would report a timeout even when the cluster became ready
# on the very last poll.
cluster_is_ready=false
while true; do
  if cluster_ready; then
    cluster_is_ready=true
    break
  fi
  if [[ ${JOINWAIT} -gt ${JOINTIMEOUT} ]]; then
    break
  fi
  report_join_progress
  JOINWAIT=$((JOINWAIT + 30))
  sleep 30
done

# Print the final state once more, regardless of the outcome.
report_join_progress

if [[ ${cluster_is_ready} != true ]]; then
  # Dump cluster state for debugging; tracing shows which command produced
  # which output.
  set -x
  kubectl get nodes -o wide
  kubectl get pods -n kube-system -o wide
  kubectl get events -n kube-system
  set +x
  echo "::error::timeout reached before all nodes became ready"
  echo "::endgroup::"
  exit 1
fi
echo "::endgroup::"
|
Loading…
x
Reference in New Issue
Block a user