constellation/cli/internal/helm/charts/cilium/files/nodeinit/startup.bash
2022-08-12 10:20:19 +02:00

169 lines
7.7 KiB
Bash

#!/bin/bash
set -o errexit
set -o pipefail
set -o nounset
echo "Link information:"
ip link
echo "Routing table:"
ip route
echo "Addressing:"
ip -4 a
ip -6 a
{{- if .Values.nodeinit.removeCbrBridge }}
if ip link show cbr0; then
echo "Detected cbr0 bridge. Deleting interface..."
ip link del cbr0
fi
{{- end }}
{{- if .Values.nodeinit.reconfigureKubelet }}
# Check if we're running on a GKE containerd flavor as indicated by the presence
# of the '--container-runtime-endpoint' flag in '/etc/default/kubelet'.
GKE_KUBERNETES_BIN_DIR="/home/kubernetes/bin"
KUBELET_DEFAULTS_FILE="/etc/default/kubelet"
if [[ -f "${GKE_KUBERNETES_BIN_DIR}/gke" ]] && [[ $(grep -cF -- '--container-runtime-endpoint' "${KUBELET_DEFAULTS_FILE}") == "1" ]]; then
echo "GKE *_containerd flavor detected..."
# (GKE *_containerd) Upon node restarts, GKE's containerd images seem to reset
# the /etc directory and our changes to the kubelet and Cilium's CNI
# configuration are removed. This leaves room for containerd and its CNI to
# take over pods previously managed by Cilium, causing Cilium to lose
# ownership over these pods. We rely on the empirical observation that
# /home/kubernetes/bin/kubelet is not changed across node reboots, and replace
# it with a wrapper script that performs some initialization steps when
# required and then hands over control to the real kubelet.
# Only create the kubelet wrapper if we haven't previously done so.
if [[ ! -f "${GKE_KUBERNETES_BIN_DIR}/the-kubelet" ]];
then
echo "Installing the kubelet wrapper..."
# Rename the real kubelet.
mv "${GKE_KUBERNETES_BIN_DIR}/kubelet" "${GKE_KUBERNETES_BIN_DIR}/the-kubelet"
# Initialize the kubelet wrapper which lives in the place of the real kubelet.
touch "${GKE_KUBERNETES_BIN_DIR}/kubelet"
chmod a+x "${GKE_KUBERNETES_BIN_DIR}/kubelet"
# Populate the kubelet wrapper. It will perform the initialization steps we
# need and then become the kubelet.
cat <<'EOF' | tee "${GKE_KUBERNETES_BIN_DIR}/kubelet"
#!/bin/bash
set -euo pipefail
CNI_CONF_DIR="/etc/cni/net.d"
CONTAINERD_CONFIG="/etc/containerd/config.toml"
# Only stop and start containerd if the Cilium CNI configuration does not exist,
# or if the 'conf_template' property is present in the containerd config file,
# in order to avoid unnecessarily restarting containerd.
if [[ -z "$(find "${CNI_CONF_DIR}" -type f -name '*cilium*')" || \
"$(grep -cE '^\s+conf_template' "${CONTAINERD_CONFIG}")" != "0" ]];
then
# Stop containerd as it starts by creating a CNI configuration from a template
# causing pods to start with IPs assigned by GKE's CNI.
# 'disable --now' is used instead of stop as this script runs concurrently
# with containerd on node startup, and hence containerd might not have been
# started yet, in which case 'disable' prevents it from starting.
echo "Disabling and stopping containerd"
systemctl disable --now containerd
# Remove any pre-existing files in the CNI configuration directory. We skip
# any possibly existing Cilium configuration file for the obvious reasons.
echo "Removing undesired CNI configuration files"
find "${CNI_CONF_DIR}" -type f -not -name '*cilium*' -exec rm {} \;
# As mentioned above, the containerd configuration needs a little tweak in
# order not to create the default CNI configuration, so we update its config.
echo "Fixing containerd configuration"
sed -Ei 's/^(\s+conf_template)/\#\1/g' "${CONTAINERD_CONFIG}"
# Start containerd. It won't create it's CNI configuration file anymore.
echo "Enabling and starting containerd"
systemctl enable --now containerd
fi
# Become the real kubelet, and pass it some additionally required flags (and
# place these last so they have precedence).
exec /home/kubernetes/bin/the-kubelet "${@}" --network-plugin=cni --cni-bin-dir={{ .Values.cni.binPath }}
EOF
else
echo "Kubelet wrapper already exists, skipping..."
fi
else
# (Generic) Alter the kubelet configuration to run in CNI mode
echo "Changing kubelet configuration to --network-plugin=cni --cni-bin-dir={{ .Values.cni.binPath }}"
mkdir -p {{ .Values.cni.binPath }}
sed -i "s:--network-plugin=kubenet:--network-plugin=cni\ --cni-bin-dir={{ .Values.cni.binPath }}:g" "${KUBELET_DEFAULTS_FILE}"
fi
echo "Restarting the kubelet..."
systemctl restart kubelet
{{- end }}
{{- if (and .Values.gke.enabled (or .Values.enableIPv4Masquerade .Values.gke.disableDefaultSnat))}}
# If Cilium is configured to manage masquerading of traffic leaving the node,
# we need to disable the IP-MASQ chain because even if ip-masq-agent
# is not installed, the node init script installs some default rules into
# the IP-MASQ chain.
# If we remove the jump to that ip-masq chain, then we ensure the ip masquerade
# configuration is solely managed by Cilium.
# Also, if Cilium is installed, it may be expected that it would be solely responsible
# for the networking configuration on that node. So provide the same functionality
# as the --disable-snat-flag for existing GKE clusters.
iptables -w -t nat -D POSTROUTING -m comment --comment "ip-masq: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom IP-MASQ chain" -m addrtype ! --dst-type LOCAL -j IP-MASQ || true
{{- end }}
{{- if not (eq .Values.nodeinit.bootstrapFile "") }}
mkdir -p {{ .Values.nodeinit.bootstrapFile | dir | quote }}
date > {{ .Values.nodeinit.bootstrapFile | quote }}
{{- end }}
{{- if .Values.azure.enabled }}
# AKS: If azure-vnet is installed on the node, and (still) configured in bridge mode,
# configure it as 'transparent' to be consistent with Cilium's CNI chaining config.
# If the azure-vnet CNI config is not removed, kubelet will execute CNI CHECK commands
# against it every 5 seconds and write 'bridge' to its state file, causing inconsistent
# behaviour when Pods are removed.
if [ -f /etc/cni/net.d/10-azure.conflist ]; then
echo "Ensuring azure-vnet is configured in 'transparent' mode..."
sed -i 's/"mode":\s*"bridge"/"mode":"transparent"/g' /etc/cni/net.d/10-azure.conflist
fi
# The azure0 interface being present means the node was booted with azure-vnet configured
# in bridge mode. This means there might be ebtables rules and neight entries interfering
# with pod connectivity if we deploy with Azure IPAM.
if ip l show dev azure0 >/dev/null 2>&1; then
# In Azure IPAM mode, also remove the azure-vnet state file, otherwise ebtables rules get
# restored by the azure-vnet CNI plugin on every CNI CHECK, which can cause connectivity
# issues in Cilium-managed Pods. Since azure-vnet is no longer called on scheduling events,
# this file can be removed.
rm -f /var/run/azure-vnet.json
# This breaks connectivity for existing workload Pods when Cilium is scheduled, but we need
# to flush these to prevent Cilium-managed Pod IPs conflicting with Pod IPs previously allocated
# by azure-vnet. These ebtables DNAT rules contain fixed MACs that are no longer bound on the node,
# causing packets for these Pods to be redirected back out to the gateway, where they are dropped.
echo 'Flushing ebtables pre/postrouting rules in nat table.. (disconnecting non-Cilium Pods!)'
ebtables -t nat -F PREROUTING || true
ebtables -t nat -F POSTROUTING || true
# ip-masq-agent periodically injects PERM neigh entries towards the gateway
# for all other k8s nodes in the cluster. These are safe to flush, as ARP can
# resolve these nodes as usual. PERM entries will be automatically restored later.
echo 'Deleting all permanent neighbour entries on azure0...'
ip neigh show dev azure0 nud permanent | cut -d' ' -f1 | xargs -r -n1 ip neigh del dev azure0 to || true
fi
{{- end }}
{{- if .Values.nodeinit.revertReconfigureKubelet }}
rm -f /tmp/node-deinit.cilium.io
{{- end }}
echo "Node initialization complete"