constellation/cli/internal/helm/charts/cilium/files/nodeinit/startup.bash

#!/bin/bash

set -o errexit
set -o pipefail
set -o nounset

echo "Link information:"
ip link

echo "Routing table:"
ip route

echo "Addressing:"
ip -4 a
ip -6 a

{{- if .Values.nodeinit.removeCbrBridge }}
if ip link show cbr0; then
  echo "Detected cbr0 bridge. Deleting interface..."
  ip link del cbr0
fi
{{- end }}

{{- if .Values.nodeinit.reconfigureKubelet }}
# Check if we're running on a GKE containerd flavor as indicated by the presence
# of the '--container-runtime-endpoint' flag in '/etc/default/kubelet'.
GKE_KUBERNETES_BIN_DIR="/home/kubernetes/bin"
KUBELET_DEFAULTS_FILE="/etc/default/kubelet"
if [[ -f "${GKE_KUBERNETES_BIN_DIR}/gke" ]] && [[ $(grep -cF -- '--container-runtime-endpoint' "${KUBELET_DEFAULTS_FILE}") == "1" ]]; then
  echo "GKE *_containerd flavor detected..."

  # (GKE *_containerd) Upon node restarts, GKE's containerd images seem to reset
  # the /etc directory and our changes to the kubelet and Cilium's CNI
  # configuration are removed. This leaves room for containerd and its CNI to
  # take over pods previously managed by Cilium, causing Cilium to lose
  # ownership over these pods. We rely on the empirical observation that
  # /home/kubernetes/bin/kubelet is not changed across node reboots, and replace
  # it with a wrapper script that performs some initialization steps when
  # required and then hands over control to the real kubelet.

  # Only create the kubelet wrapper if we haven't previously done so.
  if [[ ! -f "${GKE_KUBERNETES_BIN_DIR}/the-kubelet" ]];
  then
    echo "Installing the kubelet wrapper..."

    # Rename the real kubelet.
    mv "${GKE_KUBERNETES_BIN_DIR}/kubelet" "${GKE_KUBERNETES_BIN_DIR}/the-kubelet"

    # Initialize the kubelet wrapper which lives in the place of the real kubelet.
    touch "${GKE_KUBERNETES_BIN_DIR}/kubelet"
    chmod a+x "${GKE_KUBERNETES_BIN_DIR}/kubelet"

    # Populate the kubelet wrapper. It will perform the initialization steps we
    # need and then become the kubelet.
    cat <<'EOF' | tee "${GKE_KUBERNETES_BIN_DIR}/kubelet"
#!/bin/bash

set -euo pipefail

CNI_CONF_DIR="/etc/cni/net.d"
CONTAINERD_CONFIG="/etc/containerd/config.toml"

# Only stop and start containerd if the Cilium CNI configuration does not exist,
# or if the 'conf_template' property is present in the containerd config file,
# in order to avoid unnecessarily restarting containerd.
if [[ -z "$(find "${CNI_CONF_DIR}" -type f -name '*cilium*')" || \
      "$(grep -cE '^\s+conf_template' "${CONTAINERD_CONFIG}")" != "0" ]];
then
  # Stop containerd as it starts by creating a CNI configuration from a template
  # causing pods to start with IPs assigned by GKE's CNI.
  # 'disable --now' is used instead of stop as this script runs concurrently
  # with containerd on node startup, and hence containerd might not have been
  # started yet, in which case 'disable' prevents it from starting.
  echo "Disabling and stopping containerd"
  systemctl disable --now containerd

  # Remove any pre-existing files in the CNI configuration directory. We skip
  # any possibly existing Cilium configuration file for the obvious reasons.
  echo "Removing undesired CNI configuration files"
  find "${CNI_CONF_DIR}" -type f -not -name '*cilium*' -exec rm {} \;

  # As mentioned above, the containerd configuration needs a little tweak in
  # order not to create the default CNI configuration, so we update its config.
  echo "Fixing containerd configuration"
  sed -Ei 's/^(\s+conf_template)/\#\1/g' "${CONTAINERD_CONFIG}"

  # Start containerd. It won't create it's CNI configuration file anymore.
  echo "Enabling and starting containerd"
  systemctl enable --now containerd
fi

# Become the real kubelet, and pass it some additionally required flags (and
# place these last so they have precedence).
exec /home/kubernetes/bin/the-kubelet "${@}" --network-plugin=cni --cni-bin-dir={{ .Values.cni.binPath }}
EOF
  else
    echo "Kubelet wrapper already exists, skipping..."
  fi
else
  # (Generic) Alter the kubelet configuration to run in CNI mode
  echo "Changing kubelet configuration to --network-plugin=cni --cni-bin-dir={{ .Values.cni.binPath }}"
  mkdir -p {{ .Values.cni.binPath }}
  sed -i "s:--network-plugin=kubenet:--network-plugin=cni\ --cni-bin-dir={{ .Values.cni.binPath }}:g" "${KUBELET_DEFAULTS_FILE}"
fi
echo "Restarting the kubelet..."
systemctl restart kubelet
{{- end }}

{{- if (and .Values.gke.enabled (or .Values.enableIPv4Masquerade .Values.gke.disableDefaultSnat))}}
# If Cilium is configured to manage masquerading of traffic leaving the node,
# we need to disable the IP-MASQ chain because even if ip-masq-agent
# is not installed, the node init script installs some default rules into
# the IP-MASQ chain.
# If we remove the jump to that ip-masq chain, then we ensure the ip masquerade
# configuration is solely managed by Cilium.
# Also, if Cilium is installed, it may be expected that it would be solely responsible
# for the networking configuration on that node. So provide the same functionality
# as the --disable-snat-flag for existing GKE clusters.
iptables -w -t nat -D POSTROUTING -m comment --comment "ip-masq: ensure nat POSTROUTING directs all non-LOCAL destination traffic to our custom IP-MASQ chain" -m addrtype ! --dst-type LOCAL -j IP-MASQ || true
{{- end }}

{{- if not (eq .Values.nodeinit.bootstrapFile "") }}
mkdir -p {{ .Values.nodeinit.bootstrapFile | dir | quote }}
date > {{ .Values.nodeinit.bootstrapFile | quote }}
{{- end }}

{{- if .Values.azure.enabled }}
# AKS: If azure-vnet is installed on the node, and (still) configured in bridge mode,
# configure it as 'transparent' to be consistent with Cilium's CNI chaining config.
# If the azure-vnet CNI config is not removed, kubelet will execute CNI CHECK commands
# against it every 5 seconds and write 'bridge' to its state file, causing inconsistent
# behaviour when Pods are removed.
if [ -f /etc/cni/net.d/10-azure.conflist ]; then
  echo "Ensuring azure-vnet is configured in 'transparent' mode..."
  sed -i 's/"mode":\s*"bridge"/"mode":"transparent"/g' /etc/cni/net.d/10-azure.conflist
fi

# The azure0 interface being present means the node was booted with azure-vnet configured
# in bridge mode. This means there might be ebtables rules and neight entries interfering
# with pod connectivity if we deploy with Azure IPAM.
if ip l show dev azure0 >/dev/null 2>&1; then

  # In Azure IPAM mode, also remove the azure-vnet state file, otherwise ebtables rules get
  # restored by the azure-vnet CNI plugin on every CNI CHECK, which can cause connectivity
  # issues in Cilium-managed Pods. Since azure-vnet is no longer called on scheduling events,
  # this file can be removed.
  rm -f /var/run/azure-vnet.json

  # This breaks connectivity for existing workload Pods when Cilium is scheduled, but we need
  # to flush these to prevent Cilium-managed Pod IPs conflicting with Pod IPs previously allocated
  # by azure-vnet. These ebtables DNAT rules contain fixed MACs that are no longer bound on the node,
  # causing packets for these Pods to be redirected back out to the gateway, where they are dropped.
  echo 'Flushing ebtables pre/postrouting rules in nat table.. (disconnecting non-Cilium Pods!)'
  ebtables -t nat -F PREROUTING || true
  ebtables -t nat -F POSTROUTING || true

  # ip-masq-agent periodically injects PERM neigh entries towards the gateway
  # for all other k8s nodes in the cluster. These are safe to flush, as ARP can
  # resolve these nodes as usual. PERM entries will be automatically restored later.
  echo 'Deleting all permanent neighbour entries on azure0...'
  ip neigh show dev azure0 nud permanent | cut -d' ' -f1 | xargs -r -n1 ip neigh del dev azure0 to || true
fi
{{- end }}

{{- if .Values.nodeinit.revertReconfigureKubelet }}
rm -f /tmp/node-deinit.cilium.io
{{- end }}
echo "Node initialization complete"