AB#2260 Refactor disk-mapper recovery (#82)

* Refactor disk-mapper recovery
* Adapt constellation recover command to use new disk-mapper recovery API
* Fix Cilium connectivity on rebooting nodes (#89)
* Lower CoreDNS reschedule timeout to 10 seconds (#93)

Signed-off-by: Daniel Weiße <dw@edgeless.systems>
2025-12-15 16:09:39 -05:00 · 2022-09-08 14:45:27 +02:00 · 2022-09-08 14:45:27 +02:00 · 8cb155d5c5
commit 8cb155d5c5
parent a7b20b2a11
40 changed files with 1600 additions and 1130 deletions
--- a/bootstrapper/internal/kubernetes/k8sapi/systemd.go
+++ b/bootstrapper/internal/kubernetes/k8sapi/systemd.go
@@ -13,30 +13,6 @@ import (
 	"github.com/coreos/go-systemd/v22/dbus"
 )

-func restartSystemdUnit(ctx context.Context, unit string) error {
-	conn, err := dbus.NewSystemdConnectionContext(ctx)
-	if err != nil {
-		return fmt.Errorf("establishing systemd connection: %w", err)
-	}
-
-	restartChan := make(chan string)
-	if _, err := conn.RestartUnitContext(ctx, unit, "replace", restartChan); err != nil {
-		return fmt.Errorf("restarting systemd unit %q: %w", unit, err)
-	}
-
-	// Wait for the restart to finish and actually check if it was
-	// successful or not.
-	result := <-restartChan
-
-	switch result {
-	case "done":
-		return nil
-
-	default:
-		return fmt.Errorf("restarting systemd unit %q failed: expected %v but received %v", unit, "done", result)
-	}
-}
-
 func startSystemdUnit(ctx context.Context, unit string) error {
 	conn, err := dbus.NewSystemdConnectionContext(ctx)
 	if err != nil {
--- a/bootstrapper/internal/kubernetes/k8sapi/util.go
+++ b/bootstrapper/internal/kubernetes/k8sapi/util.go
@@ -264,12 +264,19 @@ func (k *KubernetesUtil) deployCiliumGCP(ctx context.Context, helmClient *action
 		return err
 	}

+	timeoutS := int64(10)
 	// allow coredns to run on uninitialized nodes (required by cloud-controller-manager)
 	tolerations := []corev1.Toleration{
 		{
 			Key:    "node.cloudprovider.kubernetes.io/uninitialized",
 			Value:  "true",
-			Effect: "NoSchedule",
+			Effect: corev1.TaintEffectNoSchedule,
+		},
+		{
+			Key:               "node.kubernetes.io/unreachable",
+			Operator:          corev1.TolerationOpExists,
+			Effect:            corev1.TaintEffectNoExecute,
+			TolerationSeconds: &timeoutS,
 		},
 	}
 	if err = kubectl.AddTolerationsToDeployment(ctx, tolerations, "coredns", "kube-system"); err != nil {
@@ -305,7 +312,7 @@ func (k *KubernetesUtil) deployCiliumGCP(ctx context.Context, helmClient *action

 // FixCilium fixes https://github.com/cilium/cilium/issues/19958 but instead of a rollout restart of
 // the cilium daemonset, it only restarts the local cilium pod.
-func (k *KubernetesUtil) FixCilium(nodeNameK8s string, log *logger.Logger) {
+func (k *KubernetesUtil) FixCilium(log *logger.Logger) {
 	// wait for cilium pod to be healthy
 	client := http.Client{}
 	for {
@@ -487,13 +494,6 @@ func (k *KubernetesUtil) StartKubelet() error {
 	return startSystemdUnit(ctx, "kubelet.service")
 }

-// RestartKubelet restarts a kubelet.
-func (k *KubernetesUtil) RestartKubelet() error {
-	ctx, cancel := context.WithTimeout(context.TODO(), kubeletStartTimeout)
-	defer cancel()
-	return restartSystemdUnit(ctx, "kubelet.service")
-}
-
 // createSignedKubeletCert manually creates a Kubernetes CA signed kubelet certificate for the bootstrapper node.
 // This is necessary because this node does not request a certificate from the join service.
 func (k *KubernetesUtil) createSignedKubeletCert(nodeName string, ips []net.IP) error {