From cc615f40728b17f6f1b01a11bf9a6768b382f13a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Wei=C3=9Fe?=
Date: Thu, 7 Mar 2024 11:33:07 +0100
Subject: [PATCH] Fix joining nodes retrying kubeadm 3 times in all cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniel Weiße
---
 bootstrapper/cmd/bootstrapper/run.go           | 7 +++++--
 bootstrapper/internal/initserver/initserver.go | 4 +++-
 bootstrapper/internal/joinclient/joinclient.go | 8 +++++---
 3 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/bootstrapper/cmd/bootstrapper/run.go b/bootstrapper/cmd/bootstrapper/run.go
index cd7a3c10b..9778457af 100644
--- a/bootstrapper/cmd/bootstrapper/run.go
+++ b/bootstrapper/cmd/bootstrapper/run.go
@@ -13,6 +13,7 @@ import (
 	"net"
 	"sync"
 	"syscall"
+	"time"
 
 	"github.com/edgelesssys/constellation/v2/bootstrapper/internal/clean"
 	"github.com/edgelesssys/constellation/v2/bootstrapper/internal/diskencryption"
@@ -74,7 +75,8 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
 	go func() {
 		defer wg.Done()
 		if err := joinClient.Start(cleaner); err != nil {
-			log.With(slog.Any("error", err)).Error("Failed to join cluster")
+			log.With(slog.Any("error", err)).Error("Failed to join cluster. Rebooting...")
+			time.Sleep(20 * time.Second) // ensure log message is written
 			reboot()
 		}
 	}()
@@ -83,7 +85,8 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
 	go func() {
 		defer wg.Done()
 		if err := initServer.Serve(bindIP, bindPort, cleaner); err != nil {
-			log.With(slog.Any("error", err)).Error("Failed to serve init server")
+			log.With(slog.Any("error", err)).Error("Failed to serve init server. Rebooting...")
+			time.Sleep(20 * time.Second) // ensure log message is written
 			reboot()
 		}
 	}()
diff --git a/bootstrapper/internal/initserver/initserver.go b/bootstrapper/internal/initserver/initserver.go
index 6634f376f..948e1220d 100644
--- a/bootstrapper/internal/initserver/initserver.go
+++ b/bootstrapper/internal/initserver/initserver.go
@@ -129,7 +129,9 @@ func (s *Server) Serve(ip, port string, cleaner cleaner) error {
 	// If Init failed, we mark the disk for reset, so the node can restart the process
 	// In this case we don't care about any potential errors from the grpc server
 	if s.initFailure != nil {
-		return errors.Join(s.initFailure, s.markDiskForReset())
+		s.log.Error("Fatal error during Init request", "error", s.initFailure)
+		resetErr := s.markDiskForReset()
+		return errors.Join(s.initFailure, resetErr)
 	}
 
 	return err
diff --git a/bootstrapper/internal/joinclient/joinclient.go b/bootstrapper/internal/joinclient/joinclient.go
index c0e9e7b53..068c58d0f 100644
--- a/bootstrapper/internal/joinclient/joinclient.go
+++ b/bootstrapper/internal/joinclient/joinclient.go
@@ -150,7 +150,8 @@ func (c *JoinClient) Start(cleaner cleaner) error {
 		if err := c.startNodeAndJoin(ticket, kubeletKey, cleaner); err != nil {
 			c.log.With(slog.Any("error", err)).Error("Failed to start node and join cluster")
 			// unrecoverable error
-			return errors.Join(err, c.markDiskForReset())
+			resetErr := c.markDiskForReset()
+			return errors.Join(err, resetErr)
 		}
 
 		return nil
@@ -293,9 +294,10 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
 	// sometimes fails transiently, and we don't want to brick the node because of that.
 	for i := range 3 {
 		err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
-		if err != nil {
-			c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
+		if err == nil {
+			break
 		}
+		c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
 	}
 	if err != nil {
 		return fmt.Errorf("joining Kubernetes cluster: %w", err)
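
Note (outside the patch): the joinclient hunk above changes the retry loop so it stops on the first successful kubeadm join instead of always running three times. Below is a minimal, self-contained Go sketch of that pattern; the join helper and its failure schedule are hypothetical stand-ins for c.joiner.JoinCluster, used only to make the loop's behavior observable.

package main

import (
	"errors"
	"fmt"
	"log/slog"
)

var attempts int

// join is a hypothetical stand-in for kubeadm join: it fails
// transiently on the first two calls and succeeds on the third.
func join() error {
	attempts++
	if attempts < 3 {
		return errors.New("transient kubeadm failure")
	}
	return nil
}

func main() {
	var err error
	// Go 1.22 range-over-int: i takes the values 0, 1, 2.
	for i := range 3 {
		err = join()
		if err == nil {
			break // success: stop retrying (the break the patch adds)
		}
		slog.Error("failed to join k8s cluster", "attempt", i, "error", err)
	}
	if err != nil {
		fmt.Println("giving up:", err)
		return
	}
	fmt.Println("joined after", attempts, "attempt(s)")
}

With a successful third call, the loop exits via break after logging two failures. Before the patch, the loop body only logged on error and never broke, so all three attempts ran even when an earlier one had succeeded.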