Fix joining nodes retrying kubeadm 3 times in all cases

Signed-off-by: Daniel Weiße <dw@edgeless.systems>
This commit is contained in:
Daniel Weiße 2024-03-07 11:33:07 +01:00
parent 21ba262a49
commit cc615f4072
No known key found for this signature in database
GPG key ID: 7DD3015F3DDE4B9C
3 changed files with 13 additions and 6 deletions

View file

@ -13,6 +13,7 @@ import (
"net"
"sync"
"syscall"
"time"
"github.com/edgelesssys/constellation/v2/bootstrapper/internal/clean"
"github.com/edgelesssys/constellation/v2/bootstrapper/internal/diskencryption"
@ -74,7 +75,8 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
go func() {
defer wg.Done()
if err := joinClient.Start(cleaner); err != nil {
log.With(slog.Any("error", err)).Error("Failed to join cluster")
log.With(slog.Any("error", err)).Error("Failed to join cluster. Rebooting...")
time.Sleep(20 * time.Second) // ensure log message is written
reboot()
}
}()
@ -83,7 +85,8 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
go func() {
defer wg.Done()
if err := initServer.Serve(bindIP, bindPort, cleaner); err != nil {
log.With(slog.Any("error", err)).Error("Failed to serve init server")
log.With(slog.Any("error", err)).Error("Failed to serve init server. Rebooting...")
time.Sleep(20 * time.Second) // ensure log message is written
reboot()
}
}()

View file

@ -129,7 +129,9 @@ func (s *Server) Serve(ip, port string, cleaner cleaner) error {
// If Init failed, we mark the disk for reset, so the node can restart the process
// In this case we don't care about any potential errors from the grpc server
if s.initFailure != nil {
return errors.Join(s.initFailure, s.markDiskForReset())
s.log.Error("Fatal error during Init request", "error", s.initFailure)
resetErr := s.markDiskForReset()
return errors.Join(s.initFailure, resetErr)
}
return err

View file

@ -150,7 +150,8 @@ func (c *JoinClient) Start(cleaner cleaner) error {
if err := c.startNodeAndJoin(ticket, kubeletKey, cleaner); err != nil {
c.log.With(slog.Any("error", err)).Error("Failed to start node and join cluster") // unrecoverable error
return errors.Join(err, c.markDiskForReset())
resetErr := c.markDiskForReset()
return errors.Join(err, resetErr)
}
return nil
@ -293,9 +294,10 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
// sometimes fails transiently, and we don't want to brick the node because of that.
for i := range 3 {
err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
if err != nil {
c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
if err == nil {
break
}
c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
}
if err != nil {
return fmt.Errorf("joining Kubernetes cluster: %w", err)