mirror of
https://github.com/edgelesssys/constellation.git
synced 2025-07-10 17:09:27 -04:00
Fix joining nodes retrying kubeadm 3 times in all cases, even after a successful join
Signed-off-by: Daniel Weiße <dw@edgeless.systems>
This commit is contained in:
parent
21ba262a49
commit
cc615f4072
3 changed files with 13 additions and 6 deletions
|
@ -13,6 +13,7 @@ import (
|
|||
"net"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/edgelesssys/constellation/v2/bootstrapper/internal/clean"
|
||||
"github.com/edgelesssys/constellation/v2/bootstrapper/internal/diskencryption"
|
||||
|
@ -74,7 +75,8 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
|
|||
go func() {
|
||||
defer wg.Done()
|
||||
if err := joinClient.Start(cleaner); err != nil {
|
||||
log.With(slog.Any("error", err)).Error("Failed to join cluster")
|
||||
log.With(slog.Any("error", err)).Error("Failed to join cluster. Rebooting...")
|
||||
time.Sleep(20 * time.Second) // ensure log message is written
|
||||
reboot()
|
||||
}
|
||||
}()
|
||||
|
@ -83,7 +85,8 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
|
|||
go func() {
|
||||
defer wg.Done()
|
||||
if err := initServer.Serve(bindIP, bindPort, cleaner); err != nil {
|
||||
log.With(slog.Any("error", err)).Error("Failed to serve init server")
|
||||
log.With(slog.Any("error", err)).Error("Failed to serve init server. Rebooting...")
|
||||
time.Sleep(20 * time.Second) // ensure log message is written
|
||||
reboot()
|
||||
}
|
||||
}()
|
||||
|
|
|
@ -129,7 +129,9 @@ func (s *Server) Serve(ip, port string, cleaner cleaner) error {
|
|||
// If Init failed, we mark the disk for reset, so the node can restart the process
|
||||
// In this case we don't care about any potential errors from the gRPC server.
|
||||
if s.initFailure != nil {
|
||||
return errors.Join(s.initFailure, s.markDiskForReset())
|
||||
s.log.Error("Fatal error during Init request", "error", s.initFailure)
|
||||
resetErr := s.markDiskForReset()
|
||||
return errors.Join(s.initFailure, resetErr)
|
||||
}
|
||||
|
||||
return err
|
||||
|
|
|
@ -150,7 +150,8 @@ func (c *JoinClient) Start(cleaner cleaner) error {
|
|||
|
||||
if err := c.startNodeAndJoin(ticket, kubeletKey, cleaner); err != nil {
|
||||
c.log.With(slog.Any("error", err)).Error("Failed to start node and join cluster") // unrecoverable error
|
||||
return errors.Join(err, c.markDiskForReset())
|
||||
resetErr := c.markDiskForReset()
|
||||
return errors.Join(err, resetErr)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
@ -293,9 +294,10 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
|
|||
// sometimes fails transiently, and we don't want to brick the node because of that.
|
||||
for i := range 3 {
|
||||
err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
|
||||
if err != nil {
|
||||
c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
|
||||
}
|
||||
if err != nil {
|
||||
return fmt.Errorf("joining Kubernetes cluster: %w", err)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue