bootstrapper: bounded retry of k8s join (#2968)

Markus Rudy 2024-03-05 09:14:01 +01:00 committed by GitHub
parent 8b41bcaecc
commit 03fbcafe68
2 changed files with 39 additions and 6 deletions

bootstrapper/internal/joinclient/joinclient.go

@@ -150,6 +150,7 @@ func (c *JoinClient) Start(cleaner cleaner) {
             return
         } else if isUnrecoverable(err) {
             c.log.With(slog.Any("error", err)).Error("Unrecoverable error occurred")
+            // TODO(burgerdev): this should eventually lead to a full node reset
             return
         }
         c.log.With(slog.Any("error", err)).Warn("Join failed for all available endpoints")
@@ -310,7 +311,16 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
         CACertHashes: []string{ticket.DiscoveryTokenCaCertHash},
     }
 
-    if err := c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log); err != nil {
+    // We currently cannot recover from any failure in this function. Joining the k8s cluster
+    // sometimes fails transiently, and we don't want to brick the node because of that.
+    for i := 0; i < 3; i++ {
+        err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
+        if err == nil {
+            break
+        }
+        c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
+    }
+    if err != nil {
         return fmt.Errorf("joining Kubernetes cluster: %w", err)
     }
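
The loop above retries c.joiner.JoinCluster up to three times, logs every failed attempt, stops early on success, and only propagates the error once all attempts are exhausted. Keeping the bound small and fixed is deliberate: unbounded retries could mask a permanently broken node, which is what the unrecoverable-error path above is for. A minimal, self-contained sketch of the same bounded-retry pattern (joinWithRetry, joinOnce, and the attempt count are hypothetical, illustrative names, not part of this commit):

package main

import (
    "errors"
    "fmt"
    "log/slog"
)

// joinWithRetry mirrors the bounded-retry shape introduced above: call
// joinOnce up to `attempts` times, log each failure, stop on the first
// success, and wrap the last error if every attempt failed.
func joinWithRetry(attempts int, joinOnce func() error, log *slog.Logger) error {
    var err error
    for i := 0; i < attempts; i++ {
        err = joinOnce()
        if err == nil {
            return nil
        }
        log.Error("failed to join k8s cluster", "attempt", i, "error", err)
    }
    return fmt.Errorf("joining Kubernetes cluster: %w", err)
}

func main() {
    calls := 0
    // A join that fails once and then succeeds, emulating a transient error.
    flaky := func() error {
        calls++
        if calls < 2 {
            return errors.New("transient apiserver hiccup")
        }
        return nil
    }
    if err := joinWithRetry(3, flaky, slog.Default()); err != nil {
        fmt.Println("giving up:", err)
        return
    }
    fmt.Println("joined after", calls, "attempt(s)")
}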

bootstrapper/internal/joinclient/joinclient_test.go

@@ -62,6 +62,7 @@ func TestClient(t *testing.T) {
         apiAnswers   []any
         wantLock     bool
         wantJoin     bool
+        wantNumJoins int
     }{
         "on worker: metadata self: errors occur": {
             role: role.Worker,
@@ -168,12 +169,26 @@ func TestClient(t *testing.T) {
                 listAnswer{instances: peers},
                 issueJoinTicketAnswer{},
             },
-            clusterJoiner: &stubClusterJoiner{joinClusterErr: someErr},
+            clusterJoiner: &stubClusterJoiner{numBadCalls: -1, joinClusterErr: someErr},
             nodeLock:      newFakeLock(),
             disk:          &stubDisk{},
             wantJoin:      true,
             wantLock:      true,
         },
+        "on control plane: joinCluster fails transiently": {
+            role: role.ControlPlane,
+            apiAnswers: []any{
+                selfAnswer{instance: controlSelf},
+                listAnswer{instances: peers},
+                issueJoinTicketAnswer{},
+            },
+            clusterJoiner: &stubClusterJoiner{numBadCalls: 1, joinClusterErr: someErr},
+            nodeLock:      newFakeLock(),
+            disk:          &stubDisk{},
+            wantJoin:      true,
+            wantLock:      true,
+            wantNumJoins:  2,
+        },
         "on control plane: node already locked": {
             role: role.ControlPlane,
             apiAnswers: []any{
@@ -250,9 +265,12 @@ func TestClient(t *testing.T) {
             client.Stop()
 
             if tc.wantJoin {
-                assert.True(tc.clusterJoiner.joinClusterCalled)
+                assert.Greater(tc.clusterJoiner.joinClusterCalled, 0)
             } else {
-                assert.False(tc.clusterJoiner.joinClusterCalled)
+                assert.Equal(0, tc.clusterJoiner.joinClusterCalled)
             }
+            if tc.wantNumJoins > 0 {
+                assert.GreaterOrEqual(tc.clusterJoiner.joinClusterCalled, tc.wantNumJoins)
+            }
             if tc.wantLock {
                 assert.False(client.nodeLock.TryLockOnce(nil)) // lock should be locked
@@ -398,12 +416,17 @@ type issueJoinTicketAnswer struct {
 }
 
 type stubClusterJoiner struct {
-    joinClusterCalled bool
+    joinClusterCalled int
+    numBadCalls       int
     joinClusterErr    error
 }
 
 func (j *stubClusterJoiner) JoinCluster(context.Context, *kubeadm.BootstrapTokenDiscovery, role.Role, components.Components, *slog.Logger) error {
-    j.joinClusterCalled = true
+    j.joinClusterCalled++
+    if j.numBadCalls == 0 {
+        return nil
+    }
+    j.numBadCalls--
     return j.joinClusterErr
 }
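
The numBadCalls counter is what lets the tests distinguish permanent from transient failures: a positive value makes the stub fail that many calls before succeeding, while -1 never reaches zero, so the stub fails on every call. A self-contained sketch of that counter logic, reduced to the pieces that matter here (counterStub and its call method are illustrative stand-ins, not the test's actual types):

package main

import (
    "errors"
    "fmt"
)

// counterStub reproduces stubClusterJoiner's failure accounting in isolation:
// numBadCalls > 0 fails that many times, numBadCalls == -1 fails forever.
type counterStub struct {
    numBadCalls int
    err         error
}

func (s *counterStub) call() error {
    if s.numBadCalls == 0 {
        return nil
    }
    s.numBadCalls-- // -1 only grows more negative, so it never hits zero
    return s.err
}

func main() {
    transient := &counterStub{numBadCalls: 1, err: errors.New("transient")}
    fmt.Println(transient.call()) // transient
    fmt.Println(transient.call()) // <nil>

    permanent := &counterStub{numBadCalls: -1, err: errors.New("permanent")}
    fmt.Println(permanent.call()) // permanent
    fmt.Println(permanent.call()) // permanent, and so on for every call
}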