Mirror of https://github.com/edgelesssys/constellation.git
bootstrapper: bounded retry of k8s join (#2968)
commit 03fbcafe68
parent 8b41bcaecc
@@ -150,6 +150,7 @@ func (c *JoinClient) Start(cleaner cleaner) {
 				return
 			} else if isUnrecoverable(err) {
 				c.log.With(slog.Any("error", err)).Error("Unrecoverable error occurred")
+				// TODO(burgerdev): this should eventually lead to a full node reset
 				return
 			}
 			c.log.With(slog.Any("error", err)).Warn("Join failed for all available endpoints")
@@ -310,7 +311,15 @@ func (c *JoinClient) startNodeAndJoin(ticket *joinproto.IssueJoinTicketResponse,
 		CACertHashes: []string{ticket.DiscoveryTokenCaCertHash},
 	}
 
-	if err := c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log); err != nil {
+	// We currently cannot recover from any failure in this function. Joining the k8s cluster
+	// sometimes fails transiently, and we don't want to brick the node because of that.
+	for i := 0; i < 3; i++ {
+		err = c.joiner.JoinCluster(ctx, btd, c.role, ticket.KubernetesComponents, c.log)
+		if err != nil {
+			c.log.Error("failed to join k8s cluster", "role", c.role, "attempt", i, "error", err)
+		}
+	}
+	if err != nil {
 		return fmt.Errorf("joining Kubernetes cluster: %w", err)
 	}
 
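As written, the loop above always makes three JoinCluster attempts and only inspects the error left by the last one; per-attempt failures are logged but do not abort the retry. A minimal, self-contained sketch of the same bounded-retry idea, with the common variation of returning early on the first success, could look like the following (the package name, boundedRetry, and the join callback are illustrative only, not part of this commit):

package joinretry // hypothetical package, for illustration only

import (
	"context"
	"fmt"
	"log/slog"
)

// boundedRetry calls join up to attempts times, logging each failure.
// Unlike the loop in the commit, it stops as soon as one attempt succeeds.
func boundedRetry(ctx context.Context, log *slog.Logger, attempts int, join func(context.Context) error) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = join(ctx); err == nil {
			return nil
		}
		log.Error("failed to join k8s cluster", "attempt", i, "error", err)
	}
	return fmt.Errorf("joining Kubernetes cluster: %w", err)
}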
@@ -62,6 +62,7 @@ func TestClient(t *testing.T) {
 		apiAnswers []any
 		wantLock bool
 		wantJoin bool
+		wantNumJoins int
 	}{
 		"on worker: metadata self: errors occur": {
 			role: role.Worker,
@@ -168,12 +169,26 @@ func TestClient(t *testing.T) {
 				listAnswer{instances: peers},
 				issueJoinTicketAnswer{},
 			},
-			clusterJoiner: &stubClusterJoiner{joinClusterErr: someErr},
+			clusterJoiner: &stubClusterJoiner{numBadCalls: -1, joinClusterErr: someErr},
 			nodeLock: newFakeLock(),
 			disk: &stubDisk{},
 			wantJoin: true,
 			wantLock: true,
 		},
+		"on control plane: joinCluster fails transiently": {
+			role: role.ControlPlane,
+			apiAnswers: []any{
+				selfAnswer{instance: controlSelf},
+				listAnswer{instances: peers},
+				issueJoinTicketAnswer{},
+			},
+			clusterJoiner: &stubClusterJoiner{numBadCalls: 1, joinClusterErr: someErr},
+			nodeLock: newFakeLock(),
+			disk: &stubDisk{},
+			wantJoin: true,
+			wantLock: true,
+			wantNumJoins: 2,
+		},
 		"on control plane: node already locked": {
 			role: role.ControlPlane,
 			apiAnswers: []any{
@@ -250,9 +265,12 @@ func TestClient(t *testing.T) {
 			client.Stop()
 
 			if tc.wantJoin {
-				assert.True(tc.clusterJoiner.joinClusterCalled)
+				assert.Greater(tc.clusterJoiner.joinClusterCalled, 0)
 			} else {
-				assert.False(tc.clusterJoiner.joinClusterCalled)
+				assert.Equal(0, tc.clusterJoiner.joinClusterCalled)
+			}
+			if tc.wantNumJoins > 0 {
+				assert.GreaterOrEqual(tc.clusterJoiner.joinClusterCalled, tc.wantNumJoins)
 			}
 			if tc.wantLock {
 				assert.False(client.nodeLock.TryLockOnce(nil)) // lock should be locked
@@ -398,12 +416,17 @@ type issueJoinTicketAnswer struct {
 }
 
 type stubClusterJoiner struct {
-	joinClusterCalled bool
+	joinClusterCalled int
+	numBadCalls int
 	joinClusterErr error
 }
 
 func (j *stubClusterJoiner) JoinCluster(context.Context, *kubeadm.BootstrapTokenDiscovery, role.Role, components.Components, *slog.Logger) error {
-	j.joinClusterCalled = true
+	j.joinClusterCalled++
+	if j.numBadCalls == 0 {
+		return nil
+	}
+	j.numBadCalls--
 	return j.joinClusterErr
 }
 
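The stub now treats numBadCalls as a budget of failures: each failing call decrements it, JoinCluster succeeds once the budget reaches zero, and seeding it with -1 means the budget never runs out, so every call fails. A tiny self-contained sketch of that budget pattern, detached from the joinclient types (the package name, failingStub, and Do are illustrative only, not from this commit):

package stubbudget // hypothetical package, for illustration only

import "errors"

// failingStub mirrors the numBadCalls idea above: fail numBadCalls times,
// then succeed; a budget of -1 never reaches zero, so it fails forever.
type failingStub struct {
	calls       int
	numBadCalls int
}

func (s *failingStub) Do() error {
	s.calls++
	if s.numBadCalls == 0 {
		return nil
	}
	s.numBadCalls--
	return errors.New("transient failure")
}

With numBadCalls: 1, the first call fails and every later call succeeds; the new "joinCluster fails transiently" test case relies on exactly this sequencing to require at least two observed join attempts (wantNumJoins: 2).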