cli: report log collection failure to user (#2354)

* Report log collection failure to user

* Try collecting logs for more error cases

---------

Signed-off-by: Daniel Weiße <dw@edgeless.systems>
This commit is contained in:
Daniel Weiße 2023-09-25 12:10:07 +02:00 committed by GitHub
parent d0e3e494ba
commit fa4da88375
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 96 additions and 23 deletions

View File

@ -235,7 +235,11 @@ func (i *initCmd) initialize(
if errors.As(err, &nonRetriable) { if errors.As(err, &nonRetriable) {
cmd.PrintErrln("Cluster initialization failed. This error is not recoverable.") cmd.PrintErrln("Cluster initialization failed. This error is not recoverable.")
cmd.PrintErrln("Terminate your cluster and try again.") cmd.PrintErrln("Terminate your cluster and try again.")
cmd.PrintErrf("Fetched bootstrapper logs are stored in %q\n", i.pf.PrefixPrintablePath(constants.ErrorLog)) if nonRetriable.logCollectionErr != nil {
cmd.PrintErrf("Failed to collect logs from bootstrapper: %s\n", nonRetriable.logCollectionErr)
} else {
cmd.PrintErrf("Fetched bootstrapper logs are stored in %q\n", i.pf.PrefixPrintablePath(constants.ErrorLog))
}
} }
return err return err
} }
@ -330,7 +334,10 @@ func (d *initDoer) Do(ctx context.Context) error {
// connectedOnce is set in handleGRPCStateChanges when a connection was established in one retry attempt. // connectedOnce is set in handleGRPCStateChanges when a connection was established in one retry attempt.
// This should cancel any other retry attempts when the connection is lost since the bootstrapper likely won't accept any new attempts anymore. // This should cancel any other retry attempts when the connection is lost since the bootstrapper likely won't accept any new attempts anymore.
if d.connectedOnce { if d.connectedOnce {
return &nonRetriableError{errors.New("init already connected to the remote server in a previous attempt - resumption is not supported")} return &nonRetriableError{
logCollectionErr: errors.New("init already connected to the remote server in a previous attempt - resumption is not supported"),
err: errors.New("init already connected to the remote server in a previous attempt - resumption is not supported"),
}
} }
conn, err := d.dialer.Dial(ctx, d.endpoint) conn, err := d.dialer.Dial(ctx, d.endpoint)
@ -351,31 +358,58 @@ func (d *initDoer) Do(ctx context.Context) error {
d.log.Debugf("Created protoClient") d.log.Debugf("Created protoClient")
resp, err := protoClient.Init(ctx, d.req) resp, err := protoClient.Init(ctx, d.req)
if err != nil { if err != nil {
return &nonRetriableError{fmt.Errorf("init call: %w", err)} return &nonRetriableError{
logCollectionErr: errors.New("rpc failed before first response was received - no logs available"),
err: fmt.Errorf("init call: %w", err),
}
} }
res, err := resp.Recv() // get first response, either success or failure res, err := resp.Recv() // get first response, either success or failure
if err != nil { if err != nil {
if e := d.getLogs(resp); e != nil { if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to collect logs: %s", e) d.log.Debugf("Failed to collect logs: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: err,
}
} }
return &nonRetriableError{err} return &nonRetriableError{err: err}
} }
switch res.Kind.(type) { switch res.Kind.(type) {
case *initproto.InitResponse_InitFailure: case *initproto.InitResponse_InitFailure:
if e := d.getLogs(resp); e != nil { if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to get logs from cluster: %s", e) d.log.Debugf("Failed to get logs from cluster: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: errors.New(res.GetInitFailure().GetError()),
}
} }
return &nonRetriableError{errors.New(res.GetInitFailure().GetError())} return &nonRetriableError{err: errors.New(res.GetInitFailure().GetError())}
case *initproto.InitResponse_InitSuccess: case *initproto.InitResponse_InitSuccess:
d.resp = res.GetInitSuccess() d.resp = res.GetInitSuccess()
case nil: case nil:
d.log.Debugf("Cluster returned nil response type") d.log.Debugf("Cluster returned nil response type")
return &nonRetriableError{errors.New("empty response from cluster")} err = errors.New("empty response from cluster")
if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to collect logs: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: err,
}
}
return &nonRetriableError{err: err}
default: default:
d.log.Debugf("Cluster returned unknown response type") d.log.Debugf("Cluster returned unknown response type")
return &nonRetriableError{errors.New("unknown response from cluster")} err = errors.New("unknown response from cluster")
if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to collect logs: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: err,
}
}
return &nonRetriableError{err: err}
} }
return nil return nil
@ -392,9 +426,18 @@ func (d *initDoer) getLogs(resp initproto.API_InitClient) error {
return err return err
} }
switch res.Kind.(type) {
case *initproto.InitResponse_InitFailure:
return errors.New("trying to collect logs: received init failure response, expected log response")
case *initproto.InitResponse_InitSuccess:
return errors.New("trying to collect logs: received init success response, expected log response")
case nil:
return errors.New("trying to collect logs: received nil response, expected log response")
}
log := res.GetLog().GetLog() log := res.GetLog().GetLog()
if log == nil { if log == nil {
return errors.New("sent empty logs") return errors.New("received empty logs")
} }
if err := d.fh.Write(constants.ErrorLog, log, file.OptAppend); err != nil { if err := d.fh.Write(constants.ErrorLog, log, file.OptAppend); err != nil {
@ -609,7 +652,8 @@ type grpcDialer interface {
} }
type nonRetriableError struct { type nonRetriableError struct {
err error logCollectionErr error
err error
} }
// Error returns the error message. // Error returns the error message.

View File

@ -90,22 +90,47 @@ func TestInitialize(t *testing.T) {
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
configMutator: func(c *config.Config) { c.Provider.GCP.ServiceAccountKeyPath = serviceAccPath }, configMutator: func(c *config.Config) { c.Provider.GCP.ServiceAccountKeyPath = serviceAccPath },
serviceAccKey: gcpServiceAccKey, serviceAccKey: gcpServiceAccKey,
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
}, },
"initialize some azure instances": { "initialize some azure instances": {
provider: cloudprovider.Azure, provider: cloudprovider.Azure,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
}, },
"initialize some qemu instances": { "initialize some qemu instances": {
provider: cloudprovider.QEMU, provider: cloudprovider.QEMU,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
}, },
"non retriable error": { "non retriable error": {
provider: cloudprovider.QEMU, provider: cloudprovider.QEMU,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{initErr: &nonRetriableError{assert.AnError}}, initServerAPI: &stubInitServer{initErr: &nonRetriableError{err: assert.AnError}},
retriable: false,
masterSecretShouldExist: true,
wantErr: true,
},
"non retriable error with failed log collection": {
provider: cloudprovider.QEMU,
idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{
res: []*initproto.InitResponse{
{
Kind: &initproto.InitResponse_InitFailure{
InitFailure: &initproto.InitFailureResponse{
Error: "error",
},
},
},
{
Kind: &initproto.InitResponse_InitFailure{
InitFailure: &initproto.InitFailureResponse{
Error: "error",
},
},
},
},
},
retriable: false, retriable: false,
masterSecretShouldExist: true, masterSecretShouldExist: true,
wantErr: true, wantErr: true,
@ -132,7 +157,7 @@ func TestInitialize(t *testing.T) {
"k8s version without v works": { "k8s version without v works": {
provider: cloudprovider.Azure, provider: cloudprovider.Azure,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
configMutator: func(c *config.Config) { configMutator: func(c *config.Config) {
res, err := versions.NewValidK8sVersion(strings.TrimPrefix(string(versions.Default), "v"), true) res, err := versions.NewValidK8sVersion(strings.TrimPrefix(string(versions.Default), "v"), true)
require.NoError(t, err) require.NoError(t, err)
@ -142,7 +167,7 @@ func TestInitialize(t *testing.T) {
"outdated k8s patch version doesn't work": { "outdated k8s patch version doesn't work": {
provider: cloudprovider.Azure, provider: cloudprovider.Azure,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
configMutator: func(c *config.Config) { configMutator: func(c *config.Config) {
v, err := semver.New(versions.SupportedK8sVersions()[0]) v, err := semver.New(versions.SupportedK8sVersions()[0])
require.NoError(t, err) require.NoError(t, err)
@ -458,12 +483,14 @@ func TestAttestation(t *testing.T) {
assert := assert.New(t) assert := assert.New(t)
require := require.New(t) require := require.New(t)
initServerAPI := &stubInitServer{res: &initproto.InitResponse{ initServerAPI := &stubInitServer{res: []*initproto.InitResponse{
Kind: &initproto.InitResponse_InitSuccess{ {
InitSuccess: &initproto.InitSuccessResponse{ Kind: &initproto.InitResponse_InitSuccess{
Kubeconfig: []byte("kubeconfig"), InitSuccess: &initproto.InitSuccessResponse{
OwnerId: []byte("ownerID"), Kubeconfig: []byte("kubeconfig"),
ClusterId: []byte("clusterID"), OwnerId: []byte("ownerID"),
ClusterId: []byte("clusterID"),
},
}, },
}, },
}} }}
@ -577,14 +604,16 @@ func (i *testIssuer) Issue(_ context.Context, userData []byte, _ []byte) ([]byte
} }
type stubInitServer struct { type stubInitServer struct {
res *initproto.InitResponse res []*initproto.InitResponse
initErr error initErr error
initproto.UnimplementedAPIServer initproto.UnimplementedAPIServer
} }
func (s *stubInitServer) Init(_ *initproto.InitRequest, stream initproto.API_InitServer) error { func (s *stubInitServer) Init(_ *initproto.InitRequest, stream initproto.API_InitServer) error {
_ = stream.Send(s.res) for _, r := range s.res {
_ = stream.Send(r)
}
return s.initErr return s.initErr
} }