cli: report log collection failure to user (#2354)

* Report log collection failure to user

* Try collecting logs for more error cases

---------

Signed-off-by: Daniel Weiße <dw@edgeless.systems>
This commit is contained in:
Daniel Weiße 2023-09-25 12:10:07 +02:00 committed by GitHub
parent d0e3e494ba
commit fa4da88375
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 96 additions and 23 deletions

View File

@@ -235,8 +235,12 @@ func (i *initCmd) initialize(
if errors.As(err, &nonRetriable) { if errors.As(err, &nonRetriable) {
cmd.PrintErrln("Cluster initialization failed. This error is not recoverable.") cmd.PrintErrln("Cluster initialization failed. This error is not recoverable.")
cmd.PrintErrln("Terminate your cluster and try again.") cmd.PrintErrln("Terminate your cluster and try again.")
if nonRetriable.logCollectionErr != nil {
cmd.PrintErrf("Failed to collect logs from bootstrapper: %s\n", nonRetriable.logCollectionErr)
} else {
cmd.PrintErrf("Fetched bootstrapper logs are stored in %q\n", i.pf.PrefixPrintablePath(constants.ErrorLog)) cmd.PrintErrf("Fetched bootstrapper logs are stored in %q\n", i.pf.PrefixPrintablePath(constants.ErrorLog))
} }
}
return err return err
} }
i.log.Debugf("Initialization request succeeded") i.log.Debugf("Initialization request succeeded")
@@ -330,7 +334,10 @@ func (d *initDoer) Do(ctx context.Context) error {
// connectedOnce is set in handleGRPCStateChanges when a connection was established in one retry attempt. // connectedOnce is set in handleGRPCStateChanges when a connection was established in one retry attempt.
// This should cancel any other retry attempts when the connection is lost since the bootstrapper likely won't accept any new attempts anymore. // This should cancel any other retry attempts when the connection is lost since the bootstrapper likely won't accept any new attempts anymore.
if d.connectedOnce { if d.connectedOnce {
return &nonRetriableError{errors.New("init already connected to the remote server in a previous attempt - resumption is not supported")} return &nonRetriableError{
logCollectionErr: errors.New("init already connected to the remote server in a previous attempt - resumption is not supported"),
err: errors.New("init already connected to the remote server in a previous attempt - resumption is not supported"),
}
} }
conn, err := d.dialer.Dial(ctx, d.endpoint) conn, err := d.dialer.Dial(ctx, d.endpoint)
@@ -351,31 +358,58 @@ func (d *initDoer) Do(ctx context.Context) error {
d.log.Debugf("Created protoClient") d.log.Debugf("Created protoClient")
resp, err := protoClient.Init(ctx, d.req) resp, err := protoClient.Init(ctx, d.req)
if err != nil { if err != nil {
return &nonRetriableError{fmt.Errorf("init call: %w", err)} return &nonRetriableError{
logCollectionErr: errors.New("rpc failed before first response was received - no logs available"),
err: fmt.Errorf("init call: %w", err),
}
} }
res, err := resp.Recv() // get first response, either success or failure res, err := resp.Recv() // get first response, either success or failure
if err != nil { if err != nil {
if e := d.getLogs(resp); e != nil { if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to collect logs: %s", e) d.log.Debugf("Failed to collect logs: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: err,
} }
return &nonRetriableError{err} }
return &nonRetriableError{err: err}
} }
switch res.Kind.(type) { switch res.Kind.(type) {
case *initproto.InitResponse_InitFailure: case *initproto.InitResponse_InitFailure:
if e := d.getLogs(resp); e != nil { if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to get logs from cluster: %s", e) d.log.Debugf("Failed to get logs from cluster: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: errors.New(res.GetInitFailure().GetError()),
} }
return &nonRetriableError{errors.New(res.GetInitFailure().GetError())} }
return &nonRetriableError{err: errors.New(res.GetInitFailure().GetError())}
case *initproto.InitResponse_InitSuccess: case *initproto.InitResponse_InitSuccess:
d.resp = res.GetInitSuccess() d.resp = res.GetInitSuccess()
case nil: case nil:
d.log.Debugf("Cluster returned nil response type") d.log.Debugf("Cluster returned nil response type")
return &nonRetriableError{errors.New("empty response from cluster")} err = errors.New("empty response from cluster")
if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to collect logs: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: err,
}
}
return &nonRetriableError{err: err}
default: default:
d.log.Debugf("Cluster returned unknown response type") d.log.Debugf("Cluster returned unknown response type")
return &nonRetriableError{errors.New("unknown response from cluster")} err = errors.New("unknown response from cluster")
if e := d.getLogs(resp); e != nil {
d.log.Debugf("Failed to collect logs: %s", e)
return &nonRetriableError{
logCollectionErr: e,
err: err,
}
}
return &nonRetriableError{err: err}
} }
return nil return nil
@@ -392,9 +426,18 @@ func (d *initDoer) getLogs(resp initproto.API_InitClient) error {
return err return err
} }
switch res.Kind.(type) {
case *initproto.InitResponse_InitFailure:
return errors.New("trying to collect logs: received init failure response, expected log response")
case *initproto.InitResponse_InitSuccess:
return errors.New("trying to collect logs: received init success response, expected log response")
case nil:
return errors.New("trying to collect logs: received nil response, expected log response")
}
log := res.GetLog().GetLog() log := res.GetLog().GetLog()
if log == nil { if log == nil {
return errors.New("sent empty logs") return errors.New("received empty logs")
} }
if err := d.fh.Write(constants.ErrorLog, log, file.OptAppend); err != nil { if err := d.fh.Write(constants.ErrorLog, log, file.OptAppend); err != nil {
@@ -609,6 +652,7 @@ type grpcDialer interface {
} }
type nonRetriableError struct { type nonRetriableError struct {
logCollectionErr error
err error err error
} }

View File

@@ -90,22 +90,47 @@ func TestInitialize(t *testing.T) {
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
configMutator: func(c *config.Config) { c.Provider.GCP.ServiceAccountKeyPath = serviceAccPath }, configMutator: func(c *config.Config) { c.Provider.GCP.ServiceAccountKeyPath = serviceAccPath },
serviceAccKey: gcpServiceAccKey, serviceAccKey: gcpServiceAccKey,
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
}, },
"initialize some azure instances": { "initialize some azure instances": {
provider: cloudprovider.Azure, provider: cloudprovider.Azure,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
}, },
"initialize some qemu instances": { "initialize some qemu instances": {
provider: cloudprovider.QEMU, provider: cloudprovider.QEMU,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
}, },
"non retriable error": { "non retriable error": {
provider: cloudprovider.QEMU, provider: cloudprovider.QEMU,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{initErr: &nonRetriableError{assert.AnError}}, initServerAPI: &stubInitServer{initErr: &nonRetriableError{err: assert.AnError}},
retriable: false,
masterSecretShouldExist: true,
wantErr: true,
},
"non retriable error with failed log collection": {
provider: cloudprovider.QEMU,
idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{
res: []*initproto.InitResponse{
{
Kind: &initproto.InitResponse_InitFailure{
InitFailure: &initproto.InitFailureResponse{
Error: "error",
},
},
},
{
Kind: &initproto.InitResponse_InitFailure{
InitFailure: &initproto.InitFailureResponse{
Error: "error",
},
},
},
},
},
retriable: false, retriable: false,
masterSecretShouldExist: true, masterSecretShouldExist: true,
wantErr: true, wantErr: true,
@@ -132,7 +157,7 @@ func TestInitialize(t *testing.T) {
"k8s version without v works": { "k8s version without v works": {
provider: cloudprovider.Azure, provider: cloudprovider.Azure,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
configMutator: func(c *config.Config) { configMutator: func(c *config.Config) {
res, err := versions.NewValidK8sVersion(strings.TrimPrefix(string(versions.Default), "v"), true) res, err := versions.NewValidK8sVersion(strings.TrimPrefix(string(versions.Default), "v"), true)
require.NoError(t, err) require.NoError(t, err)
@@ -142,7 +167,7 @@ func TestInitialize(t *testing.T) {
"outdated k8s patch version doesn't work": { "outdated k8s patch version doesn't work": {
provider: cloudprovider.Azure, provider: cloudprovider.Azure,
idFile: &clusterid.File{IP: "192.0.2.1"}, idFile: &clusterid.File{IP: "192.0.2.1"},
initServerAPI: &stubInitServer{res: &initproto.InitResponse{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}, initServerAPI: &stubInitServer{res: []*initproto.InitResponse{{Kind: &initproto.InitResponse_InitSuccess{InitSuccess: testInitResp}}}},
configMutator: func(c *config.Config) { configMutator: func(c *config.Config) {
v, err := semver.New(versions.SupportedK8sVersions()[0]) v, err := semver.New(versions.SupportedK8sVersions()[0])
require.NoError(t, err) require.NoError(t, err)
@@ -458,7 +483,8 @@ func TestAttestation(t *testing.T) {
assert := assert.New(t) assert := assert.New(t)
require := require.New(t) require := require.New(t)
initServerAPI := &stubInitServer{res: &initproto.InitResponse{ initServerAPI := &stubInitServer{res: []*initproto.InitResponse{
{
Kind: &initproto.InitResponse_InitSuccess{ Kind: &initproto.InitResponse_InitSuccess{
InitSuccess: &initproto.InitSuccessResponse{ InitSuccess: &initproto.InitSuccessResponse{
Kubeconfig: []byte("kubeconfig"), Kubeconfig: []byte("kubeconfig"),
@@ -466,6 +492,7 @@ func TestAttestation(t *testing.T) {
ClusterId: []byte("clusterID"), ClusterId: []byte("clusterID"),
}, },
}, },
},
}} }}
existingIDFile := &clusterid.File{IP: "192.0.2.4", CloudProvider: cloudprovider.QEMU} existingIDFile := &clusterid.File{IP: "192.0.2.4", CloudProvider: cloudprovider.QEMU}
@@ -577,14 +604,16 @@ func (i *testIssuer) Issue(_ context.Context, userData []byte, _ []byte) ([]byte
} }
type stubInitServer struct { type stubInitServer struct {
res *initproto.InitResponse res []*initproto.InitResponse
initErr error initErr error
initproto.UnimplementedAPIServer initproto.UnimplementedAPIServer
} }
func (s *stubInitServer) Init(_ *initproto.InitRequest, stream initproto.API_InitServer) error { func (s *stubInitServer) Init(_ *initproto.InitRequest, stream initproto.API_InitServer) error {
_ = stream.Send(s.res) for _, r := range s.res {
_ = stream.Send(r)
}
return s.initErr return s.initErr
} }