Mirror of https://github.com/edgelesssys/constellation.git
bootstrapper: move fixing & waiting for Cilium to earlier stage

commit 97d95bd48c
parent 122b5ff0a0
@@ -247,23 +247,19 @@ type SetupPodNetworkInput struct {
 	LoadBalancerEndpoint string
 }
 
-// FixCilium fixes https://github.com/cilium/cilium/issues/19958 but instead of a rollout restart of
-// the cilium daemonset, it only restarts the local cilium pod.
-func (k *KubernetesUtil) FixCilium(log *logger.Logger) {
-	ctx := context.Background()
-
+// WaitForCilium waits until Cilium reports a healthy status over its /healthz endpoint.
+func (k *KubernetesUtil) WaitForCilium(ctx context.Context, log *logger.Logger) error {
 	// wait for cilium pod to be healthy
 	client := http.Client{}
 	for {
 		time.Sleep(5 * time.Second)
 		req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://127.0.0.1:9879/healthz", http.NoBody)
 		if err != nil {
-			log.With(zap.Error(err)).Errorf("Unable to create request")
-			continue
+			return fmt.Errorf("unable to create request: %w", err)
 		}
 		resp, err := client.Do(req)
 		if err != nil {
-			log.With(zap.Error(err)).Warnf("Waiting for local cilium daemonset pod not healthy")
+			log.With(zap.Error(err)).Infof("Waiting for local Cilium DaemonSet - Pod not healthy yet")
 			continue
 		}
 		resp.Body.Close()
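For context: the new WaitForCilium polls Cilium's local health endpoint every five seconds until it answers. The check on the response status falls between the two hunks shown here, so the 200-OK break condition below is an assumption. A minimal, self-contained sketch of the polling pattern, not the commit's exact code:

package main

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

// waitForHealthz polls an HTTP health endpoint until it returns 200 OK
// or the context expires. Hypothetical helper for illustration; the
// select on ctx.Done is an addition over the diff's plain time.Sleep.
func waitForHealthz(ctx context.Context, endpoint string) error {
	client := http.Client{}
	for {
		select {
		case <-ctx.Done():
			return fmt.Errorf("waiting for %s: %w", endpoint, ctx.Err())
		case <-time.After(5 * time.Second):
		}
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, http.NoBody)
		if err != nil {
			return fmt.Errorf("unable to create request: %w", err)
		}
		resp, err := client.Do(req)
		if err != nil {
			continue // endpoint not reachable yet, retry
		}
		resp.Body.Close()
		if resp.StatusCode == http.StatusOK { // assumed success condition
			return nil
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	fmt.Println(waitForHealthz(ctx, "http://127.0.0.1:9879/healthz"))
}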
@@ -272,42 +268,45 @@ func (k *KubernetesUtil) FixCilium(log *logger.Logger) {
 		}
 	}
 
+	return nil
+}
+
+// FixCilium fixes https://github.com/cilium/cilium/issues/19958
+// Instead of a rollout restart of the Cilium DaemonSet, it only restarts the local Cilium Pod.
+func (k *KubernetesUtil) FixCilium(ctx context.Context) error {
 	// get cilium container id
 	out, err := exec.CommandContext(ctx, "/run/state/bin/crictl", "ps", "--name", "cilium-agent", "-q").CombinedOutput()
 	if err != nil {
-		log.With(zap.Error(err)).Errorf("Getting cilium container id failed: %s", out)
-		return
+		return fmt.Errorf("getting cilium container id failed: %s", out)
 	}
 	outLines := strings.Split(string(out), "\n")
 	if len(outLines) < 2 {
-		log.Errorf("Getting cilium container id returned invalid output: %s", out)
-		return
+		return fmt.Errorf("getting cilium container id returned invalid output: %s", out)
 	}
 	containerID := outLines[len(outLines)-2]
 
 	// get cilium pod id
 	out, err = exec.CommandContext(ctx, "/run/state/bin/crictl", "inspect", "-o", "go-template", "--template", "{{ .info.sandboxID }}", containerID).CombinedOutput()
 	if err != nil {
-		log.With(zap.Error(err)).Errorf("Getting cilium pod id failed: %s", out)
-		return
+		return fmt.Errorf("getting Cilium Pod ID failed: %s", out)
 	}
 	outLines = strings.Split(string(out), "\n")
 	if len(outLines) < 2 {
-		log.Errorf("Getting cilium pod id returned invalid output: %s", out)
-		return
+		return fmt.Errorf("getting Cilium Pod ID returned invalid output: %s", out)
 	}
 	podID := outLines[len(outLines)-2]
 
 	// stop and delete pod
 	out, err = exec.CommandContext(ctx, "/run/state/bin/crictl", "stopp", podID).CombinedOutput()
 	if err != nil {
-		log.With(zap.Error(err)).Errorf("Stopping cilium agent pod failed: %s", out)
-		return
+		return fmt.Errorf("stopping Cilium agent Pod failed: %s", out)
 	}
 	out, err = exec.CommandContext(ctx, "/run/state/bin/crictl", "rmp", podID).CombinedOutput()
 	if err != nil {
-		log.With(zap.Error(err)).Errorf("Removing cilium agent pod failed: %s", out)
+		return fmt.Errorf("removing Cilium agent Pod failed: %s", out)
 	}
+
+	return nil
 }
 
 // JoinCluster joins existing Kubernetes cluster using kubeadm join.
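FixCilium takes the second-to-last element of the split crictl output because CombinedOutput returns the ID followed by a trailing newline: splitting on "\n" therefore yields an empty final element, and the len < 2 guard rejects output without that shape. A small self-contained sketch of the same parsing (the helper name is illustrative, not from the commit):

package main

import (
	"fmt"
	"strings"
)

// lastLine returns the last real line of command output that ends with
// a trailing newline, mirroring the outLines[len(outLines)-2] indexing
// in the diff. Hypothetical helper for illustration.
func lastLine(out string) (string, error) {
	lines := strings.Split(out, "\n")
	if len(lines) < 2 {
		return "", fmt.Errorf("invalid output: %q", out)
	}
	return lines[len(lines)-2], nil
}

func main() {
	// crictl prints the container ID followed by "\n", so Split yields
	// ["b59915cbeb9a7", ""] and index len-2 selects the ID.
	id, err := lastLine("b59915cbeb9a7\n")
	fmt.Println(id, err)
}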
@@ -21,7 +21,8 @@ type clusterUtil interface {
 	InstallComponents(ctx context.Context, kubernetesComponents components.Components) error
 	InitCluster(ctx context.Context, initConfig []byte, nodeName, clusterName string, ips []net.IP, controlPlaneEndpoint string, conformanceMode bool, log *logger.Logger) ([]byte, error)
 	JoinCluster(ctx context.Context, joinConfig []byte, peerRole role.Role, controlPlaneEndpoint string, log *logger.Logger) error
-	FixCilium(log *logger.Logger)
+	WaitForCilium(ctx context.Context, log *logger.Logger) error
+	FixCilium(ctx context.Context) error
 	StartKubelet() error
 }
 
@@ -189,6 +189,26 @@ func (k *KubeWrapper) InitCluster(
 		return nil, fmt.Errorf("installing pod network: %w", err)
 	}
 
+	// TODO: The timeout here is high as ghcr.io can be slow sometimes. Reduce this later when we move the repository.
+	// Also remove the logging later.
+	log.Infof("Waiting for Cilium to become healthy")
+	timeToStartWaiting := time.Now()
+	// TODO(Nirusu): Reduce the timeout when we switched the package repository - this is only this high because I once
+	// saw polling times of ~16 minutes when hitting a slow PoP from Fastly (GitHub's / ghcr.io CDN).
+	waitCtx, cancel = context.WithTimeout(ctx, 20*time.Minute)
+	defer cancel()
+	if err := k.clusterUtil.WaitForCilium(waitCtx, log); err != nil {
+		return nil, fmt.Errorf("waiting for Cilium to become healthy: %w", err)
+	}
+	timeUntilFinishedWaiting := time.Since(timeToStartWaiting)
+	log.Infof("Cilium took %s to become healthy", timeUntilFinishedWaiting.Round(time.Second).String())
+
+	log.Infof("Restart Cilium")
+	if err := k.clusterUtil.FixCilium(ctx); err != nil {
+		log.With(zap.Error(err)).Errorf("FixCilium failed")
+		// Continue and don't throw an error here - things might be okay.
+	}
+
 	var controlPlaneIP string
 	if strings.Contains(controlPlaneEndpoint, ":") {
 		controlPlaneIP, _, err = net.SplitHostPort(controlPlaneEndpoint)
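The InitCluster wiring bounds the wait with context.WithTimeout and records how long Cilium actually took. A runnable sketch of that bookkeeping, with a placeholder standing in for clusterUtil.WaitForCilium (all names below are illustrative):

package main

import (
	"context"
	"fmt"
	"time"
)

// waitHealthy stands in for clusterUtil.WaitForCilium so the sketch
// runs on its own; it pretends Cilium gets healthy after two seconds.
func waitHealthy(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	// Bound the wait and measure the elapsed time, mirroring the
	// timeToStartWaiting / time.Since bookkeeping in the diff.
	waitCtx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
	defer cancel()

	start := time.Now()
	if err := waitHealthy(waitCtx); err != nil {
		fmt.Println("waiting for Cilium to become healthy:", err)
		return
	}
	fmt.Printf("Cilium took %s to become healthy\n", time.Since(start).Round(time.Second))
}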
@@ -239,8 +259,6 @@ func (k *KubeWrapper) InitCluster(
 		return nil, fmt.Errorf("installing operators: %w", err)
 	}
 
-	k.clusterUtil.FixCilium(log)
-
 	return kubeConfig, nil
 }
 
@@ -297,7 +315,16 @@ func (k *KubeWrapper) JoinCluster(ctx context.Context, args *kubeadm.BootstrapTo
 		return fmt.Errorf("joining cluster: %v; %w ", string(joinConfigYAML), err)
 	}
 
-	k.clusterUtil.FixCilium(log)
+	log.Infof("Waiting for Cilium to become healthy")
+	if err := k.clusterUtil.WaitForCilium(context.Background(), log); err != nil {
+		return fmt.Errorf("waiting for Cilium to become healthy: %w", err)
+	}
+
+	log.Infof("Restart Cilium")
+	if err := k.clusterUtil.FixCilium(context.Background()); err != nil {
+		log.With(zap.Error(err)).Errorf("FixCilium failed")
+		// Continue and don't throw an error here - things might be okay.
+	}
 
 	return nil
 }
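Note that the join path waits on context.Background() rather than the ctx argument, so the caller's deadline or cancellation does not bound the wait (InitCluster, by contrast, derives waitCtx from ctx). A runnable sketch contrasting the two choices; everything here is illustrative, not part of the commit:

package main

import (
	"context"
	"fmt"
	"time"
)

// wait stands in for WaitForCilium; it returns when the simulated
// health check succeeds or its context is cancelled.
func wait(ctx context.Context) error {
	select {
	case <-time.After(100 * time.Millisecond): // simulated "healthy"
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	// With context.Background() the wait is unbounded; deriving the
	// context from the caller's ctx bounds it, as InitCluster does.
	callerCtx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	fmt.Println("unbounded:", wait(context.Background()))
	fmt.Println("caller-bounded:", wait(callerCtx))
}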
@@ -358,7 +385,17 @@ func (k *KubeWrapper) StartKubelet(log *logger.Logger) error {
 		return fmt.Errorf("starting kubelet: %w", err)
 	}
 
-	k.clusterUtil.FixCilium(log)
+	log.Infof("Waiting for Cilium to become healthy")
+	if err := k.clusterUtil.WaitForCilium(context.Background(), log); err != nil {
+		return fmt.Errorf("waiting for Cilium to become healthy: %w", err)
+	}
+
+	log.Infof("Restart Cilium")
+	if err := k.clusterUtil.FixCilium(context.Background()); err != nil {
+		log.With(zap.Error(err)).Errorf("FixCilium failed")
+		// Continue and don't throw an error here - things might be okay.
+	}
+
 	return nil
 }
 
@@ -510,7 +510,12 @@ func (s *stubClusterUtil) StartKubelet() error {
 	return s.startKubeletErr
 }
 
-func (s *stubClusterUtil) FixCilium(log *logger.Logger) {
+func (s *stubClusterUtil) WaitForCilium(ctx context.Context, log *logger.Logger) error {
+	return nil
+}
+
+func (s *stubClusterUtil) FixCilium(ctx context.Context) error {
+	return nil
 }
 
 type stubConfigProvider struct {
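With both stub methods updated, stubClusterUtil keeps satisfying the changed clusterUtil interface. A common Go idiom to catch this kind of signature drift at compile time is a blank-identifier assertion; a scaled-down, self-contained model of the pattern (all names here are illustrative, not from the Constellation codebase):

package main

import "fmt"

// ciliumFixer models an interface whose method set just changed.
type ciliumFixer interface {
	WaitForCilium(timeoutSeconds int) error
	FixCilium() error
}

type stubFixer struct{}

func (stubFixer) WaitForCilium(int) error { return nil }
func (stubFixer) FixCilium() error        { return nil }

// The blank-identifier assertion fails to compile the moment stubFixer
// drifts from the interface, e.g. when a ctx parameter is added.
var _ ciliumFixer = stubFixer{}

func main() { fmt.Println("stubFixer satisfies ciliumFixer") }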