bootstrapper: remove cilium restart fix
Tests concluded that restarting the Cilium agent after the first boot is no longer needed for pods to regain connectivity.
parent 1972b635b4
commit 79f562374a

9 changed files with 4 additions and 207 deletions
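For context, the removed workaround waited for the local cilium-agent to report healthy on its /healthz endpoint and then restarted the agent Pod via crictl (see the removed WaitForCilium and FixCilium in the diff below). A minimal, standalone sketch of just that health probe — not part of this commit; the 5-second poll interval and 5-minute overall timeout are arbitrary choices here:

package main

import (
    "context"
    "fmt"
    "net/http"
    "time"
)

// probeCiliumHealth polls the cilium-agent's local health endpoint
// (the same endpoint the removed WaitForCilium helper queried) until
// it returns 200 OK or the context expires.
func probeCiliumHealth(ctx context.Context) error {
    client := &http.Client{Timeout: 3 * time.Second}
    ticker := time.NewTicker(5 * time.Second) // poll interval chosen arbitrarily
    defer ticker.Stop()
    for {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://127.0.0.1:9879/healthz", http.NoBody)
        if err != nil {
            return fmt.Errorf("creating request: %w", err)
        }
        resp, err := client.Do(req)
        if err == nil {
            resp.Body.Close()
            if resp.StatusCode == http.StatusOK {
                return nil
            }
        }
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-ticker.C:
        }
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()
    if err := probeCiliumHealth(ctx); err != nil {
        fmt.Println("cilium-agent not healthy:", err)
        return
    }
    fmt.Println("cilium-agent reports healthy")
}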
@@ -51,7 +51,7 @@ func run(issuer atls.Issuer, openDevice vtpm.TPMOpenFunc, fileHandler file.Handl
     }

     if nodeBootstrapped {
-        if err := kube.StartKubelet(log); err != nil {
+        if err := kube.StartKubelet(); err != nil {
             log.With(zap.Error(err)).Fatalf("Failed to restart kubelet")
         }
         return
@@ -93,7 +93,7 @@ func getDiskUUID() (string, error) {
 type clusterInitJoiner interface {
     joinclient.ClusterJoiner
     initserver.ClusterInitializer
-    StartKubelet(*logger.Logger) error
+    StartKubelet() error
 }

 type metadataAPI interface {
@@ -33,7 +33,7 @@ func (c *clusterFake) JoinCluster(context.Context, *kubeadm.BootstrapTokenDiscov
 }

 // StartKubelet starts the kubelet service.
-func (c *clusterFake) StartKubelet(*logger.Logger) error {
+func (c *clusterFake) StartKubelet() error {
     return nil
 }

@@ -14,7 +14,6 @@ import (
     "errors"
     "fmt"
     "net"
-    "net/http"
     "os"
     "os/exec"
     "path/filepath"
@@ -250,71 +249,6 @@ type SetupPodNetworkInput struct {
     LoadBalancerPort string
 }

-// WaitForCilium waits until Cilium reports a healthy status over its /healthz endpoint.
-func (k *KubernetesUtil) WaitForCilium(ctx context.Context, log *logger.Logger) error {
-    // wait for cilium pod to be healthy
-    client := http.Client{}
-    for {
-        select {
-        case <-ctx.Done():
-            return ctx.Err()
-        default:
-            time.Sleep(5 * time.Second)
-            req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://127.0.0.1:9879/healthz", http.NoBody)
-            if err != nil {
-                return fmt.Errorf("unable to create request: %w", err)
-            }
-            resp, err := client.Do(req)
-            if err != nil {
-                log.With(zap.Error(err)).Infof("Waiting for local Cilium DaemonSet - Pod not healthy yet")
-                continue
-            }
-            resp.Body.Close()
-            if resp.StatusCode == 200 {
-                return nil
-            }
-        }
-    }
-}
-
-// FixCilium fixes https://github.com/cilium/cilium/issues/19958
-// Instead of a rollout restart of the Cilium DaemonSet, it only restarts the local Cilium Pod.
-func (k *KubernetesUtil) FixCilium(ctx context.Context) error {
-    // get cilium container id
-    out, err := exec.CommandContext(ctx, "/run/state/bin/crictl", "ps", "--name", "cilium-agent", "-q").CombinedOutput()
-    if err != nil {
-        return fmt.Errorf("getting cilium container id failed: %s", out)
-    }
-    outLines := strings.Split(string(out), "\n")
-    if len(outLines) < 2 {
-        return fmt.Errorf("getting cilium container id returned invalid output: %s", out)
-    }
-    containerID := outLines[len(outLines)-2]
-
-    // get cilium pod id
-    out, err = exec.CommandContext(ctx, "/run/state/bin/crictl", "inspect", "-o", "go-template", "--template", "{{ .info.sandboxID }}", containerID).CombinedOutput()
-    if err != nil {
-        return fmt.Errorf("getting Cilium Pod ID failed: %s", out)
-    }
-    outLines = strings.Split(string(out), "\n")
-    if len(outLines) < 2 {
-        return fmt.Errorf("getting Cilium Pod ID returned invalid output: %s", out)
-    }
-    podID := outLines[len(outLines)-2]
-
-    // stop and delete pod
-    out, err = exec.CommandContext(ctx, "/run/state/bin/crictl", "stopp", podID).CombinedOutput()
-    if err != nil {
-        return fmt.Errorf("stopping Cilium agent Pod failed: %s", out)
-    }
-    out, err = exec.CommandContext(ctx, "/run/state/bin/crictl", "rmp", podID).CombinedOutput()
-    if err != nil {
-        return fmt.Errorf("removing Cilium agent Pod failed: %s", out)
-    }
-
-    return nil
-}
-
 // JoinCluster joins existing Kubernetes cluster using kubeadm join.
 func (k *KubernetesUtil) JoinCluster(ctx context.Context, joinConfig []byte, peerRole role.Role, controlPlaneHost, controlPlanePort string, log *logger.Logger) error {
     // TODO(3u13r): audit policy should be user input
@@ -19,7 +19,5 @@ type clusterUtil interface {
     InstallComponents(ctx context.Context, kubernetesComponents components.Components) error
     InitCluster(ctx context.Context, initConfig []byte, nodeName, clusterName string, ips []net.IP, controlPlaneHost, controlPlanePort string, conformanceMode bool, log *logger.Logger) ([]byte, error)
     JoinCluster(ctx context.Context, joinConfig []byte, peerRole role.Role, controlPlaneHost, controlPlanePort string, log *logger.Logger) error
-    WaitForCilium(ctx context.Context, log *logger.Logger) error
-    FixCilium(ctx context.Context) error
     StartKubelet() error
 }
@@ -242,17 +242,6 @@ func (k *KubeWrapper) JoinCluster(ctx context.Context, args *kubeadm.BootstrapTo
         return fmt.Errorf("joining cluster: %v; %w ", string(joinConfigYAML), err)
     }

-    log.Infof("Waiting for Cilium to become healthy")
-    if err := k.clusterUtil.WaitForCilium(context.Background(), log); err != nil {
-        return fmt.Errorf("waiting for Cilium to become healthy: %w", err)
-    }
-
-    log.Infof("Restarting Cilium")
-    if err := k.clusterUtil.FixCilium(context.Background()); err != nil {
-        log.With(zap.Error(err)).Errorf("FixCilium failed")
-        // Continue and don't throw an error here - things might be okay.
-    }
-
     return nil
 }

@@ -307,22 +296,11 @@ func k8sCompliantHostname(in string) (string, error) {
 }

 // StartKubelet starts the kubelet service.
-func (k *KubeWrapper) StartKubelet(log *logger.Logger) error {
+func (k *KubeWrapper) StartKubelet() error {
     if err := k.clusterUtil.StartKubelet(); err != nil {
         return fmt.Errorf("starting kubelet: %w", err)
     }

-    log.Infof("Waiting for Cilium to become healthy")
-    if err := k.clusterUtil.WaitForCilium(context.Background(), log); err != nil {
-        return fmt.Errorf("waiting for Cilium to become healthy: %w", err)
-    }
-
-    log.Infof("Restarting Cilium")
-    if err := k.clusterUtil.FixCilium(context.Background()); err != nil {
-        log.With(zap.Error(err)).Errorf("FixCilium failed")
-        // Continue and don't throw an error here - things might be okay.
-    }
-
     return nil
 }

@@ -479,14 +479,6 @@ func (s *stubClusterUtil) StartKubelet() error {
     return s.startKubeletErr
 }

-func (s *stubClusterUtil) WaitForCilium(_ context.Context, _ *logger.Logger) error {
-    return nil
-}
-
-func (s *stubClusterUtil) FixCilium(_ context.Context) error {
-    return nil
-}
-
 type stubConfigProvider struct {
     initConfig k8sapi.KubeadmInitYAML
     joinConfig k8sapi.KubeadmJoinYAML
@@ -7,7 +7,6 @@ go_library(
         "action.go",
         "actionfactory.go",
         "chartutil.go",
-        "ciliumhelper.go",
         "helm.go",
         "loader.go",
         "overrides.go",
@@ -474,9 +473,6 @@ go_library(
         "//internal/semver",
         "//internal/versions",
         "@com_github_pkg_errors//:errors",
-        "@io_k8s_apimachinery//pkg/apis/meta/v1:meta",
-        "@io_k8s_client_go//kubernetes",
-        "@io_k8s_client_go//tools/clientcmd",
         "@io_k8s_client_go//util/retry",
         "@sh_helm_helm//pkg/ignore",
         "@sh_helm_helm_v3//pkg/action",
@@ -11,10 +11,8 @@ import (
     "errors"
     "fmt"
     "strings"
-    "time"

     "github.com/edgelesssys/constellation/v2/internal/compatibility"
-    "github.com/edgelesssys/constellation/v2/internal/constants"
     "github.com/edgelesssys/constellation/v2/internal/semver"
     "helm.sh/helm/v3/pkg/action"
     "helm.sh/helm/v3/pkg/chart"
@@ -133,36 +131,9 @@ func (a actionFactory) appendNewAction(release Release, configTargetVersion semv

 func (a actionFactory) newInstall(release Release) *installAction {
     action := &installAction{helmAction: newHelmInstallAction(a.cfg, release), release: release, log: a.log}
-    if action.ReleaseName() == ciliumInfo.releaseName {
-        action.postInstall = func(ctx context.Context) error {
-            return ciliumPostInstall(ctx, a.log)
-        }
-    }
     return action
 }

-func ciliumPostInstall(ctx context.Context, log debugLog) error {
-    log.Debugf("Waiting for Cilium to become ready")
-    helper, err := newK8sCiliumHelper(constants.AdminConfFilename)
-    if err != nil {
-        return fmt.Errorf("creating Kubernetes client: %w", err)
-    }
-    timeToStartWaiting := time.Now()
-    // TODO(3u13r): Reduce the timeout when we switched the package repository - this is only this high because we once
-    // saw polling times of ~16 minutes when hitting a slow PoP from Fastly (GitHub's / ghcr.io CDN).
-    if err := helper.WaitForDS(ctx, "kube-system", "cilium", log); err != nil {
-        return fmt.Errorf("waiting for Cilium to become healthy: %w", err)
-    }
-    timeUntilFinishedWaiting := time.Since(timeToStartWaiting)
-    log.Debugf("Cilium became healthy after %s", timeUntilFinishedWaiting.String())
-
-    log.Debugf("Fix Cilium through restart")
-    if err := helper.RestartDS("kube-system", "cilium"); err != nil {
-        return fmt.Errorf("restarting Cilium: %w", err)
-    }
-    return nil
-}
-
 func (a actionFactory) newUpgrade(release Release) *upgradeAction {
     action := &upgradeAction{helmAction: newHelmUpgradeAction(a.cfg), release: release, log: a.log}
     if release.ReleaseName == constellationOperatorsInfo.releaseName {
@@ -1,72 +0,0 @@
-/*
-Copyright (c) Edgeless Systems GmbH
-
-SPDX-License-Identifier: AGPL-3.0-only
-*/
-
-package helm
-
-import (
-    "context"
-    "fmt"
-    "time"
-
-    v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-    "k8s.io/client-go/kubernetes"
-    "k8s.io/client-go/tools/clientcmd"
-)
-
-type k8sDsClient struct {
-    clientset *kubernetes.Clientset
-}
-
-func newK8sCiliumHelper(kubeconfigPath string) (*k8sDsClient, error) {
-    config, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
-    if err != nil {
-        return nil, err
-    }
-    clientset, err := kubernetes.NewForConfig(config)
-    if err != nil {
-        return nil, err
-    }
-    return &k8sDsClient{clientset: clientset}, nil
-}
-
-// WaitForDS waits for a DaemonSet to become ready.
-func (h *k8sDsClient) WaitForDS(ctx context.Context, namespace, name string, log debugLog) error {
-    for {
-        select {
-        case <-ctx.Done():
-            return fmt.Errorf("context expired before DaemonSet %q became ready", name)
-        default:
-            ds, err := h.clientset.AppsV1().DaemonSets(namespace).Get(ctx, name, v1.GetOptions{})
-            if err != nil {
-                return err
-            }
-
-            if ds.Status.NumberReady == ds.Status.DesiredNumberScheduled {
-                log.Debugf("DaemonSet %s is ready\n", name)
-                return nil
-            }
-
-            log.Debugf("Waiting for DaemonSet %s to become ready...\n", name)
-            time.Sleep(10 * time.Second)
-        }
-    }
-}
-
-// RestartDS restarts all pods of a DaemonSet by updating its template.
-func (h *k8sDsClient) RestartDS(namespace, name string) error {
-    ds, err := h.clientset.AppsV1().DaemonSets(namespace).Get(context.Background(), name, v1.GetOptions{})
-    if err != nil {
-        return err
-    }
-
-    ds.Spec.Template.ObjectMeta.Annotations["restartTimestamp"] = fmt.Sprintf("%d", time.Now().Unix())
-    _, err = h.clientset.AppsV1().DaemonSets(namespace).Update(context.Background(), ds, v1.UpdateOptions{})
-    if err != nil {
-        return err
-    }
-
-    return nil
-}
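The deleted helper's RestartDS triggered a rolling restart by bumping a pod-template annotation so the DaemonSet controller recreates every Pod — the same pattern kubectl rollout restart uses. For reference, a standalone sketch of that pattern using a server-side patch instead of the helper's read-modify-update; the kubeconfig path and the kubectl.kubernetes.io/restartedAt annotation key are choices made here, not taken from the removed code (which wrote a custom restartTimestamp annotation):

package main

import (
    "context"
    "fmt"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

// restartDaemonSet triggers a rolling restart by patching a pod-template
// annotation. Patching avoids the read-modify-write of the deleted helper
// and also works if the template has no annotations yet.
func restartDaemonSet(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
    patch := fmt.Sprintf(
        `{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
        time.Now().Format(time.RFC3339),
    )
    _, err := client.AppsV1().DaemonSets(namespace).Patch(
        ctx, name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{},
    )
    return err
}

func main() {
    // Path to an admin kubeconfig; adjust for your environment (assumption, not from the removed code).
    config, err := clientcmd.BuildConfigFromFlags("", "/etc/kubernetes/admin.conf")
    if err != nil {
        panic(err)
    }
    clientset, err := kubernetes.NewForConfig(config)
    if err != nil {
        panic(err)
    }
    if err := restartDaemonSet(context.Background(), clientset, "kube-system", "cilium"); err != nil {
        panic(err)
    }
    fmt.Println("requested rolling restart of kube-system/cilium")
}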