constellation/e2e/internal/upgrade/upgrade_test.go

492 lines
16 KiB
Go
Raw Normal View History

//go:build e2e
/*
Copyright (c) Edgeless Systems GmbH
SPDX-License-Identifier: AGPL-3.0-only
*/
package upgrade
import (
"context"
"errors"
"flag"
"fmt"
"io"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"testing"
"time"
"github.com/bazelbuild/rules_go/go/runfiles"
"github.com/edgelesssys/constellation/v2/e2e/internal/kubectl"
"github.com/edgelesssys/constellation/v2/internal/api/attestationconfigapi"
"github.com/edgelesssys/constellation/v2/internal/config"
"github.com/edgelesssys/constellation/v2/internal/constants"
"github.com/edgelesssys/constellation/v2/internal/file"
"github.com/edgelesssys/constellation/v2/internal/imagefetcher"
"github.com/edgelesssys/constellation/v2/internal/semver"
"github.com/edgelesssys/constellation/v2/internal/versions"
"github.com/spf13/afero"
"github.com/stretchr/testify/require"
coreV1 "k8s.io/api/core/v1"
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
)
// Flags are defined globally as `go test` implicitly calls flag.Parse() before executing a testcase.
// Thus defining and parsing flags inside a testcase would result in a panic.
// See https://groups.google.com/g/golang-nuts/c/P6EdEdgvDuc/m/5-Dg6bPxmvQJ.
var (
targetImage = flag.String("target-image", "", "Image (shortversion) to upgrade to.")
targetKubernetes = flag.String("target-kubernetes", "", "Kubernetes version (MAJOR.MINOR.PATCH) to upgrade to. Defaults to default version of target CLI.")
targetMicroservices = flag.String("target-microservices", "", "Microservice version (MAJOR.MINOR.PATCH) to upgrade to. Defaults to default version of target CLI.")
// When executing the test as a bazel target the workspace path is supplied through an env variable that bazel sets.
workspace = flag.String("workspace", "", "Constellation workspace in which to run the tests.")
// When executing the test as a bazel target the CLI path is supplied through an env variable that bazel sets.
// When executing via `go test` extra care should be taken that the supplied CLI is built on the same commit as this test.
cliPath = flag.String("cli", "", "Constellation CLI to run the tests.")
wantWorker = flag.Int("want-worker", 0, "Number of wanted worker nodes.")
wantControl = flag.Int("want-control", 0, "Number of wanted control nodes.")
timeout = flag.Duration("timeout", 3*time.Hour, "Timeout after which the cluster should have converged to the target version.")
)
// TestUpgrade checks that the workspace's kubeconfig points to a healthy cluster,
// we can write an upgrade config, we can trigger an upgrade
// and the cluster eventually upgrades to the target version.
func TestUpgrade(t *testing.T) {
require := require.New(t)
err := setup()
require.NoError(err)
k, err := kubectl.New()
require.NoError(err)
require.NotNil(k)
require.NotEqual(*targetImage, "", "--target-image needs to be specified")
log.Println("Waiting for nodes and pods to be ready.")
testNodesEventuallyAvailable(t, k, *wantControl, *wantWorker)
testPodsEventuallyReady(t, k, "kube-system")
cli, err := getCLIPath(*cliPath)
require.NoError(err)
targetVersions := writeUpgradeConfig(require, *targetImage, *targetKubernetes, *targetMicroservices)
log.Println("Fetching measurements for new image.")
cmd := exec.CommandContext(context.Background(), cli, "config", "fetch-measurements", "--insecure", "--debug")
stdout, stderr, err := runCommandWithSeparateOutputs(cmd)
require.NoError(err, "Stdout: %s\nStderr: %s", string(stdout), string(stderr))
log.Println(string(stdout))
data, err := os.ReadFile("./constellation-conf.yaml")
require.NoError(err)
log.Println(string(data))
log.Println("Checking upgrade.")
runUpgradeCheck(require, cli, *targetKubernetes)
cli: Terraform migrations on upgrade (#1685) * add terraform planning * overwrite terraform files in upgrade workspace * Revert "overwrite terraform files in upgrade workspace" This reverts commit 8bdacfb8bef23ef2cdbdb06bad0855b3bbc42df0. * prepare terraform workspace * test upgrade integration * print upgrade abort * rename plan file * write output to file * add show plan test * add upgrade tf workdir * fix workspace preparing * squash to 1 command * test * bazel build * plan test * register flag manually * bazel tidy * fix linter * remove MAA variable * fix workdir * accept tf variables * variable fetching * fix resource indices * accept Terraform targets * refactor upgrade command * Terraform migration apply unit test * pass down image fetcher to test * use new flags in e2e test * move file name to constant * update buildfiles * fix version constant * conditionally create MAA * move interface down * upgrade dir * update buildfiles * fix interface * fix createMAA check * fix imports * update buildfiles * wip: workspace backup * copy utils * backup upgrade workspace * remove debug print * replace old state after upgrade * check if flag exists * prepare test workspace * remove prefix Co-authored-by: Otto Bittner <cobittner@posteo.net> * respect file permissions * refactor tf upgrader * check workspace before upgrades * remove temp upgrade dir after completion * clean up workspace after abortion * fix upgrade apply test * fix linter --------- Co-authored-by: Otto Bittner <cobittner@posteo.net>
2023-05-22 07:31:20 -04:00
log.Println("Triggering upgrade.")
runUpgradeApply(require, cli)
wg := queryStatusAsync(t, cli)
testMicroservicesEventuallyHaveVersion(t, targetVersions.microservices, *timeout)
testNodesEventuallyHaveVersion(t, k, targetVersions, *wantControl+*wantWorker, *timeout)
wg.Wait()
}
// setup checks that the prerequisites for the test are met:
// - a workspace is set
// - a CLI path is set
// - the constellation-upgrade folder does not exist.
func setup() error {
workingDir, err := workingDir(*workspace)
if err != nil {
return fmt.Errorf("getting working directory: %w", err)
}
if err := os.Chdir(workingDir); err != nil {
return fmt.Errorf("changing working directory: %w", err)
}
if _, err := getCLIPath(*cliPath); err != nil {
return fmt.Errorf("getting CLI path: %w", err)
}
return nil
}
// workingDir returns the path to the workspace.
func workingDir(workspace string) (string, error) {
workingDir := os.Getenv("BUILD_WORKING_DIRECTORY")
switch {
case workingDir != "":
return workingDir, nil
case workspace != "":
return workspace, nil
default:
return "", errors.New("neither 'BUILD_WORKING_DIRECTORY' nor 'workspace' flag set")
}
}
// getCLIPath returns the path to the CLI.
func getCLIPath(cliPathFlag string) (string, error) {
pathCLI := os.Getenv("PATH_CLI")
var relCLIPath string
switch {
case pathCLI != "":
relCLIPath = pathCLI
case cliPathFlag != "":
relCLIPath = cliPathFlag
default:
return "", errors.New("neither 'PATH_CLI' nor 'cli' flag set")
}
// try to find the CLI in the working directory
// (e.g. when running via `go test` or when specifying a path manually)
workdir, err := os.Getwd()
if err != nil {
return "", fmt.Errorf("getting working directory: %w", err)
}
absCLIPath := relCLIPath
if !filepath.IsAbs(relCLIPath) {
absCLIPath = filepath.Join(workdir, relCLIPath)
}
if _, err := os.Stat(absCLIPath); err == nil {
return absCLIPath, nil
}
// fall back to runfiles (e.g. when running via bazel)
return runfiles.Rlocation(pathCLI)
}
// testPodsEventuallyReady checks that:
// 1) all pods are running.
// 2) all pods have good status conditions.
func testPodsEventuallyReady(t *testing.T, k *kubernetes.Clientset, namespace string) {
require.Eventually(t, func() bool {
pods, err := k.CoreV1().Pods(namespace).List(context.Background(), metaV1.ListOptions{})
if err != nil {
log.Println(err)
return false
}
for _, pod := range pods.Items {
if pod.Status.Phase != coreV1.PodRunning {
log.Printf("Pod %s is not running, but %s\n", pod.Name, pod.Status.Phase)
return false
}
for _, condition := range pod.Status.Conditions {
switch condition.Type {
case coreV1.ContainersReady, coreV1.PodInitialized, coreV1.PodReady, coreV1.PodScheduled:
if condition.Status != coreV1.ConditionTrue {
log.Printf("Pod %s's status %s is false\n", pod.Name, coreV1.ContainersReady)
return false
}
}
}
}
return true
}, time.Minute*30, time.Minute)
}
// testNodesEventuallyAvailable checks that:
// 1) all nodes only have good status conditions.
// 2) the expected number of nodes have joined the cluster.
func testNodesEventuallyAvailable(t *testing.T, k *kubernetes.Clientset, wantControlNodeCount, wantWorkerNodeCount int) {
require.Eventually(t, func() bool {
nodes, err := k.CoreV1().Nodes().List(context.Background(), metaV1.ListOptions{})
if err != nil {
log.Println(err)
return false
}
var controlNodeCount, workerNodeCount int
for _, node := range nodes.Items {
if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
controlNodeCount++
} else {
workerNodeCount++
}
for _, condition := range node.Status.Conditions {
switch condition.Type {
case coreV1.NodeReady:
if condition.Status != coreV1.ConditionTrue {
fmt.Printf("Status %s for node %s is %s\n", condition.Type, node.Name, condition.Status)
return false
}
case coreV1.NodeMemoryPressure, coreV1.NodeDiskPressure, coreV1.NodePIDPressure, coreV1.NodeNetworkUnavailable:
if condition.Status != coreV1.ConditionFalse {
fmt.Printf("Status %s for node %s is %s\n", condition.Type, node.Name, condition.Status)
return false
}
}
}
}
if controlNodeCount != wantControlNodeCount {
log.Printf("Want %d control nodes but got %d\n", wantControlNodeCount, controlNodeCount)
return false
}
if workerNodeCount != wantWorkerNodeCount {
log.Printf("Want %d worker nodes but got %d\n", wantWorkerNodeCount, workerNodeCount)
return false
}
return true
}, time.Minute*30, time.Minute)
}
func writeUpgradeConfig(require *require.Assertions, image string, kubernetes string, microservices string) versionContainer {
fileHandler := file.NewHandler(afero.NewOsFs())
attestationFetcher := attestationconfigapi.NewFetcher()
cfg, err := config.New(fileHandler, constants.ConfigFilename, attestationFetcher, true)
var cfgErr *config.ValidationError
var longMsg string
if errors.As(err, &cfgErr) {
longMsg = cfgErr.LongMessage()
}
require.NoError(err, longMsg)
imageFetcher := imagefetcher.New()
imageRef, err := imageFetcher.FetchReference(
2023-05-22 08:59:28 -04:00
context.Background(),
cfg.GetProvider(),
cfg.GetAttestationConfig().GetVariant(),
image,
terraform: Azure Marketplace image support (#2651) * terraform: add Azure marketplace variable Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * config: add Azure marketplace variable Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * cli: use Terraform variables from config Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * terraform: pass down marketplace variable * image: pad Azure images to 1GiB * terraform: add version attribute to marketplace image * semver: allow versions to be exported without prefix * cli: boolean var to use marketplace images * config: remove dive key * dev-docs: add instructions on how to use marketplace images * terraform: fix unit test * terraform: only fetch image for non-marketplace images * mpimage: refactor image selection Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * [remove] increase minor version for image build Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * terraform: ignore changes to source_image_reference on upgrade * operator: add support for parsing Azure marketplace images Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * upgrade: fix imagefetcher call * docs: add info about azure marketplace * image: ensure more than 1GiB in size * image: test to pad to 2GiB * version: change back to v2.14.0-pre * image: GPT-conformant image size padding * [remove] increase version * mpimage: inline prefix func Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * ci: add marketplace image e2e test Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> * [remove] register workflow * ci: fix workflow name * ci: only allow azure test * cli: add marketplace image input to interface * cli: fix argument passing * version: roll back to v2.14.0 * ci: add force-flag support * Update docs/docs/overview/license.md * Update dev-docs/workflows/marketplace-images.md Co-authored-by: Moritz Eckert <m1gh7ym0@gmail.com> --------- Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com> Co-authored-by: Moritz Eckert <m1gh7ym0@gmail.com> Co-authored-by: Thomas Tendyck <51411342+thomasten@users.noreply.github.com>
2023-12-08 08:40:31 -05:00
cfg.GetRegion(), cfg.UseMarketplaceImage(),
2023-05-22 08:59:28 -04:00
)
require.NoError(err)
log.Printf("Setting image version: %s\n", image)
cfg.Image = image
defaultConfig := config.Default()
var kubernetesVersion versions.ValidK8sVersion
if kubernetes == "" {
kubernetesVersion = defaultConfig.KubernetesVersion
} else {
kubernetesVersion = versions.ValidK8sVersion(kubernetes) // ignore validation because the config is only written to file
}
var microserviceVersion semver.Semver
if microservices == "" {
microserviceVersion = defaultConfig.MicroserviceVersion
} else {
version, err := semver.New(microservices)
require.NoError(err)
microserviceVersion = version
}
log.Printf("Setting K8s version: %s\n", kubernetesVersion)
cfg.KubernetesVersion = kubernetesVersion
log.Printf("Setting microservice version: %s\n", microserviceVersion)
cfg.MicroserviceVersion = microserviceVersion
err = fileHandler.WriteYAML(constants.ConfigFilename, cfg, file.OptOverwrite)
require.NoError(err)
return versionContainer{imageRef: imageRef, kubernetes: kubernetesVersion, microservices: microserviceVersion}
}
// runUpgradeCheck executes 'upgrade check' and does basic checks on the output.
// We can not check images upgrades because we might use unpublished images. CLI uses public CDN to check for available images.
func runUpgradeCheck(require *require.Assertions, cli, targetKubernetes string) {
aws: use new LB controller to fix SecurityGroup cleanup on K8s service deletion (#2090) * add current chart add current helm chart * disable service controller for aws ccm * add new iam roles * doc AWS internet LB + add to LB test * pass clusterName to helm for AWS LB * fix update-aws-lb chart to also include .helmignore * move chart outside services * working state * add subnet tags for AWS subnet discovery * fix .helmignore load rule with file in subdirectory * upgrade iam profile * revert new loader impl since cilium is not correctly loaded * install chart if not already present during `upgrade apply` * cleanup PR + fix build + add todos cleanup PR + add todos * shared helm pkg for cli install and bootstrapper * add link to eks docs * refactor iamMigrationCmd * delete unused helm.symwallk * move iammigrate to upgrade pkg * fixup! delete unused helm.symwallk * add to upgradecheck * remove nodeSelector from go code (Otto) * update iam docs and sort permission + remove duplicate roles * fix bug in `upgrade check` * better upgrade check output when svc version upgrade not possible * pr feedback * remove force flag in upgrade_test * use upgrader.GetUpgradeID instead of extra type * remove todos + fix check * update doc lb (leo) * remove bootstrapper helm package * Update cli/internal/cmd/upgradecheck.go Co-authored-by: Daniel Weiße <66256922+daniel-weisse@users.noreply.github.com> * final nits * add docs for e2e upgrade test setup * Apply suggestions from code review Co-authored-by: Daniel Weiße <66256922+daniel-weisse@users.noreply.github.com> * Update cli/internal/helm/loader.go Co-authored-by: Daniel Weiße <66256922+daniel-weisse@users.noreply.github.com> * Update cli/internal/cmd/tfmigrationclient.go Co-authored-by: Daniel Weiße <66256922+daniel-weisse@users.noreply.github.com> * fix daniel review * link to the iam permissions instead of manually updating them (agreed with leo) * disable iam upgrade in upgrade apply --------- Co-authored-by: Daniel Weiße <66256922+daniel-weisse@users.noreply.github.com> Co-authored-by: Malte Poll
2023-07-24 04:30:53 -04:00
cmd := exec.CommandContext(context.Background(), cli, "upgrade", "check", "--debug")
stdout, stderr, err := runCommandWithSeparateOutputs(cmd)
require.NoError(err, "Stdout: %s\nStderr: %s", string(stdout), string(stderr))
require.Contains(string(stdout), "The following updates are available with this CLI:")
require.Contains(string(stdout), "Kubernetes:")
log.Printf("targetKubernetes: %s\n", targetKubernetes)
if targetKubernetes == "" {
log.Printf("true\n")
require.True(containsAny(string(stdout), versions.SupportedK8sVersions()))
} else {
log.Printf("false. targetKubernetes: %s\n", targetKubernetes)
require.Contains(string(stdout), targetKubernetes, fmt.Sprintf("Expected Kubernetes version %s in output.", targetKubernetes))
}
require.Contains(string(stdout), "Services:")
require.Contains(string(stdout), fmt.Sprintf("--> %s", constants.BinaryVersion().String()))
log.Println(string(stdout))
}
func containsAny(text string, substrs []string) bool {
for _, substr := range substrs {
if strings.Contains(text, substr) {
return true
}
}
return false
}
func runUpgradeApply(require *require.Assertions, cli string) {
tfLogFlag := ""
cmd := exec.CommandContext(context.Background(), cli, "--help")
stdout, stderr, err := runCommandWithSeparateOutputs(cmd)
require.NoError(err, "Stdout: %s\nStderr: %s", string(stdout), string(stderr))
if strings.Contains(string(stdout), "--tf-log") {
tfLogFlag = "--tf-log=DEBUG"
}
cmd = exec.CommandContext(context.Background(), cli, "apply", "--debug", "--yes", tfLogFlag)
stdout, stderr, err = runCommandWithSeparateOutputs(cmd)
require.NoError(err, "Stdout: %s\nStderr: %s", string(stdout), string(stderr))
require.NoError(containsUnexepectedMsg(string(stdout)))
log.Println(string(stdout))
log.Println(string(stderr)) // also print debug logs.
}
// containsUnexepectedMsg checks if the given input contains any unexpected messages.
// unexepcted messages are:
// "Skipping image & Kubernetes upgrades. Another upgrade is in progress".
func containsUnexepectedMsg(input string) error {
if strings.Contains(input, "Skipping image & Kubernetes upgrades. Another upgrade is in progress") {
return errors.New("unexpected upgrade in progress")
}
return nil
}
func queryStatusAsync(t *testing.T, cli string) *sync.WaitGroup {
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
// The first control plane node should finish upgrading after 20 minutes. If it does not, something is fishy.
// Nodes can upgrade in <5mins.
testStatusEventuallyWorks(t, cli, 20*time.Minute)
}()
return &wg
}
func testStatusEventuallyWorks(t *testing.T, cli string, timeout time.Duration) {
require.Eventually(t, func() bool {
// Show versions set in cluster.
// The string after "Cluster status:" in the output might not be updated yet.
// This is only updated after the operator finishes one reconcile loop.
cmd := exec.CommandContext(context.Background(), cli, "status")
stdout, stderr, err := runCommandWithSeparateOutputs(cmd)
if err != nil {
log.Printf("Stdout: %s\nStderr: %s", string(stdout), string(stderr))
return false
}
log.Println(string(stdout))
return true
}, timeout, time.Minute)
}
func testMicroservicesEventuallyHaveVersion(t *testing.T, wantMicroserviceVersion semver.Semver, timeout time.Duration) {
require.Eventually(t, func() bool {
version, err := servicesVersion(t)
if err != nil {
log.Printf("Unable to fetch microservice version: %v\n", err)
return false
}
if version != wantMicroserviceVersion {
log.Printf("Microservices still at version %v, want %v\n", version, wantMicroserviceVersion)
return false
}
return true
}, timeout, time.Minute)
}
func testNodesEventuallyHaveVersion(t *testing.T, k *kubernetes.Clientset, targetVersions versionContainer, totalNodeCount int, timeout time.Duration) {
require.Eventually(t, func() bool {
nodes, err := k.CoreV1().Nodes().List(context.Background(), metaV1.ListOptions{})
if err != nil {
log.Println(err)
return false
}
require.False(t, len(nodes.Items) < totalNodeCount, "expected at least %v nodes, got %v", totalNodeCount, len(nodes.Items))
allUpdated := true
log.Printf("Node status (%v):", time.Now())
for _, node := range nodes.Items {
for key, value := range node.Annotations {
if key == "constellation.edgeless.systems/node-image" {
if !strings.EqualFold(value, targetVersions.imageRef) {
log.Printf("\t%s: Image %s, want %s\n", node.Name, value, targetVersions.imageRef)
allUpdated = false
}
}
}
kubeletVersion := node.Status.NodeInfo.KubeletVersion
if kubeletVersion != string(targetVersions.kubernetes) {
log.Printf("\t%s: K8s (Kubelet) %s, want %s\n", node.Name, kubeletVersion, targetVersions.kubernetes)
allUpdated = false
}
kubeProxyVersion := node.Status.NodeInfo.KubeProxyVersion
if kubeProxyVersion != string(targetVersions.kubernetes) {
log.Printf("\t%s: K8s (Proxy) %s, want %s\n", node.Name, kubeProxyVersion, targetVersions.kubernetes)
allUpdated = false
}
}
return allUpdated
}, timeout, time.Minute)
}
type versionContainer struct {
imageRef string
kubernetes versions.ValidK8sVersion
microservices semver.Semver
}
// runCommandWithSeparateOutputs runs the given command while separating buffers for
// stdout and stderr.
func runCommandWithSeparateOutputs(cmd *exec.Cmd) (stdout, stderr []byte, err error) {
stdoutIn, err := cmd.StdoutPipe()
if err != nil {
err = fmt.Errorf("create stdout pipe: %w", err)
return
}
stderrIn, err := cmd.StderrPipe()
if err != nil {
err = fmt.Errorf("create stderr pipe: %w", err)
return
}
err = cmd.Start()
if err != nil {
err = fmt.Errorf("start command: %w", err)
return
}
stdout, err = io.ReadAll(stdoutIn)
if err != nil {
err = fmt.Errorf("start command: %w", err)
return
}
stderr, err = io.ReadAll(stderrIn)
if err != nil {
err = fmt.Errorf("start command: %w", err)
return
}
if err = cmd.Wait(); err != nil {
err = fmt.Errorf("wait for command to finish: %w", err)
}
return
}