diff --git a/.github/actions/constellation_create/action.yml b/.github/actions/constellation_create/action.yml
index 1c13be1c9..8160352c2 100644
--- a/.github/actions/constellation_create/action.yml
+++ b/.github/actions/constellation_create/action.yml
@@ -61,6 +61,9 @@ outputs:
   kubeconfig:
     description: "The kubeconfig for the cluster."
     value: ${{ steps.constellation-init.outputs.KUBECONFIG }}
+  masterSecret:
+    description: "The master-secret for the cluster."
+    value: ${{ steps.constellation-init.outputs.MASTERSECRET }}
 
 runs:
   using: "composite"
@@ -178,6 +181,7 @@ runs:
       run: |
         constellation init
         echo "KUBECONFIG=$(pwd)/constellation-admin.conf" >> $GITHUB_OUTPUT
+        echo "MASTERSECRET=$(pwd)/constellation-mastersecret.json" >> $GITHUB_OUTPUT
 
     - name: Wait for nodes to join and become ready
       shell: bash
diff --git a/.github/actions/e2e_recover/action.yml b/.github/actions/e2e_recover/action.yml
new file mode 100644
index 000000000..79c6e9206
--- /dev/null
+++ b/.github/actions/e2e_recover/action.yml
@@ -0,0 +1,73 @@
+name: Constellation recover
+description: "Recover a Constellation cluster with an unavailable control plane."
+
+inputs:
+  controlNodesCount:
+    description: "The amount of control plane nodes in the cluster."
+    required: true
+  kubeconfig:
+    description: "The kubeconfig for the cluster."
+    required: true
+  masterSecret:
+    description: "The master-secret for the cluster."
+    required: true
+  cloudProvider:
+    description: "Which cloud provider to use."
+    required: true
+  gcpProject:
+    description: "The GCP project Constellation is deployed in."
+    required: false
+  azureResourceGroup:  # key must match the `with:` key used by the e2e_test action
+    description: "The Azure resource group Constellation is deployed in."
+    required: false
+
+runs:
+  using: "composite"
+  steps:
+    - name: Restart worker node
+      shell: bash
+      run: |
+        WORKER_NODE=$(kubectl get nodes --selector='!node-role.kubernetes.io/control-plane' -o json | jq '.items[0].metadata.name' -r)
+        kubectl debug "node/$WORKER_NODE" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
+        kubectl wait --for=condition=Ready=false --timeout=10m "node/$WORKER_NODE"  # first observe the node going down ...
+        kubectl wait --for=condition=Ready=true --timeout=10m --all nodes  # ... then wait until every node is Ready again
+      env:
+        KUBECONFIG: ${{ inputs.kubeconfig }}
+    - name: Restart all control plane nodes
+      shell: bash
+      run: |
+        CONTROL_PLANE_NODES=$(kubectl get nodes --selector='node-role.kubernetes.io/control-plane' -o json | jq '.items[].metadata.name' -r)
+        for CONTROL_PLANE_NODE in ${CONTROL_PLANE_NODES}; do
+          kubectl debug "node/$CONTROL_PLANE_NODE" --image=ubuntu -- bash -c "echo reboot > reboot.sh && chroot /host < reboot.sh"
+        done
+      env:
+        KUBECONFIG: ${{ inputs.kubeconfig }}
+    - name: Constellation recover
+      shell: bash
+      run: |
+        timeout=600
+        start_time=$(date +%s)
+        recovered=0
+        while true; do
+          output=$(constellation recover --master-secret="${{ inputs.masterSecret }}") || true  # the step runs under `bash -e`; a failed attempt must fall through to the retry below
+          if echo "$output" | grep -q "Pushed recovery key."; then
+            echo "$output"
+            i=$(echo "$output" | grep -o "Pushed recovery key." | wc -l | sed 's/ //g')
+            recovered=$((recovered+i))
+            if [[ $recovered -ge ${{ inputs.controlNodesCount }} ]]; then  # -ge: a repeated push must not overshoot the target and spin until timeout
+              break  # leave the loop (not the script) so the readiness check after `done` still runs
+            fi
+          fi
+
+          current_time=$(date +%s)
+          if ((current_time - start_time > timeout)); then
+            echo "Control plane recovery timed out after $timeout seconds."
+            exit 1
+          fi
+
+          echo "Did not recover all nodes yet, retrying in 5 seconds [$recovered/${{ inputs.controlNodesCount }}]"
+          sleep 5
+        done
+        kubectl wait --for=condition=Ready --timeout=10m --all nodes
+      env:
+        KUBECONFIG: ${{ inputs.kubeconfig }}
diff --git a/.github/actions/e2e_test/action.yml b/.github/actions/e2e_test/action.yml
index 5ff995eaf..0e239e7ad 100644
--- a/.github/actions/e2e_test/action.yml
+++ b/.github/actions/e2e_test/action.yml
@@ -57,7 +57,7 @@ inputs:
     description: "The resource group to use"
     required: false
   test:
-    description: "The test to run. Can currently be one of [sonobuoy full, sonobuoy quick, autoscaling, lb, k-bench, verify, nop]."
+    description: "The test to run. Can currently be one of [sonobuoy full, sonobuoy quick, autoscaling, lb, k-bench, verify, recover, nop]."
     required: true
   sonobuoyTestSuiteCmd:
     description: "The sonobuoy test suite to run."
@@ -72,7 +72,7 @@ runs:
   using: "composite"
   steps:
     - name: Check input
-      if: (!contains(fromJson('["sonobuoy full", "sonobuoy quick", "autoscaling", "k-bench", "verify", "lb", "nop"]'), inputs.test))
+      if: (!contains(fromJson('["sonobuoy full", "sonobuoy quick", "autoscaling", "k-bench", "verify", "lb", "recover", "nop"]'), inputs.test))
       shell: bash
       run: |
         echo "Invalid input for test field: ${{ inputs.test }}"
@@ -212,3 +212,14 @@ runs:
       with:
         cloudProvider: ${{ inputs.cloudProvider }}
         osImage: ${{ inputs.osImage }}
+
+    - name: Run recover test
+      if: inputs.test == 'recover'
+      uses: ./.github/actions/e2e_recover
+      with:
+        controlNodesCount: ${{ inputs.controlNodesCount }}
+        kubeconfig: ${{ steps.constellation-create.outputs.kubeconfig }}
+        masterSecret: ${{ steps.constellation-create.outputs.masterSecret }}
+        cloudProvider: ${{ inputs.cloudProvider }}
+        gcpProject: ${{ inputs.gcpProject }}
+        azureResourceGroup: ${{ inputs.azureResourceGroup }}
diff --git a/.github/workflows/e2e-test-manual.yml b/.github/workflows/e2e-test-manual.yml
index 55dbb7388..f681e90a3 100644
--- a/.github/workflows/e2e-test-manual.yml
+++ b/.github/workflows/e2e-test-manual.yml
@@ -37,6 +37,7 @@ on:
           - "lb"
           - "k-bench"
           - "verify"
+          - "recover"
           - "nop"
         required: true
       kubernetesVersion:
diff --git a/.github/workflows/e2e-test-weekly.yml b/.github/workflows/e2e-test-weekly.yml
index b2b613ac6..5b395347d 100644
--- a/.github/workflows/e2e-test-weekly.yml
+++ b/.github/workflows/e2e-test-weekly.yml
@@ -45,7 +45,7 @@ jobs:
       fail-fast: false
       max-parallel: 5
       matrix:
-        test: ["sonobuoy full", "autoscaling", "k-bench", "lb", "verify"]
+        test: ["sonobuoy full", "autoscaling", "k-bench", "lb", "verify", "recover"]
         provider: ["gcp", "azure", "aws"]
         version: ["1.23", "1.24", "1.25", "1.26"]
         exclude:
@@ -56,6 +56,13 @@ jobs:
             version: "1.24"
           - test: "verify"
             version: "1.25"
+          # Recover test runs only on latest version.
+          - test: "recover"
+            version: "1.23"
+          - test: "recover"
+            version: "1.24"
+          - test: "recover"
+            version: "1.25"
           # Autoscaling test runs only on latest version.
           - test: "autoscaling"
             version: "1.23"
diff --git a/cli/internal/cloudcmd/clients.go b/cli/internal/cloudcmd/clients.go
index 03850bb15..a3a3ef4eb 100644
--- a/cli/internal/cloudcmd/clients.go
+++ b/cli/internal/cloudcmd/clients.go
@@ -17,7 +17,7 @@ import (
 
 type terraformClient interface {
 	PrepareWorkspace(path string, input terraform.Variables) error
-	CreateCluster(ctx context.Context) (string, string, error)
+	CreateCluster(ctx context.Context) (terraform.CreateOutput, error)
 	CreateIAMConfig(ctx context.Context, provider cloudprovider.Provider) (terraform.IAMOutput, error)
 	DestroyCluster(ctx context.Context) error
 	CleanUpWorkspace() error
diff --git a/cli/internal/cloudcmd/clients_test.go b/cli/internal/cloudcmd/clients_test.go
index be94a810b..f279aa6cf 100644
--- a/cli/internal/cloudcmd/clients_test.go
+++ b/cli/internal/cloudcmd/clients_test.go
@@ -29,6 +29,7 @@ type stubTerraformClient struct {
 	ip                     string
 	initSecret             string
 	iamOutput              terraform.IAMOutput
+	uid                    string
 	cleanUpWorkspaceCalled bool
removeInstallerCalled bool destroyClusterCalled bool @@ -39,8 +40,12 @@ type stubTerraformClient struct { iamOutputErr error } -func (c *stubTerraformClient) CreateCluster(ctx context.Context) (string, string, error) { - return c.ip, c.initSecret, c.createClusterErr +func (c *stubTerraformClient) CreateCluster(ctx context.Context) (terraform.CreateOutput, error) { + return terraform.CreateOutput{ + IP: c.ip, + Secret: c.initSecret, + UID: c.uid, + }, c.createClusterErr } func (c *stubTerraformClient) CreateIAMConfig(ctx context.Context, provider cloudprovider.Provider) (terraform.IAMOutput, error) { diff --git a/cli/internal/cloudcmd/create.go b/cli/internal/cloudcmd/create.go index 86cd7a1a2..7c51478f6 100644 --- a/cli/internal/cloudcmd/create.go +++ b/cli/internal/cloudcmd/create.go @@ -123,15 +123,16 @@ func (c *Creator) createAWS(ctx context.Context, cl terraformClient, config *con } defer rollbackOnError(context.Background(), c.out, &retErr, &rollbackerTerraform{client: cl}) - ip, initSecret, err := cl.CreateCluster(ctx) + tfOutput, err := cl.CreateCluster(ctx) if err != nil { return clusterid.File{}, err } return clusterid.File{ CloudProvider: cloudprovider.AWS, - InitSecret: []byte(initSecret), - IP: ip, + InitSecret: []byte(tfOutput.Secret), + IP: tfOutput.IP, + UID: tfOutput.UID, }, nil } @@ -160,15 +161,16 @@ func (c *Creator) createGCP(ctx context.Context, cl terraformClient, config *con } defer rollbackOnError(context.Background(), c.out, &retErr, &rollbackerTerraform{client: cl}) - ip, initSecret, err := cl.CreateCluster(ctx) + tfOutput, err := cl.CreateCluster(ctx) if err != nil { return clusterid.File{}, err } return clusterid.File{ CloudProvider: cloudprovider.GCP, - InitSecret: []byte(initSecret), - IP: ip, + InitSecret: []byte(tfOutput.Secret), + IP: tfOutput.IP, + UID: tfOutput.UID, }, nil } @@ -200,15 +202,16 @@ func (c *Creator) createAzure(ctx context.Context, cl terraformClient, config *c } defer rollbackOnError(context.Background(), c.out, 
&retErr, &rollbackerTerraform{client: cl}) - ip, initSecret, err := cl.CreateCluster(ctx) + tfOutput, err := cl.CreateCluster(ctx) if err != nil { return clusterid.File{}, err } return clusterid.File{ CloudProvider: cloudprovider.Azure, - IP: ip, - InitSecret: []byte(initSecret), + IP: tfOutput.IP, + InitSecret: []byte(tfOutput.Secret), + UID: tfOutput.UID, }, nil } @@ -313,14 +316,15 @@ func (c *Creator) createQEMU(ctx context.Context, cl terraformClient, lv libvirt // Allow rollback of QEMU Terraform workspace from this point on qemuRollbacker.createdWorkspace = true - ip, initSecret, err := cl.CreateCluster(ctx) + tfOutput, err := cl.CreateCluster(ctx) if err != nil { return clusterid.File{}, err } return clusterid.File{ CloudProvider: cloudprovider.QEMU, - InitSecret: []byte(initSecret), - IP: ip, + InitSecret: []byte(tfOutput.Secret), + IP: tfOutput.IP, + UID: tfOutput.UID, }, nil } diff --git a/cli/internal/terraform/terraform.go b/cli/internal/terraform/terraform.go index 613ae2bb0..b862467ae 100644 --- a/cli/internal/terraform/terraform.go +++ b/cli/internal/terraform/terraform.go @@ -74,39 +74,59 @@ func (c *Client) PrepareWorkspace(path string, vars Variables) error { } // CreateCluster creates a Constellation cluster using Terraform. 
-func (c *Client) CreateCluster(ctx context.Context) (string, string, error) { +func (c *Client) CreateCluster(ctx context.Context) (CreateOutput, error) { if err := c.tf.Init(ctx); err != nil { - return "", "", err + return CreateOutput{}, err } if err := c.tf.Apply(ctx); err != nil { - return "", "", err + return CreateOutput{}, err } tfState, err := c.tf.Show(ctx) if err != nil { - return "", "", err + return CreateOutput{}, err } ipOutput, ok := tfState.Values.Outputs["ip"] if !ok { - return "", "", errors.New("no IP output found") + return CreateOutput{}, errors.New("no IP output found") } ip, ok := ipOutput.Value.(string) if !ok { - return "", "", errors.New("invalid type in IP output: not a string") + return CreateOutput{}, errors.New("invalid type in IP output: not a string") } secretOutput, ok := tfState.Values.Outputs["initSecret"] if !ok { - return "", "", errors.New("no initSecret output found") + return CreateOutput{}, errors.New("no initSecret output found") } secret, ok := secretOutput.Value.(string) if !ok { - return "", "", errors.New("invalid type in initSecret output: not a string") + return CreateOutput{}, errors.New("invalid type in initSecret output: not a string") } - return ip, secret, nil + uidOutput, ok := tfState.Values.Outputs["uid"] + if !ok { + return CreateOutput{}, errors.New("no uid output found") + } + uid, ok := uidOutput.Value.(string) + if !ok { + return CreateOutput{}, errors.New("invalid type in uid output: not a string") + } + + return CreateOutput{ + IP: ip, + Secret: secret, + UID: uid, + }, nil +} + +// CreateOutput contains the Terraform output values of a cluster creation. +type CreateOutput struct { + IP string + Secret string + UID string } // IAMOutput contains the output information of the Terraform IAM operations. 
diff --git a/cli/internal/terraform/terraform/aws/outputs.tf b/cli/internal/terraform/terraform/aws/outputs.tf
index 3977e9777..fe3372a68 100644
--- a/cli/internal/terraform/terraform/aws/outputs.tf
+++ b/cli/internal/terraform/terraform/aws/outputs.tf
@@ -2,6 +2,10 @@ output "ip" {
   value = aws_eip.lb.public_ip
 }
 
+output "uid" {
+  value = local.uid
+}
+
 output "initSecret" {
   value     = random_password.initSecret.result
   sensitive = true
diff --git a/cli/internal/terraform/terraform/azure/outputs.tf b/cli/internal/terraform/terraform/azure/outputs.tf
index 0941a9351..80172a6b8 100644
--- a/cli/internal/terraform/terraform/azure/outputs.tf
+++ b/cli/internal/terraform/terraform/azure/outputs.tf
@@ -2,6 +2,10 @@ output "ip" {
   value = azurerm_public_ip.loadbalancer_ip.ip_address
 }
 
+output "uid" {
+  value = local.uid
+}
+
 output "initSecret" {
   value     = random_password.initSecret.result
   sensitive = true
diff --git a/cli/internal/terraform/terraform/gcp/outputs.tf b/cli/internal/terraform/terraform/gcp/outputs.tf
index 8318b49ae..622998ec5 100644
--- a/cli/internal/terraform/terraform/gcp/outputs.tf
+++ b/cli/internal/terraform/terraform/gcp/outputs.tf
@@ -2,6 +2,10 @@ output "ip" {
   value = google_compute_global_address.loadbalancer_ip.address
 }
 
+output "uid" {
+  value = local.uid
+}
+
 output "initSecret" {
   value     = random_password.initSecret.result
   sensitive = true
diff --git a/cli/internal/terraform/terraform_test.go b/cli/internal/terraform/terraform_test.go
index f3239b2d9..89cb265b7 100644
--- a/cli/internal/terraform/terraform_test.go
+++ b/cli/internal/terraform/terraform_test.go
@@ -214,6 +214,9 @@ func TestCreateCluster(t *testing.T) {
 				"initSecret": {
 					Value: "initSecret",
 				},
+				"uid": {
+					Value: "12345abc",
+				},
 			},
 		},
 	}
@@ -300,6 +303,34 @@ func TestCreateCluster(t *testing.T) {
 			fs:      afero.NewMemMapFs(),
 			wantErr: true,
 		},
+		"no uid": { // ip and initSecret are valid so the failure comes from the missing uid output
+			pathBase: "terraform",
+			provider: cloudprovider.QEMU,
+			vars:     qemuVars,
+			tf: &stubTerraform{
+				showState: &tfjson.State{
+					Values: &tfjson.StateValues{
+						Outputs: map[string]*tfjson.StateOutput{"ip": {Value: "192.0.2.100"}, "initSecret": {Value: "initSecret"}},
+					},
+				},
+			},
+			fs:      afero.NewMemMapFs(),
+			wantErr: true,
+		},
+		"uid has wrong type": { // ip and initSecret are valid so the failure comes from the uid type assertion
+			pathBase: "terraform",
+			provider: cloudprovider.QEMU,
+			vars:     qemuVars,
+			tf: &stubTerraform{
+				showState: &tfjson.State{
+					Values: &tfjson.StateValues{
+						Outputs: map[string]*tfjson.StateOutput{"ip": {Value: "192.0.2.100"}, "initSecret": {Value: "initSecret"}, "uid": {Value: 42}},
+					},
+				},
+			},
+			fs:      afero.NewMemMapFs(),
+			wantErr: true,
+		},
 	}
 
 	for name, tc := range testCases {
@@ -315,15 +346,16 @@ func TestCreateCluster(t *testing.T) {
 
 			path := path.Join(tc.pathBase, strings.ToLower(tc.provider.String()))
 			require.NoError(c.PrepareWorkspace(path, tc.vars))
-			ip, initSecret, err := c.CreateCluster(context.Background())
+			tfOutput, err := c.CreateCluster(context.Background())
 
 			if tc.wantErr {
 				assert.Error(err)
 				return
 			}
 			assert.NoError(err)
-			assert.Equal("192.0.2.100", ip)
-			assert.Equal("initSecret", initSecret)
+			assert.Equal("192.0.2.100", tfOutput.IP)
+			assert.Equal("initSecret", tfOutput.Secret)
+			assert.Equal("12345abc", tfOutput.UID)
 		})
 	}
 }
diff --git a/dev-docs/workflows/github-actions.md b/dev-docs/workflows/github-actions.md
index 8f9101a42..7f10152ff 100644
--- a/dev-docs/workflows/github-actions.md
+++ b/dev-docs/workflows/github-actions.md
@@ -9,12 +9,13 @@ gh workflow run e2e-test-manual.yml \
     --ref feat/e2e_pipeline \                       # On your specific branch!
     -F cloudProvider=gcp \   # With your ...
     -F controlNodesCount=1 -F workerNodesCount=2 \  # ... settings
-    -F machineType=n2d-standard-4
+    -F machineType=n2d-standard-4 \
+    -F test=nop
 ```
 
 ### E2E Test Suites
 
-Here are some examples for test suits you might want to run. Values for `sonobuoyTestSuiteCmd`:
+Here are some examples for test suites you might want to run. Values for `sonobuoyTestSuiteCmd`:
 
 * `--mode quick`
   * Runs a set of tests that are known to be quick to execute! (<1 min)