From 63b9761962cbd673a6522387da326733ff8a7ed9 Mon Sep 17 00:00:00 2001
From: Malte Poll <1780588+malt3@users.noreply.github.com>
Date: Fri, 8 Mar 2024 15:05:15 +0100
Subject: [PATCH] docs: explain recovery steps on STACKIT
---
docs/docs/workflows/recovery.md | 31 +++++++++++++++++++
.../version-2.16/workflows/recovery.md | 31 +++++++++++++++++++
2 files changed, 62 insertions(+)
diff --git a/docs/docs/workflows/recovery.md b/docs/docs/workflows/recovery.md
index 9396bf8f2..9bbb32652 100644
--- a/docs/docs/workflows/recovery.md
+++ b/docs/docs/workflows/recovery.md
@@ -118,6 +118,37 @@ If this fails due to an unhealthy control plane, you will see log messages simil
This means that you have to recover the node manually.
+
+
+
+First, open the STACKIT portal to view all servers in your project. Select individual control plane nodes `--control-plane--` and check that enough members are in a *Running* state.
+
+Second, check the boot logs of these servers. Click on a server name and select **Overview**. Find the **Machine Setup** section and click on **Web console** > **Open console**.
+
+In the serial console output, search for `Waiting for decryption key`.
+Similar output to the following means your node was restarted and needs to decrypt the [state disk](../architecture/images.md#state-disk):
+
+```json
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","caller":"cmd/main.go:55","msg":"Starting disk-mapper","version":"2.0.0","cloudProvider":"gcp"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"setupManager","caller":"setup/setup.go:72","msg":"Preparing existing state disk"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:65","msg":"Starting RejoinClient"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"recoveryServer","caller":"recoveryserver/server.go:59","msg":"Starting RecoveryServer"}
+```
+
+The node will then try to connect to the [*JoinService*](../architecture/microservices.md#joinservice) and obtain the decryption key.
+If this fails due to an unhealthy control plane, you will see log messages similar to the following:
+
+```json
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:77","msg":"Received list with JoinService endpoints","endpoints":["192.168.178.4:30090","192.168.178.2:30090"]}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:96","msg":"Requesting rejoin ticket","endpoint":"192.168.178.4:30090"}
+{"level":"WARN","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:101","msg":"Failed to rejoin on endpoint","error":"rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp 192.168.178.4:30090: connect: connection refused\"","endpoint":"192.168.178.4:30090"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:96","msg":"Requesting rejoin ticket","endpoint":"192.168.178.2:30090"}
+{"level":"WARN","ts":"2022-09-08T10:22:13Z","logger":"rejoinClient","caller":"rejoinclient/client.go:101","msg":"Failed to rejoin on endpoint","error":"rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp 192.168.178.2:30090: i/o timeout\"","endpoint":"192.168.178.2:30090"}
+{"level":"ERROR","ts":"2022-09-08T10:22:13Z","logger":"rejoinClient","caller":"rejoinclient/client.go:110","msg":"Failed to rejoin on all endpoints"}
+```
+
+This means that you have to recover the node manually.
+
diff --git a/docs/versioned_docs/version-2.16/workflows/recovery.md b/docs/versioned_docs/version-2.16/workflows/recovery.md
index 9396bf8f2..b7224a141 100644
--- a/docs/versioned_docs/version-2.16/workflows/recovery.md
+++ b/docs/versioned_docs/version-2.16/workflows/recovery.md
@@ -118,6 +118,37 @@ If this fails due to an unhealthy control plane, you will see log messages simil
This means that you have to recover the node manually.
+
+
+
+First, open the STACKIT portal to view all servers in your project. Select individual control plane nodes `--control-plane--` and check that enough members are in a *Running* state.
+
+Second, check the boot logs of these *Servers*. Click on a server name and select **Overview**. Find the **Machine Setup** section and click on **Web console** > **Open console**.
+
+In the serial console output, search for `Waiting for decryption key`.
+Similar output to the following means your node was restarted and needs to decrypt the [state disk](../architecture/images.md#state-disk):
+
+```json
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","caller":"cmd/main.go:55","msg":"Starting disk-mapper","version":"2.0.0","cloudProvider":"gcp"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"setupManager","caller":"setup/setup.go:72","msg":"Preparing existing state disk"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:65","msg":"Starting RejoinClient"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"recoveryServer","caller":"recoveryserver/server.go:59","msg":"Starting RecoveryServer"}
+```
+
+The node will then try to connect to the [*JoinService*](../architecture/microservices.md#joinservice) and obtain the decryption key.
+If this fails due to an unhealthy control plane, you will see log messages similar to the following:
+
+```json
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:77","msg":"Received list with JoinService endpoints","endpoints":["192.168.178.4:30090","192.168.178.2:30090"]}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:96","msg":"Requesting rejoin ticket","endpoint":"192.168.178.4:30090"}
+{"level":"WARN","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:101","msg":"Failed to rejoin on endpoint","error":"rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp 192.168.178.4:30090: connect: connection refused\"","endpoint":"192.168.178.4:30090"}
+{"level":"INFO","ts":"2022-09-08T10:21:53Z","logger":"rejoinClient","caller":"rejoinclient/client.go:96","msg":"Requesting rejoin ticket","endpoint":"192.168.178.2:30090"}
+{"level":"WARN","ts":"2022-09-08T10:22:13Z","logger":"rejoinClient","caller":"rejoinclient/client.go:101","msg":"Failed to rejoin on endpoint","error":"rpc error: code = Unavailable desc = connection error: desc = \"transport: Error while dialing dial tcp 192.168.178.2:30090: i/o timeout\"","endpoint":"192.168.178.2:30090"}
+{"level":"ERROR","ts":"2022-09-08T10:22:13Z","logger":"rejoinClient","caller":"rejoinclient/client.go:110","msg":"Failed to rejoin on all endpoints"}
+```
+
+This means that you have to recover the node manually.
+