From d612ed2caedf5e850f4e6d948dde76ad4b70b4ae Mon Sep 17 00:00:00 2001
From: Christoph Meyer
Date: Sat, 22 Oct 2022 14:07:54 +0000
Subject: [PATCH] AB#2530 CI benchmarks compare to previous and generate graphs

- Get the previous benchmark results from the artifact store S3 bucket
- Compare the current benchmark to the previous results
- Attach markdown table comparing results to the workflow output
- Update benchmarks in bucket if running on main
- Generate graphs from comparison
- Document continuous benchmarking
---
 .github/actions/e2e_kbench/README.md          | 162 +++++++++++++++
 .github/actions/e2e_kbench/action.yml         |  52 ++++-
 .../actions/e2e_kbench/evaluate/__init__.py   |   0
 .../actions/e2e_kbench/evaluate/compare.py    | 131 ++++++++++++
 .../e2e_kbench/evaluate/evaluators/default.py |  11 +-
 .../e2e_kbench/evaluate/evaluators/fio.py     |  17 +-
 .../e2e_kbench/evaluate/evaluators/network.py |  14 +-
 .github/actions/e2e_kbench/evaluate/graph.py  | 194 ++++++++++++++++++
 .github/actions/e2e_kbench/evaluate/parse.py  |  82 ++++++++
 9 files changed, 639 insertions(+), 24 deletions(-)
 create mode 100644 .github/actions/e2e_kbench/README.md
 create mode 100644 .github/actions/e2e_kbench/evaluate/__init__.py
 create mode 100644 .github/actions/e2e_kbench/evaluate/compare.py
 create mode 100644 .github/actions/e2e_kbench/evaluate/graph.py
 create mode 100644 .github/actions/e2e_kbench/evaluate/parse.py

diff --git a/.github/actions/e2e_kbench/README.md b/.github/actions/e2e_kbench/README.md
new file mode 100644
index 000000000..b31349720
--- /dev/null
+++ b/.github/actions/e2e_kbench/README.md
@@ -0,0 +1,162 @@
+# K-Bench
+
+## Continuous Benchmarking
+The K-Bench action runs K-Bench benchmarks on Constellation clusters.
+The benchmark suite records storage, network, and Kubernetes API benchmarks.
+
+After testing, the action compares the results of the benchmarks to the previous results of Constellation on the same cloud provider. This makes it possible to track performance progression throughout development.
+
+The data of previous benchmarks is stored in the private S3 artifact store.
+
+To support encrypted storage, the action deploys the [Azure CSI](https://github.com/edgelesssys/constellation-azuredisk-csi-driver) and [GCP CSI](https://github.com/edgelesssys/constellation-gcp-compute-persistent-disk-csi-driver) drivers. It uses a [fork](https://github.com/edgelesssys/k-bench) of VMware's K-Bench. The fork deploys volumes that use the `encrypted-storage` storage class and adds support for authenticating against GCP, which is required to update the stored records for GKE.
+
+### Displaying Performance Progression
+The action creates a summary and attaches it to the workflow execution log.
+
+The table compares the current benchmark results of Constellation on the selected cloud provider to the previously stored records of Constellation on the same provider.
+
+The hashes of the two commits used for the comparison are listed above the table.
+
+Example table:
+
+<details>

- Commit of current benchmark: 8eb0a6803bc431bcebc2f6766ab2c6376500e106
- Commit of previous benchmark: 8f733daaf5c5509f024745260220d89ef8e6e440

| Benchmark suite | Current | Previous | Ratio |
|-|-|-|-|
| pod_create (ms) | 135 | 198 | 0.682 ⬇️ |
| pod_list (ms) | 100 | 99 | 1.01 ⬆️ |
| pod_get (ms) | 98 | 98 | 1.0 ⬆️ |
| pod_update (ms) | 187 | 132 | 1.417 ⬆️ |
| pod_delete (ms) | 119 | 108 | 1.102 ⬆️ |
| svc_create (ms) | 156 | 149 | 1.047 ⬆️ |
| svc_list (ms) | 97 | 96 | 1.01 ⬆️ |
| svc_get (ms) | 97 | 96 | 1.01 ⬆️ |
| svc_update (ms) | 100 | 101 | 0.99 ⬇️ |
| svc_delete (ms) | 143 | 139 | 1.029 ⬆️ |
| depl_create (ms) | 201 | 218 | 0.922 ⬇️ |
| depl_list (ms) | 101 | 101 | 1.0 ⬆️ |
| depl_update (ms) | 111 | 110 | 1.009 ⬆️ |
| depl_scale (ms) | 391 | 391 | 1.0 ⬆️ |
| depl_delete (ms) | 401 | 402 | 0.998 ⬇️ |
| net_internode_snd (Mbit/s) | 953.0 | 964.0 | 1.01 ⬆️ |
| net_intranode_snd (Mbit/s) | 18500.0 | 18600.0 | 1.01 ⬆️ |
| fio_root_async_R70W30_R (MiB/s) | 0.45 | 0.45 | 1.0 ⬆️ |
| fio_root_async_R70W30_W (MiB/s) | 0.20 | 0.20 | 1.0 ⬆️ |
| fio_root_async_R100W0_R (MiB/s) | 0.59 | 0.59 | 1.0 ⬆️ |
| fio_root_async_R0W100_W (MiB/s) | 1.18 | 1.18 | 1.0 ⬆️ |

</details>
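
The `Ratio` column is produced by `evaluate/compare.py`: for latency suites the ratio is current/previous, for throughput suites (network and fio) it is previous/current, so a ratio above 1 always points to a regression and the arrow only indicates the direction of the ratio. Condensed, the per-row logic looks roughly like the sketch below (the helper name `table_row` is illustrative and not part of the scripts):

```python
# Illustrative sketch of the row logic in evaluate/compare.py.
def table_row(name: str, unit: str, curr: float, prev: float, bigger_better: bool) -> str:
    # Throughput: previous/current; latency: current/previous -> ratio > 1 means regression.
    ratio = round(prev / curr if bigger_better else curr / prev, 3)
    arrow = '⬆️' if ratio >= 1 else '⬇️'
    return f'| {name} ({unit}) | {curr} | {prev} | {ratio} {arrow} |'

print(table_row('pod_create', 'ms', 135, 198, bigger_better=False))
# | pod_create (ms) | 135 | 198 | 0.682 ⬇️ |
```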

### Drawing Performance Charts
The action also draws the graphs used in the [Constellation docs](https://docs.edgeless.systems/constellation/next/overview/performance). The graphs compare the performance of Constellation to the performance of managed Kubernetes clusters.

Graphs are created with every run of the benchmarking action. The action attaches them to the `benchmarks` artifact of the workflow run.

## Updating Stored Records

### Managed Kubernetes
The stored benchmark records of managed Kubernetes clusters have to be updated manually:

### AKS
Follow the [Azure documentation](https://learn.microsoft.com/en-us/azure/aks/learn/quick-kubernetes-deploy-portal?tabs=azure-cli) to create an AKS cluster with the desired benchmark settings (region, instance types). If comparing against Constellation clusters with CVM instances, make sure to select the matching CVM instance type on Azure as well.

Once the cluster is ready, set up access via `kubectl` and take the benchmark:
```bash
# Setup
git clone https://github.com/edgelesssys/k-bench.git
cd k-bench && git checkout feat/constellation
./install.sh

# Remove the Constellation encrypted storage class (-i edits the files in place)
# Remember to revert this change before running K-Bench on Constellation!
yq -i 'del(.spec.storageClassName)' config/dp_fio/fio_pvc.yaml
yq -i 'del(.spec.storageClassName)' config/dp_netperf_internode/netperf_pvc.yml
yq -i 'del(.spec.storageClassName)' config/dp_network_internode/netperf_pvc.yaml
yq -i 'del(.spec.storageClassName)' config/dp_network_intranode/netperf_pvc.yml

# Run K-Bench
mkdir -p ./out
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-AKS" -t "default" -o "./out/"
kubectl delete namespace kbench-pod-namespace --wait=true || true
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-AKS" -t "dp_fio" -o "./out/"
kubectl delete namespace kbench-pod-namespace --wait=true || true
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-AKS" -t "dp_network_internode" -o "./out/"
kubectl delete namespace kbench-pod-namespace --wait=true || true
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-AKS" -t "dp_network_intranode" -o "./out/"

# Benchmarks done, do processing.
# parse.py expects the results in a directory named kbench-AKS.
mkdir -p "./out/kbench-AKS"
mv ./out/results_kbench-AKS_*m/* "./out/kbench-AKS/"

# Parse (paths are relative to the k-bench checkout; export the variables so parse.py can read them)
git clone https://github.com/edgelesssys/constellation.git
mkdir -p benchmarks
export BDIR=benchmarks
export EXT_NAME=AKS
export KBENCH_RESULTS=./out/

python constellation/.github/actions/e2e_kbench/evaluate/parse.py

# Upload result to S3
S3_PATH=s3://edgeless-artifact-store/constellation/benchmarks
aws s3 cp benchmarks/AKS.json ${S3_PATH}/AKS.json
```

### GKE
Create a GKE cluster with the desired benchmark settings (region, instance types). If comparing against Constellation clusters with CVM instances, make sure to select the matching CVM instance type on GCP and enable **confidential** VMs as well.

Once the cluster is ready, set up access via `kubectl` and take the benchmark:
```bash
# Setup
git clone https://github.com/edgelesssys/k-bench.git
cd k-bench && git checkout feat/constellation
./install.sh

# Remove the Constellation encrypted storage class (-i edits the files in place)
# Remember to revert this change before running K-Bench on Constellation!
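# (Tip: the modified manifests can be restored afterwards with: git checkout -- config/)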
yq -i 'del(.spec.storageClassName)' config/dp_fio/fio_pvc.yaml
yq -i 'del(.spec.storageClassName)' config/dp_netperf_internode/netperf_pvc.yml
yq -i 'del(.spec.storageClassName)' config/dp_network_internode/netperf_pvc.yaml
yq -i 'del(.spec.storageClassName)' config/dp_network_intranode/netperf_pvc.yml

# Run K-Bench
mkdir -p ./out
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-GKE" -t "default" -o "./out/"
kubectl delete namespace kbench-pod-namespace --wait=true || true
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-GKE" -t "dp_fio" -o "./out/"
kubectl delete namespace kbench-pod-namespace --wait=true || true
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-GKE" -t "dp_network_internode" -o "./out/"
kubectl delete namespace kbench-pod-namespace --wait=true || true
kubectl create namespace kbench-pod-namespace --dry-run=client -o yaml | kubectl apply -f -
./run.sh -r "kbench-GKE" -t "dp_network_intranode" -o "./out/"

# Benchmarks done, do processing.
# parse.py expects the results in a directory named kbench-GKE.
mkdir -p "./out/kbench-GKE"
mv ./out/results_kbench-GKE_*m/* "./out/kbench-GKE/"

# Parse (paths are relative to the k-bench checkout; export the variables so parse.py can read them)
git clone https://github.com/edgelesssys/constellation.git
mkdir -p benchmarks
export BDIR=benchmarks
export EXT_NAME=GKE
export KBENCH_RESULTS=./out/

python constellation/.github/actions/e2e_kbench/evaluate/parse.py

# Upload result to S3
S3_PATH=s3://edgeless-artifact-store/constellation/benchmarks
aws s3 cp benchmarks/GKE.json ${S3_PATH}/GKE.json
```

### Constellation
The action updates the stored Constellation records for the selected cloud provider when running on the main branch.
diff --git a/.github/actions/e2e_kbench/action.yml b/.github/actions/e2e_kbench/action.yml
index 1c20946fe..a32cae6e5 100644
--- a/.github/actions/e2e_kbench/action.yml
+++ b/.github/actions/e2e_kbench/action.yml
@@ -11,6 +11,7 @@ inputs:
 runs:
   using: "composite"
+
   steps:
     - name: Setup python
       uses: actions/setup-python@13ae5bb136fac2878aff31522b9efb785519f984 # tag=v4.3.0
@@ -79,18 +80,57 @@ runs:
         path: "k-bench/out/kbench-constellation-${{ inputs.cloudProvider }}"
         name: "k-bench-constellation-${{ inputs.cloudProvider }}"
 
-    - name: Parse test results and create diagrams
+    - name: Assume AWS role to retrieve and update benchmarks in S3
+      uses: aws-actions/configure-aws-credentials@67fbcbb121271f7775d2e7715933280b06314838 # tag=v1.7.0
+      with:
+        role-to-assume: arn:aws:iam::795746500882:role/GithubActionUpdateBenchmarks
+        aws-region: us-east-2
+
+    - name: Set S3 artifact store
       shell: bash
-      run: python .github/actions/e2e_kbench/evaluate/main.py
+      run: echo S3_PATH=s3://${ARTIFACT_BUCKET_CONSTELLATION}/benchmarks >> $GITHUB_ENV
+      env:
+        ARTIFACT_BUCKET_CONSTELLATION: "edgeless-artifact-store/constellation"
+
+    - name: Get previous benchmark records from S3
+      shell: bash
+      run: |
+        mkdir -p benchmarks
+        aws s3 cp --recursive ${S3_PATH} benchmarks --no-progress
+        mv benchmarks/constellation-${CSP}.json benchmarks/constellation-${CSP}-previous.json
       env:
         KBENCH_RESULTS: ${{ github.workspace }}/k-bench/out/
         CSP: ${{ inputs.cloudProvider }}
 
-    - name: Upload benchmark results
+    - name: Parse K-Bench results, create diagrams and post the progression summary
+      shell: bash
+      run: |
+        python .github/actions/e2e_kbench/evaluate/parse.py
+        python .github/actions/e2e_kbench/evaluate/graph.py
+        python .github/actions/e2e_kbench/evaluate/compare.py >> $GITHUB_STEP_SUMMARY
+      env:
+        # Original
K-Bench result directory + KBENCH_RESULTS: k-bench/out/ + # Working directory containing the previous results as JSON and to contain the graphs + BDIR: benchmarks + # Paths to benchmark results as JSON of the previous run and the current run + PREV_BENCH: benchmarks/constellation-${{ inputs.cloudProvider }}-previous.json + CURR_BENCH: benchmarks/constellation-${{ inputs.cloudProvider }}.json + CSP: ${{ inputs.cloudProvider }} + + - name: Upload benchmark results and graphs to action run uses: actions/upload-artifact@83fd05a356d7e2593de66fc9913b3002723633cb # tag=v3.1.1 if: ${{ !env.ACT }} with: path: | - *_perf.png - kbench_results.json - name: "benchmark_results" + benchmarks/*_perf.png + benchmarks/constellation-${{ inputs.cloudProvider }}.json + name: "benchmarks" + + - name: Update benchmark records in S3 + shell: bash + run: | + aws s3 cp benchmarks/constellation-${CSP}.json ${S3_PATH}/constellation-${CSP}.json + env: + CSP: ${{ inputs.cloudProvider }} + if: github.ref == 'refs/heads/main' diff --git a/.github/actions/e2e_kbench/evaluate/__init__.py b/.github/actions/e2e_kbench/evaluate/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/.github/actions/e2e_kbench/evaluate/compare.py b/.github/actions/e2e_kbench/evaluate/compare.py new file mode 100644 index 000000000..4d536fdb8 --- /dev/null +++ b/.github/actions/e2e_kbench/evaluate/compare.py @@ -0,0 +1,131 @@ +"""Compare the current benchmark data against the previous.""" +import os +import json +from typing import Tuple + +# Progress indicator icons +PROGRESS = ['⬇️', '⬆️'] + +# List of benchmarks for which higher numbers are better +BIGGER_BETTER = [ + 'net_internode_snd', + 'net_intranode_snd', + 'fio_root_async_R70W30_R', + 'fio_root_async_R70W30_W', + 'fio_root_async_R100W0_R', + 'fio_root_async_R0W100_W', +] + +# Lookup for test suite -> unit +UNIT_STR = { + 'net_internode_snd': 'Mbit/s', + 'net_intranode_snd': 'Mbit/s', + 'fio_root_async_R70W30_R': 'MiB/s', + 'fio_root_async_R70W30_W': 'MiB/s', + 'fio_root_async_R100W0_R': 'MiB/s', + 'fio_root_async_R0W100_W': 'MiB/s', +} +# API units are ms, so this is shorter than cluttering the dictionary: +API_UNIT_STR = "ms" + + +def is_bigger_better(bench_suite: str) -> bool: + return bench_suite in BIGGER_BETTER + + +def get_paths() -> Tuple[str, str]: + """Read the benchmark data paths. + + Expects ENV vars (required): + - PREV_BENCH=/path/to/previous.json + - CURR_BENCH=/path/to/current.json + + Raises TypeError if at least one of them is missing. + + Returns: a tuple of (prev_bench_path, curr_bench_path). + """ + path_prev = os.environ.get('PREV_BENCH', None) + path_curr = os.environ.get('CURR_BENCH', None) + if not path_prev or not path_curr: + raise TypeError( + 'Both ENV variables PREV_BENCH and CURR_BENCH are required.') + return path_prev, path_curr + + +def main() -> None: + """Compare the current benchmark data against the previous. + + Create a markdown table showing the benchmark progressions. + + Print the result to stdout. 
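+
+    Example output row (illustrative; mirrors the README example table):
+    | pod_create (ms) | 135 | 198 | 0.682 ⬇️ |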
+ """ + path_prev, path_curr = get_paths() + try: + with open(path_prev) as f_prev: + bench_prev = json.load(f_prev) + with open(path_curr) as f_curr: + bench_curr = json.load(f_curr) + except OSError as e: + raise ValueError('Failed reading benchmark file: {e}'.format(e=e)) + + try: + name = bench_curr['subject'] + except KeyError: + raise ValueError( + 'Current benchmark record file does not contain subject.') + try: + prev_name = bench_prev['subject'] + except KeyError: + raise ValueError( + 'Previous benchmark record file does not contain subject.') + if name != prev_name: + raise ValueError( + 'Cloud providers of previous and current benchmark data do not match.') + + if 'kbench' not in bench_prev.keys() or 'kbench' not in bench_curr.keys(): + raise ValueError('Benchmarks do not both contain K-Bench records.') + + md_lines = [ + '# {name}'.format(name=name), + '', + '
', + '', + '- Commit of current benchmark: {ch}'.format(ch=bench_curr['commit']), + '- Commit of previous benchmark: {ch}'.format(ch=bench_prev['commit']), + '', + '| Benchmark suite | Current | Previous | Ratio |', + '|-|-|-|-|', + ] + + for subtest, _ in bench_prev['kbench'].items(): + if subtest not in bench_curr['kbench']: + raise ValueError( + 'Benchmark record from previous benchmark not in current.') + val_prev = bench_prev['kbench'][subtest] + val_curr = bench_curr['kbench'][subtest] + + # get unit string or use default API unit string + unit = UNIT_STR.get(subtest, API_UNIT_STR) + + if val_curr == 0 or val_prev == 0: + ratio = 'N/A' + else: + if is_bigger_better(bench_suite=subtest): + ratio_num = val_prev / val_curr + else: + ratio_num = val_curr / val_prev + ratio_num = round(ratio_num, 3) + emoji = PROGRESS[int(ratio_num >= 1)] + ratio = '{ratio} {emoji}'.format(ratio=ratio_num, emoji=emoji) + + line = '| {test} ({unit}) | {val_curr} | {val_prev} | {ratio} |'.format( + test=subtest, unit=unit, val_curr=val_curr, val_prev=val_prev, ratio=ratio, + ) + md_lines.append(line) + + md_lines += ['', '
'] + print('\n'.join(md_lines)) + + +if __name__ == '__main__': + main() diff --git a/.github/actions/e2e_kbench/evaluate/evaluators/default.py b/.github/actions/e2e_kbench/evaluate/evaluators/default.py index 1fedf9c04..e471c2f9a 100644 --- a/.github/actions/e2e_kbench/evaluate/evaluators/default.py +++ b/.github/actions/e2e_kbench/evaluate/evaluators/default.py @@ -1,7 +1,6 @@ """Evaluator for the K-Bench default test.""" import os import re -from collections import defaultdict from typing import Dict pod_latencies = { @@ -29,14 +28,14 @@ service_latencies = { } -def eval(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: +def evaluate(tests: Dict[str, str]) -> Dict[str, Dict[str, int]]: """Read the results of the default tests. Return a result dictionary. """ result = {} for t in tests: - row = defaultdict(float) + row = {} # read the default result file kbench = [] with open(os.path.join(tests[t], 'default', 'kbench.log'), 'r') as f: @@ -52,7 +51,8 @@ def eval(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: line = get_line_containing_needle( lines=kbench, needle=latency_dict[key]) median = get_median_from_line(line=line) - row[key] = float(median) + # round API latency to full ms granularity + row[key] = round(float(median)) result[t] = row return result @@ -67,5 +67,6 @@ def get_line_containing_needle(lines, needle): """Find matching line from list of lines.""" matches = list(filter(lambda l: needle in l, lines)) if len(matches) > 1: - raise Exception(f"'{needle}' matched multiple times..") + raise Exception( + "'{needle}' matched multiple times..".format(needle=needle)) return matches[0] diff --git a/.github/actions/e2e_kbench/evaluate/evaluators/fio.py b/.github/actions/e2e_kbench/evaluate/evaluators/fio.py index 38e2b986b..2435d9366 100644 --- a/.github/actions/e2e_kbench/evaluate/evaluators/fio.py +++ b/.github/actions/e2e_kbench/evaluate/evaluators/fio.py @@ -14,7 +14,6 @@ Run status group 0 (all jobs): import os import re -from collections import defaultdict from pathlib import Path from typing import Dict @@ -26,26 +25,28 @@ subtests = { } -def eval(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: +def evaluate(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: """Read the results of the fio tests. Return a result dictionary. 
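
    Example return value (illustrative):
    {'constellation-azure': {'fio_root_async_R70W30_R': 0.45, ...}}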
""" result = {} for t in tests: base_path = os.path.join(tests[t], 'dp_fio') - row = defaultdict(str) + row = {} for subtest in subtests: try: log_path = next(Path(base_path).rglob(subtests[subtest])) except StopIteration: raise Exception( - f"Error: No iperfclient.out found for network test {subtest} in {base_path}" + "Error: No iperfclient.out found for network test {subtest} in {base_path}".format( + subtest=subtest, base_path=base_path) ) with open(log_path) as f: fio = f.readlines() if not fio: - raise Exception(f"Empty fio log {subtest}?") + raise Exception( + "Empty fio log {subtest}?".format(subtest=subtest)) for line in fio: if "READ" in line: @@ -58,7 +59,7 @@ def eval(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: return result -# Dictionary to convert units +# Dictionary for conversion to MiB units = { 'KiB': 1/1024, 'MiB': 1, @@ -76,6 +77,8 @@ def get_io_bw_from_line(line) -> float: if not match: raise Exception("Could not extract bw from fio line.") num = float(match.group(1)) + + # return in MiB/s with 2 decimal digits num = num * units[match.group(2)] - # return in MiB/s + num = round(num, 2) return num diff --git a/.github/actions/e2e_kbench/evaluate/evaluators/network.py b/.github/actions/e2e_kbench/evaluate/evaluators/network.py index 7830ec365..2864728f9 100644 --- a/.github/actions/e2e_kbench/evaluate/evaluators/network.py +++ b/.github/actions/e2e_kbench/evaluate/evaluators/network.py @@ -14,7 +14,6 @@ s1: iperf Done. """ import os import re -from collections import defaultdict from pathlib import Path from typing import Dict @@ -24,20 +23,21 @@ subtests = { } -def eval(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: +def evaluate(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: """Read the results of the network tests. Return a result dictionary. 
""" result = {} for t in tests: - row = defaultdict(str) + row = {} for subtest in subtests: base_path = os.path.join(tests[t], subtests[subtest]) try: log_path = next(Path(base_path).rglob('iperfclient.out')) except StopIteration: raise Exception( - f"Error: No iperfclient.out found for network test {subtest} in {base_path}" + "Error: No iperfclient.out found for network test {subtest} in {base_path}".format( + subtest=subtest, base_path=base_path) ) with open(log_path) as f: @@ -59,7 +59,7 @@ def eval(tests: Dict[str, str]) -> Dict[str, Dict[str, float]]: return result -# Dictionary to convert units +# Dictionary for conversion to Mbit units = { 'bits': 1e-6, 'Mbits': 1, @@ -78,6 +78,8 @@ def get_speed_from_line(line) -> float: if not match: raise Exception("Could not extract speed from iperf line.") num = float(match.group(1)) + + # return in Mbit/s with 2 decimal digits num = num * units[match.group(2)] - # return in Mbit/s + num = round(num, 2) return float(num) diff --git a/.github/actions/e2e_kbench/evaluate/graph.py b/.github/actions/e2e_kbench/evaluate/graph.py new file mode 100644 index 000000000..236d85191 --- /dev/null +++ b/.github/actions/e2e_kbench/evaluate/graph.py @@ -0,0 +1,194 @@ +"""Generate graphs comparing K-Bench benchmarks across cloud providers and Constellation.""" +import json +import os +from collections import defaultdict + +import numpy as np +from matplotlib import pyplot as plt + +SUBJECTS = [ + 'constellation-azure', + 'AKS', + 'constellation-gcp', + 'GKE', +] + +LEGEND_NAMES = [ + 'Constellation on Azure', + 'AKS', + 'Constellation on GCP', + 'GKE', +] + +BAR_COLORS = ['#90FF99', '#929292', '#8B04DD', '#000000'] + +# Rotate bar labels by X degrees +LABEL_ROTATE_BY = 30 +LABEL_FONTSIZE = 9 + +# Some lookup dictionaries for x axis +api_suffix = 'ms' +pod_key2header = { + 'pod_create': 'Pod Create', + 'pod_list': 'Pod List', + 'pod_get': 'Pod Get', + 'pod_update': 'Pod Update', + 'pod_delete': 'Pod Delete', +} +svc_key2header = { + 'svc_create': 'Service Create', + 'svc_list': 'Service List', + 'svc_update': 'Service Update', + 'svc_delete': 'Service Delete', + 'svc_get': 'Service Get', +} +depl_key2header = { + 'depl_create': 'Deployment Create', + 'depl_list': 'Deployment List', + 'depl_update': 'Deployment Update', + 'depl_scale': 'Deployment Scale', + 'depl_delete': 'Deployment Delete', +} + +fio_suffix = 'MiB/s' +fio_key2header = { + 'fio_root_async_R70W30_R': 'async_R70W30 mix,\n seq. reads', + 'fio_root_async_R70W30_W': 'async_R70W30 mix,\n seq. writes', + 'fio_root_async_R100W0_R': 'async_R100W0 mix,\n seq. reads', + 'fio_root_async_R0W100_W': 'async_R0W100 mix,\n seq. writes', +} + +net_suffix = 'Mbit/s' +net_key2header = { + 'net_internode_snd': 'iperf internode \n send ({net_suffix})'.format(net_suffix=net_suffix), + 'net_intranode_snd': 'iperf intranode \n send ({net_suffix})'.format(net_suffix=net_suffix), +} + + +def configure() -> str: + """Read the benchmark data paths. + + Expects ENV vars (required): + - BDIR=benchmarks + + Raises TypeError if at least one of them is missing. + + Returns: out_dir + """ + out_dir = os.environ.get('BDIR', None) + if not out_dir: + raise TypeError( + 'ENV variables BDIR is required.') + return out_dir + + +def bar_chart(data, headers, title='', suffix='', val_label=True, y_log=False): + """Draws a bar chart with multiple bars per data point. + + Args: + data (dict[str, list]): Benchmark data dictionary: subject -> lists of value points + headers (list): List of headers (x-axis). 
+ title (str, optional): The title for the chart. Defaults to "". + suffix (str, optional): The suffix for values e.g. "MiB/s". Defaults to "". + val_label (bool, optional): Put a label of the value over the bar chart. Defaults to True. + y_log (bool, optional): Set the y-axis to a logarithmic scale. Defaults to False. + Returns: + fig (matplotlib.pyplot.figure): The pyplot figure + """ + fig, ax = plt.subplots(figsize=(10, 5)) + fig.patch.set_facecolor('white') + + # Number of bars per group + n_bars = len(data) + + # The width of a single bar + bar_width = 0.8 / n_bars + + # List containing handles for the drawn bars, used for the legend + bars = [] + + # Iterate over all data + for i, values in enumerate(data.values()): + # The offset in x direction of that bar + x_offset = (i - n_bars / 2) * bar_width + bar_width / 2 + + # Draw a bar for every value of that type + for x, y in enumerate(values): + bar = ax.bar(x + x_offset, y, width=bar_width * 0.9, + color=BAR_COLORS[i % len(BAR_COLORS)], edgecolor='black') + if val_label: + ax.bar_label(bar, padding=1, + fmt='%g {suffix}'.format(suffix=suffix)) + # Add a handle to the last drawn bar, which we'll need for the legend + bars.append(bar[0]) + # Draw legend + ax.legend(bars, LEGEND_NAMES) + if y_log: + ax.set_yscale('log') + ax.set_xticks(np.arange(len(headers))) + ax.set_xticklabels(headers) + + plt.setp(ax.get_xticklabels(), fontsize=LABEL_FONTSIZE, + rotation=LABEL_ROTATE_BY) + plt.title('{title} ({suffix})'.format(title=title, suffix=suffix)) + plt.tight_layout() + return fig + + +def main(): + """Read the files and create diagrams.""" + out_dir = configure() + combined_results = defaultdict(dict) + + for test in SUBJECTS: + # Read the previous results + read_path = os.path.join( + out_dir, '{subject}.json'.format(subject=test)) + try: + with open(read_path, 'r') as res_file: + combined_results[test].update(json.load(res_file)) + except OSError as e: + raise ValueError( + 'Failed reading {subject} benchmark records: {e}'.format(subject=test, e=e)) + + # Combine the evaluation of the Kubernetes API benchmarks + for i, api in enumerate([pod_key2header, svc_key2header, depl_key2header]): + api_data = {} + for s in SUBJECTS: + points = combined_results[s]['kbench'] + subject_data = [points[h] for h in api] + api_data[s] = subject_data + hdrs = list(api.values()) + bar_chart(data=api_data, headers=hdrs, + title='API Latency', suffix=api_suffix, y_log=True) + + save_name = os.path.join(out_dir, 'api_{i}_perf.png'.format(i=i)) + plt.savefig(save_name, bbox_inches='tight') + + # Network chart + net_data = {} + for s in SUBJECTS: + points = combined_results[s]['kbench'] + subject_data = [points[h] for h in net_key2header] + net_data[s] = subject_data + hdrs = list(net_key2header.values()) + bar_chart(data=net_data, headers=hdrs, + title='Network Throughput', suffix=net_suffix, y_log=True) + save_name = os.path.join(out_dir, 'net_perf.png') + plt.savefig(save_name, bbox_inches='tight') + + # fio chart + fio_data = {} + for s in SUBJECTS: + points = combined_results[s]['kbench'] + subject_data = [points[h] for h in fio_key2header] + fio_data[s] = subject_data + hdrs = list(fio_key2header.values()) + bar_chart(data=fio_data, headers=hdrs, + title='Storage Throughput', suffix=fio_suffix, y_log=True) + save_name = os.path.join(out_dir, 'storage_perf.png') + plt.savefig(save_name, bbox_inches='tight') + + +if __name__ == '__main__': + main() diff --git a/.github/actions/e2e_kbench/evaluate/parse.py b/.github/actions/e2e_kbench/evaluate/parse.py new 
file mode 100644 index 000000000..76f7a9dde --- /dev/null +++ b/.github/actions/e2e_kbench/evaluate/parse.py @@ -0,0 +1,82 @@ +"""Parse logs of K-Bench tests and generate performance graphs.""" +import json +import os +from collections import defaultdict +from typing import Tuple + +from evaluators import default, fio, network + + +def configure() -> Tuple[str, str, str, str | None, str]: + """Read the benchmark data paths. + + Expects ENV vars (required): + - KBENCH_RESULTS=/path/to/k-bench/out + - CSP=azure + - BDIR=benchmarks + + Optional: + - EXT_NAME=AKS # Overrides "constellation-$CSP" naming to parse results from managed Kubernetes + - GITHUB_SHA=ffac5... # Set by GitHub actions, stored in the result JSON. + + Raises TypeError if at least one of them is missing. + + Returns: a tuple of (base_path, csp, out_dir, ext_provider_name). + """ + base_path = os.environ.get('KBENCH_RESULTS', None) + csp = os.environ.get('CSP', None) + out_dir = os.environ.get('BDIR', None) + if not base_path or not csp or not out_dir: + raise TypeError( + 'ENV variables KBENCH_RESULTS, CSP, BDIR are required.') + + ext_provider_name = os.environ.get('EXT_NAME', None) + commit_hash = os.environ.get('GITHUB_SHA', 'N/A') + return base_path, csp, out_dir, ext_provider_name, commit_hash + + +def main() -> None: + """Read and parse the K-Bench tests. + + Write results of the current environment to a JSON file. + """ + base_path, csp, out_dir, ext_provider_name, commit_hash = configure() + + if ext_provider_name is None: + # Constellation benchmark. + ext_provider_name = 'constellation-{csp}'.format(csp=csp) + + # Expect the results in directory: + # kbench-EXT_PROVIDER_NAME/ + benchmark_path = os.path.join( + base_path, + 'kbench-{csp}'.format(csp=ext_provider_name), + ) + tests = {ext_provider_name: benchmark_path} + out_file_name = '{nm}.json'.format(nm=ext_provider_name) + + if not os.path.exists(benchmark_path): + raise ValueError( + 'Benchmarks do not exist at {path}.'.format(path=benchmark_path)) + + # Parse subtest + default_results = default.evaluate(tests=tests) + network_results = network.evaluate(tests=tests) + fio_results = fio.evaluate(tests=tests) + + combined_results = {'commit': commit_hash, + 'subject': ext_provider_name, 'kbench': {}} + + for test in tests: + combined_results['kbench'].update(default_results[test]) + combined_results['kbench'].update(network_results[test]) + combined_results['kbench'].update(fio_results[test]) + + # Write the compact results. + save_path = os.path.join(out_dir, out_file_name) + with open(save_path, 'w') as w: + json.dump(combined_results, fp=w, sort_keys=False, indent=2) + + +if __name__ == '__main__': + main()
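+
+# Example local invocation (assumed paths; the CI action sets the same
+# variables in action.yml):
+#   KBENCH_RESULTS=k-bench/out/ CSP=azure BDIR=benchmarks \
+#     python .github/actions/e2e_kbench/evaluate/parse.py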