docs: group perf graphics by csp
.github/actions/e2e_benchmark/evaluate/graph.py
@@ -10,20 +10,12 @@ import numpy as np
 from matplotlib import pyplot as plt
 from matplotlib import font_manager as fm
 
 
-SUBJECTS = [
-    'constellation-azure',
-    'AKS',
-    'constellation-gcp',
-    'GKE',
-]
+SUBJECTS_AZURE = ['constellation-azure', 'AKS']
+SUBJECTS_GCP = ['constellation-gcp', 'GKE']
 
-LEGEND_NAMES = [
-    'Constellation on Azure',
-    'AKS',
-    'Constellation on GCP',
-    'GKE',
-]
+LEGEND_NAMES_AZURE = ['Constellation', 'AKS']
+LEGEND_NAMES_GCP = ['Constellation', 'GKE']
 
 BAR_COLORS = ['#90FF99', '#929292', '#8B04DD', '#000000']
 
@@ -33,7 +25,7 @@ FONT_SIZE = 13
 
 # Some lookup dictionaries for x axis
 fio_iops_unit = 'IOPS'
-fio_bw_unit = 'KiB/s'
+fio_bw_unit = 'MiB/s'
 
 net_unit = 'Mbit/s'
 
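
A note on the unit switch above: `fio` reports bandwidth in KiB/s (`bw_kbytes`), and later in this diff those raw values are divided by 1024 before plotting so the charts can be labeled in MiB/s. A minimal sketch of that conversion, with a made-up input value:

```python
# fio reports bandwidth as bw_kbytes (KiB/s); the updated charts divide by 1024
# and label the axis in MiB/s. The input value below is hypothetical.
read_bw_kib_per_s = 132_450  # e.g. taken from ['fio']['read_bw']['bw_kbytes']
read_bw_mib_per_s = int(read_bw_kib_per_s / 1024)
print(f"{read_bw_kib_per_s} KiB/s is about {read_bw_mib_per_s} MiB/s")
```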
@@ -123,7 +115,7 @@ def main():
     out_dir = configure()
     combined_results = defaultdict(dict)
 
-    for test in SUBJECTS:
+    for test in SUBJECTS_AZURE+SUBJECTS_GCP:
         # Read the previous results
         read_path = os.path.join(
             out_dir, '{subject}.json'.format(subject=test))
@@ -135,91 +127,122 @@ def main():
             'Failed reading {subject} benchmark records: {e}'.format(subject=test, e=e))
 
     # Network charts
-    # P2P TCP
+    # P2P TCP + UDP Azure
     net_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        net_data[l] = int(combined_results[s]['knb']['pod2pod']['tcp_bw_mbit'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        net_data[l+" - TCP"] = int(combined_results[s]
+                                   ['knb']['pod2pod']['tcp_bw_mbit'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        net_data[l+" - UDP"] = int(combined_results[s]
+                                   ['knb']['pod2pod']['udp_bw_mbit'])
     bar_chart(data=net_data,
-              title='K8S CNI Benchmark - Pod to Pod - TCP - Bandwidth',
+              title='K8S CNI Benchmark - Pod to Pod - Azure - Bandwidth',
               unit=net_unit,
-              x_label=f" TCP Bandwidth in {net_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_net_p2p_tcp.png')
+              x_label=f"Bandwidth in {net_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_net_p2p_azure.png')
     plt.savefig(save_name)
 
-    # P2P TCP
+    # P2P TCP + UDP GCP
     net_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        net_data[l] = int(combined_results[s]['knb']['pod2pod']['udp_bw_mbit'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        net_data[l+" - TCP"] = int(combined_results[s]
+                                   ['knb']['pod2pod']['tcp_bw_mbit'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        net_data[l+" - UDP"] = int(combined_results[s]
+                                   ['knb']['pod2pod']['udp_bw_mbit'])
     bar_chart(data=net_data,
-              title='K8S CNI Benchmark - Pod to Pod - UDP - Bandwidth',
+              title='K8S CNI Benchmark - Pod to Pod - GCP - Bandwidth',
               unit=net_unit,
-              x_label=f" UDP Bandwidth in {net_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_net_p2p_udp.png')
+              x_label=f"Bandwidth in {net_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_net_p2p_gcp.png')
     plt.savefig(save_name)
 
-    # P2SVC TCP
+    # P2SVC TCP + UDP Azure
     net_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        net_data[l] = int(combined_results[s]['knb']['pod2svc']['tcp_bw_mbit'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        net_data[l+" - TCP"] = int(combined_results[s]
+                                   ['knb']['pod2svc']['tcp_bw_mbit'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        net_data[l+" - UDP"] = int(combined_results[s]
+                                   ['knb']['pod2svc']['udp_bw_mbit'])
     bar_chart(data=net_data,
-              title='K8S CNI Benchmark - Pod to Service - TCP - Bandwidth',
+              title='K8S CNI Benchmark - Pod to Service - Azure - Bandwidth',
               unit=net_unit,
-              x_label=f" TCP Bandwidth in {net_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_net_p2svc_tcp.png')
+              x_label=f"Bandwidth in {net_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_net_p2svc_azure.png')
     plt.savefig(save_name)
 
-    # P2SVC UDP
+    # P2P TCP + UDP GCP
     net_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        net_data[l] = int(combined_results[s]['knb']['pod2svc']['udp_bw_mbit'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        net_data[l+" - TCP"] = int(combined_results[s]
+                                   ['knb']['pod2svc']['tcp_bw_mbit'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        net_data[l+" - UDP"] = int(combined_results[s]
+                                   ['knb']['pod2svc']['udp_bw_mbit'])
     bar_chart(data=net_data,
-              title='K8S CNI Benchmark - Pod to Service - UDP - Bandwidth',
+              title='K8S CNI Benchmark - Pod to Service - GCP - Bandwidth',
               unit=net_unit,
-              x_label=f" UDP Bandwidth in {net_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_net_p2svc_udp.png')
+              x_label=f"Bandwidth in {net_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_net_p2svc_gcp.png')
     plt.savefig(save_name)
 
-    # FIO chart
-    # Read IOPS
+    # FIO charts
+
+    # IOPS on Azure
     fio_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        fio_data[l] = int(combined_results[s]['fio']['read_iops']['iops'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        fio_data[l+" - Read"] = int(combined_results[s]
+                                    ['fio']['read_iops']['iops'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        fio_data[l+" - Write"] = int(combined_results[s]
+                                     ['fio']['write_iops']['iops'])
     bar_chart(data=fio_data,
-              title='FIO Benchmark - Read - IOPS',
-              x_label=f" Read {fio_iops_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_fio_read_iops.png')
+              title='FIO Benchmark - Azure - IOPS',
+              x_label=f"{fio_iops_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_fio_azure_iops.png')
     plt.savefig(save_name)
 
-    # Write IOPS
+    # IOPS on GCP
     fio_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        fio_data[l] = int(combined_results[s]['fio']['write_iops']['iops'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        fio_data[l+" - Read"] = int(combined_results[s]
+                                    ['fio']['read_iops']['iops'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        fio_data[l+" - Write"] = int(combined_results[s]
+                                     ['fio']['write_iops']['iops'])
     bar_chart(data=fio_data,
-              title='FIO Benchmark - Write - IOPS',
-              x_label=f" Write {fio_iops_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_fio_write_iops.png')
+              title='FIO Benchmark - GCP - IOPS',
+              x_label=f"{fio_iops_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_fio_gcp_iops.png')
     plt.savefig(save_name)
 
-    # Read Bandwidth
+    # Bandwidth on Azure
     fio_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        fio_data[l] = int(combined_results[s]['fio']['read_bw']['bw_kbytes'])
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        fio_data[l+" - Read"] = int(combined_results[s]
+                                    ['fio']['read_bw']['bw_kbytes'] / 1024)
+    for s, l in zip(SUBJECTS_AZURE, LEGEND_NAMES_AZURE):
+        fio_data[l+" - Write"] = int(combined_results[s]
+                                     ['fio']['write_bw']['bw_kbytes'] / 1024)
     bar_chart(data=fio_data,
-              title='FIO Benchmark - Read - Bandwidth',
-              unit=fio_bw_unit,
-              x_label=f" Read Bandwidth in {fio_bw_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_fio_read_bw.png')
+              title='FIO Benchmark - Azure - Bandwidth',
+              x_label=f"Bandwidth in {fio_bw_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_fio_azure_bw.png')
     plt.savefig(save_name)
 
-    # Write Bandwidth
+    # Bandwidth on GCP
     fio_data = {}
-    for s, l in zip(SUBJECTS, LEGEND_NAMES):
-        fio_data[l] = int(combined_results[s]['fio']['write_bw']['bw_kbytes'])
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        fio_data[l+" - Read"] = int(combined_results[s]
+                                    ['fio']['read_bw']['bw_kbytes'] / 1024)
+    for s, l in zip(SUBJECTS_GCP, LEGEND_NAMES_GCP):
+        fio_data[l+" - Write"] = int(combined_results[s]
+                                     ['fio']['write_bw']['bw_kbytes'] / 1024)
     bar_chart(data=fio_data,
-              title='FIO Benchmark - Write - Bandwidth',
-              unit=fio_bw_unit,
-              x_label=f" Write Bandwidth in {fio_bw_unit} - Higher is better")
-    save_name = os.path.join(out_dir, 'benchmark_fio_write_bw.png')
+              title='FIO Benchmark - GCP - Bandwidth',
+              x_label=f"Bandwidth in {fio_bw_unit} - Higher is better")
+    save_name = os.path.join(out_dir, 'benchmark_fio_gcp_bw.png')
     plt.savefig(save_name)
 
 
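
For context, the sketch below shows the kind of per-CSP grouped chart the refactored script now produces: one figure per cloud, with Constellation and the managed offering side by side for TCP and UDP. It plots with matplotlib directly and uses made-up bandwidth values and a hypothetical `plot_grouped` helper rather than the repository's `bar_chart` function.

```python
# Illustrative sketch only: renders one grouped chart per CSP from hypothetical
# result values, mirroring the per-CSP grouping introduced in this PR. It does
# not use the repository's bar_chart() helper or real benchmark data.
from matplotlib import pyplot as plt

# Hypothetical bandwidth results in Mbit/s, keyed like the refactored script:
# "<legend name> - <protocol>".
azure_net_data = {
    'Constellation - TCP': 1244,
    'AKS - TCP': 1120,
    'Constellation - UDP': 1180,
    'AKS - UDP': 1090,
}

def plot_grouped(data: dict, title: str, x_label: str, out_file: str) -> None:
    """Render one horizontal bar chart for a single CSP and save it as a PNG."""
    fig, ax = plt.subplots(figsize=(8, 3))
    labels = list(data.keys())
    values = list(data.values())
    ax.barh(labels, values, color=['#90FF99', '#929292', '#8B04DD', '#000000'])
    ax.invert_yaxis()  # keep the first entry at the top of the chart
    ax.set_title(title)
    ax.set_xlabel(x_label)
    fig.tight_layout()
    fig.savefig(out_file)

plot_grouped(azure_net_data,
             title='K8S CNI Benchmark - Pod to Pod - Azure - Bandwidth',
             x_label='Bandwidth in Mbit/s - Higher is better',
             out_file='benchmark_net_p2p_azure.png')
```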
BIN  docs/docs/_media/benchmark_fio_azure_bw.png     (new file, 30 KiB)
BIN  docs/docs/_media/benchmark_fio_azure_iops.png   (new file, 29 KiB)
BIN  docs/docs/_media/benchmark_fio_gcp_bw.png       (new file, 30 KiB)
BIN  docs/docs/_media/benchmark_fio_gcp_iops.png     (new file, 30 KiB)
4 image files removed (file names not shown; 35 KiB, 29 KiB, 34 KiB, 29 KiB)
BIN  docs/docs/_media/benchmark_net_p2p_azure.png    (new file, 36 KiB)
BIN  docs/docs/_media/benchmark_net_p2p_gcp.png      (new file, 36 KiB)
2 image files removed (file names not shown; 36 KiB, 36 KiB)
BIN  docs/docs/_media/benchmark_net_p2svc_azure.png  (new file, 37 KiB)
BIN  docs/docs/_media/benchmark_net_p2svc_gcp.png    (new file, 38 KiB)
2 image files removed (file names not shown; 37 KiB, 37 KiB)
@@ -10,11 +10,11 @@ AMD and Azure jointly released a [performance benchmark](https://community.amd.c
 
 Similarly, AMD and Google jointly released a [performance benchmark](https://www.amd.com/system/files/documents/3rd-gen-epyc-gcp-c2d-conf-compute-perf-brief.pdf) for CVMs based on 3rd Gen AMD EPYC processors (Milan) with SEV-SNP. With high-performance computing workloads like WRF, NAMD, Ansys CFS, and Ansys LS_DYNA, they found similar results with only small (2%--4%) performance degradation compared to standard VMs. You can expect to see similar performance for compute-intensive workloads running with Constellation on GCP.
 
-## Performance impact from I/O and network
+## Performance impact from storage and network
 
-To assess the overall performance of Constellation, we benchmarked Constellation v2.6.0 in terms of storage I/O using [`fio`](https://fio.readthedocs.io/en/latest/fio_doc.html) and network performance using the [Kubernetes Network Benchmark](https://github.com/InfraBuilder/k8s-bench-suite#knb--kubernetes-network-be).
+To assess the overall performance of Constellation, this benchmark evaluates Constellation v2.6.0 in terms of storage I/O using [`fio`](https://fio.readthedocs.io/en/latest/fio_doc.html) and network performance using the [Kubernetes Network Benchmark](https://github.com/InfraBuilder/k8s-bench-suite#knb--kubernetes-network-be).
 
-We tested Constellation on Azure and GCP and compared the results against the managed Kubernetes offerings AKS and GKE.
+This benchmark tested Constellation on Azure and GCP and compared the results against the managed Kubernetes offerings AKS and GKE.
 
 ### Configurations
 
@@ -40,10 +40,10 @@ Constellation on GCP:
 
 #### AKS
 
-On AKS, we ran the benchmark with Kubernetes `v1.24.9` and nodes with version `AKSUbuntu-1804gen2containerd-2023.02.15`.
-The version we tested on AKS ran with the [`kubenet`](https://learn.microsoft.com/en-us/azure/aks/concepts-network#kubenet-basic-networking) CNI and the [default CSI driver](https://learn.microsoft.com/en-us/azure/aks/azure-disk-csi) for Azure Disk.
+On AKS, the benchmark used Kubernetes `v1.24.9` and nodes with version `AKSUbuntu-1804gen2containerd-2023.02.15`.
+AKS ran with the [`kubenet`](https://learn.microsoft.com/en-us/azure/aks/concepts-network#kubenet-basic-networking) CNI and the [default CSI driver](https://learn.microsoft.com/en-us/azure/aks/azure-disk-csi) for Azure Disk.
 
-We used the following infrastructure configurations.
+The following infrastructure configurations were used:
 
 - Nodes: 2 (2 Worker)
 - Machines: `D4as_v5`: 3rd Generation AMD EPYC 7763v (Milan) processor with 4 Cores, 16 GiB memory
@@ -53,10 +53,10 @@ We used the following infrastructure configurations.
 
 #### GKE
 
-On GKE, we used Kubernetes `v1.24.9` and nodes with version `1.24.9-gke.3200`.
-The version we tested on GKE ran with the [`kubenet`](https://cloud.google.com/kubernetes-engine/docs/concepts/network-overview) CNI and the [default CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver) for Compute Engine persistent disk.
+On GKE, the benchmark used Kubernetes `v1.24.9` and nodes with version `1.24.9-gke.3200`.
+GKE ran with the [`kubenet`](https://cloud.google.com/kubernetes-engine/docs/concepts/network-overview) CNI and the [default CSI driver](https://cloud.google.com/kubernetes-engine/docs/how-to/persistent-volumes/gce-pd-csi-driver) for Compute Engine persistent disk.
 
-We used the following infrastructure configurations.
+The following infrastructure configurations were used:
 
 - Nodes: 2 (2 Worker)
 - Machines: `n2d-standard-4` 2nd Generation AMD EPYC (Rome) processor with 4 Cores, 16 GiB of memory
@@ -68,13 +68,13 @@ We used the following infrastructure configurations.
 
 #### Network
 
-We performed a thorough analysis of the network performance of Constellation, specifically focusing on measuring TCP and UDP bandwidth.
+This section gives a thorough analysis of the network performance of Constellation, specifically focusing on measuring TCP and UDP bandwidth.
 The benchmark measured the bandwidth of pod-to-pod and pod-to-service connections between two different nodes using [`iperf`](https://iperf.fr/).
 
 GKE and Constellation on GCP had a maximum network bandwidth of [10 Gbps](https://cloud.google.com/compute/docs/general-purpose-machines#n2d_machines).
 AKS with `Standard_D4as_v5` machines has a maximum network bandwidth of [12.5 Gbps](https://learn.microsoft.com/en-us/azure/virtual-machines/dasv5-dadsv5-series#dasv5-series).
 The Confidential VM equivalent `Standard_DC4as_v5` currently has a network bandwidth of [1.25 Gbps](https://learn.microsoft.com/en-us/azure/virtual-machines/dcasv5-dcadsv5-series#dcasv5-series-products).
-Therefore, to make the test comparable we ran both AKS and Constellation on Azure with `Standard_DC4as_v5` machines and 1.25 Gbps bandwidth.
+Therefore, to make the test comparable, both AKS and Constellation on Azure were running with `Standard_DC4as_v5` machines and 1.25 Gbps bandwidth.
 
 Constellation on Azure and AKS used an MTU of 1500.
 Constellation on GCP used an MTU of 8896. GKE used an MTU of 1450.
@@ -100,17 +100,17 @@ flowchart LR
     Client ==>|traffic| Server
 ```
 
-The results for "Pod-to-Pod" TCP are as follows:
+The results for "Pod-to-Pod" on Azure are as follows:
 
-![Network Pod2Pod TCP benchmark graph](../_media/benchmark_net_p2p_tcp.png)
+![Network Pod2Pod Azure benchmark graph](../_media/benchmark_net_p2p_azure.png)
 
-The results for "Pod-to-Pod" UDP are as follows:
+The results for "Pod-to-Pod" on GCP are as follows:
 
-![Network Pod2Pod UDP benchmark graph](../_media/benchmark_net_p2p_udp.png)
+![Network Pod2Pod GCP benchmark graph](../_media/benchmark_net_p2p_gcp.png)
 
 ##### Pod-to-Service
 
-Tn this scenario, the client Pod connects to the server Pod via a ClusterIP service. This is more relevant to real-world use cases.
+In this scenario, the client Pod connects to the server Pod via a ClusterIP service. This is more relevant to real-world use cases.
 
 ```mermaid
 flowchart LR
@@ -123,17 +123,17 @@ flowchart LR
     Service ==>|traffic| Server
 ```
 
-The results for "Pod-to-Service" TCP are as follows:
+The results for "Pod-to-Service" on Azure are as follows:
 
-![Network Pod2SVC TCP benchmark graph](../_media/benchmark_net_p2svc_tcp.png)
+![Network Pod2SVC Azure benchmark graph](../_media/benchmark_net_p2svc_azure.png)
 
-The results for "Pod-to-Service" UDP are as follows:
+The results for "Pod-to-Service" on GCP are as follows:
 
-![Network Pod2SVC TCP benchmark graph](../_media/benchmark_net_p2svc_udp.png)
+![Network Pod2SVC GCP benchmark graph](../_media/benchmark_net_p2svc_gcp.png)
 
-Comparing Constellation on GCP with GKE, Constellation has 58% less TCP bandwidth.
-UDP bandwidth is slightly better with Constellation due to the higher MTU.
-Constellation on Azure compared against AKS with CVMs achieves ~10% less TCP and ~40% less UDP bandwidth.
+In our recent comparison of Constellation on GCP with GKE, Constellation has 58% less TCP bandwidth. However, UDP bandwidth was slightly better with Constellation, thanks to its higher MTU.
+Similarly, when comparing Constellation on Azure with AKS using CVMs, Constellation achieved approximately 10% less TCP and 40% less UDP bandwidth.
 #### Storage I/O
 
 Azure and GCP offer persistent storage for their Kubernetes services AKS and GKE via the Container Storage Interface (CSI). CSI storage in Kubernetes is available via `PersistentVolumes` (PV) and consumed via `PersistentVolumeClaims` (PVC).
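
For reference, the network charts above are built from one JSON results file per benchmark subject; the evaluation script looks up the `knb` bandwidth fields shown in the diff at the top of this PR. A minimal sketch of that lookup (the file name and printed values are hypothetical; the field paths mirror the script):

```python
# Minimal sketch: pull the knb bandwidth numbers that feed the network charts.
# The field paths ('knb' -> scenario -> metric) mirror the evaluation script in
# this PR; the file name is hypothetical.
import json

with open('constellation-azure.json') as f:
    results = json.load(f)

for scenario in ('pod2pod', 'pod2svc'):
    tcp_mbit = int(results['knb'][scenario]['tcp_bw_mbit'])
    udp_mbit = int(results['knb'][scenario]['udp_bw_mbit'])
    print(f"{scenario}: TCP {tcp_mbit} Mbit/s, UDP {udp_mbit} Mbit/s")
```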
@@ -141,16 +141,16 @@ Upon requesting persistent storage through a PVC, GKE and AKS will provision a P
 Constellation provides persistent storage on Azure and GCP [that's encrypted on the CSI layer](../architecture/encrypted-storage.md).
 Similarly, upon a PVC request, Constellation will provision a PV via a default storage class.
 
-For Constellation on Azure and AKS, we ran the benchmark with Azure Disk storage [Standard SSD](https://learn.microsoft.com/en-us/azure/virtual-machines/disks-types#standard-ssds) of 400 GiB size.
+For Constellation on Azure and AKS, the benchmark ran with Azure Disk storage [Standard SSD](https://learn.microsoft.com/en-us/azure/virtual-machines/disks-types#standard-ssds) of 400 GiB size.
 The [DC4as machine type](https://learn.microsoft.com/en-us/azure/virtual-machines/dasv5-dadsv5-series#dasv5-series) with four cores provides the following maximum performance:
 - 6400 (20000 burst) IOPS
 - 144 MB/s (600 MB/s burst) throughput
 
-However, the performance is bound by the capabilities of the [512 GiB Standard SSD size](https://learn.microsoft.com/en-us/azure/virtual-machines/disks-types#standard-ssds) (the size we get when we allocate 400 GiB volumes):
+However, the performance is bound by the capabilities of the [512 GiB Standard SSD size](https://learn.microsoft.com/en-us/azure/virtual-machines/disks-types#standard-ssds) (the size class of 400 GiB volumes):
 - 500 (600 burst) IOPS
 - 60 MB/s (150 MB/s burst) throughput
 
-For Constellation on GCP and GKE, we ran the benchmark with Compute Engine Persistent Disk Storage [pd-balanced](https://cloud.google.com/compute/docs/disks) of 400 GiB size.
+For Constellation on GCP and GKE, the benchmark ran with Compute Engine Persistent Disk Storage [pd-balanced](https://cloud.google.com/compute/docs/disks) of 400 GiB size.
 The N2D machine type with four cores and pd-balanced provides the following [maximum performance](https://cloud.google.com/compute/docs/disks/performance#n2d_vms):
 - 3,000 read IOPS
 - 15,000 write IOPS
@@ -164,7 +164,7 @@ However, the performance is bound by the capabilities of a [`Zonal balanced PD`]
 - 112 MB/s write throughput
 
 The [`fio`](https://fio.readthedocs.io/en/latest/fio_doc.html) benchmark consists of several tests.
-We used [`Kubestr`](https://github.com/kastenhq/kubestr) to run `fio` in Kubernetes.
+The benchmark used [`Kubestr`](https://github.com/kastenhq/kubestr) to run `fio` in Kubernetes.
 The default test performs randomized access patterns that accurately depict worst-case I/O scenarios for most applications.
 
 The following `fio` settings were used:
@@ -181,28 +181,30 @@ The following `fio` settings were used:
 For more details, see the [`fio` test configuration](../../../.github/actions/e2e_benchmark/fio.ini).
 
 
-The results for "Rand Read" IOPS are as follows:
+The results for IOPS on Azure are as follows:
 
-![I/O read IOPS benchmark graph](../_media/benchmark_fio_read_iops.png)
+![I/O IOPS Azure benchmark graph](../_media/benchmark_fio_azure_iops.png)
 
-The results for "Rand Write" IOPS are as follows:
+The results for IOPS on GCP are as follows:
 
-![I/O write IOPS benchmark graph](../_media/benchmark_fio_write_iops.png)
+![I/O IOPS GCP benchmark graph](../_media/benchmark_fio_gcp_iops.png)
 
-The results for "Rand Read" bandwidth are as follows:
+The results for bandwidth on Azure are as follows:
 
-![I/O read bandwidth benchmark graph](../_media/benchmark_fio_read_bw.png)
+![I/O bandwidth Azure benchmark graph](../_media/benchmark_fio_azure_bw.png)
 
-The results for "Rand Write" bandwidth are as follows:
+The results for bandwidth on GCP are as follows:
 
-![I/O write bandwidth benchmark graph](../_media/benchmark_fio_write_bw.png)
+![I/O bandwidth GCP benchmark graph](../_media/benchmark_fio_gcp_bw.png)
 
-On GCP, we can see that we exceed the maximum performance guarantees of the chosen disk type.
-There are two possible explanations for this. (1) There is some cloud caching in place we don't control. (2) The underlying provisioned disk size is larger than the requested one, which would yield higher performance boundaries.
+On GCP, the results exceed the maximum performance guarantees of the chosen disk type. There are two possible explanations for this. The first is that there may be cloud caching in place that isn't configurable. Alternatively, the underlying provisioned disk size may be larger than what was requested, resulting in higher performance boundaries.
 
-Comparing Constellation on GCP with GKE, Constellation has a similar bandwidth but ~10% less IOPS performance.
-Constellation on Azure has a similar IOPS performance compared to AKS, where both probably hit the maximum storage performance. Constellation has ~15% less read and write bandwidth.
+When comparing Constellation on GCP with GKE, Constellation has similar bandwidth but about 10% less IOPS performance. On Azure, Constellation has similar IOPS performance compared to AKS, where both likely hit the maximum storage performance. However, Constellation has approximately 15% less read and write bandwidth.
 
 ## Conclusion
 
-Despite providing substantial [security benefits](./security-benefits.md), Constellation overall only has a slight performance overhead over the managed Kubernetes offerings AKS and GKE. Constellation is on par in most benchmarks but is slightly slower in certain scenarios due to network and storage encryption.
+Despite the added [security benefits](./security-benefits.md) that Constellation provides, it only incurs a slight performance overhead when compared to managed Kubernetes offerings such as AKS and GKE. In most compute benchmarks, Constellation is on par, and while it may be slightly slower in certain I/O scenarios due to network and storage encryption, we're confident that we can reduce this overhead to single digits.
+
+For instance, storage encryption only adds between 10% and 15% overhead in terms of bandwidth and IOPS. Meanwhile, the biggest performance impact that Constellation currently faces is network encryption, which can incur up to 58% overhead on a 10 Gbps network. However, the Cilium team has conducted [benchmarks with Cilium using WireGuard encryption](https://docs.cilium.io/en/latest/operations/performance/benchmark/#encryption-wireguard-ipsec) on a 100 Gbps network that yielded over 15 Gbps, and we're confident that we can provide a similar level of performance with Constellation in our upcoming releases.
+
+Overall, Constellation strikes a great balance between security and performance, and we're continuously working to improve its performance capabilities while maintaining its high level of security.