terraform: aws node groups

This commit is contained in:
Malte Poll 2023-06-23 17:19:43 +02:00 committed by Malte Poll
parent 6dd8a571ec
commit 22ebdace43
12 changed files with 304 additions and 170 deletions

View File

@ -130,16 +130,27 @@ func (c *Creator) Create(ctx context.Context, opts CreateOptions) (clusterid.Fil
func (c *Creator) createAWS(ctx context.Context, cl terraformClient, opts CreateOptions) (idFile clusterid.File, retErr error) {
vars := terraform.AWSClusterVariables{
CommonVariables: terraform.CommonVariables{
Name: opts.Config.Name,
CountControlPlanes: opts.ControlPlaneCount,
CountWorkers: opts.WorkerCount,
StateDiskSizeGB: opts.Config.StateDiskSizeGB,
Name: opts.Config.Name,
NodeGroups: map[string]terraform.AWSNodeGroup{
"control_plane_default": {
Role: role.ControlPlane.TFString(),
StateDiskSizeGB: opts.Config.StateDiskSizeGB,
InitialCount: opts.ControlPlaneCount,
Zone: opts.Config.Provider.AWS.Zone,
InstanceType: opts.InsType,
DiskType: opts.Config.Provider.AWS.StateDiskType,
},
"worker_default": {
Role: role.Worker.TFString(),
StateDiskSizeGB: opts.Config.StateDiskSizeGB,
InitialCount: opts.WorkerCount,
Zone: opts.Config.Provider.AWS.Zone,
InstanceType: opts.InsType,
DiskType: opts.Config.Provider.AWS.StateDiskType,
},
},
StateDiskType: opts.Config.Provider.AWS.StateDiskType,
Region: opts.Config.Provider.AWS.Region,
Zone: opts.Config.Provider.AWS.Zone,
InstanceType: opts.InsType,
AMIImageID: opts.image,
IAMProfileControlPlane: opts.Config.Provider.AWS.IAMProfileControlPlane,
IAMProfileWorkerNodes: opts.Config.Provider.AWS.IAMProfileWorkerNodes,

View File

@ -214,21 +214,28 @@ func parseTerraformUpgradeVars(cmd *cobra.Command, conf *config.Config, fetcher
return nil, fmt.Errorf("fetching image reference: %w", err)
}
commonVariables := terraform.CommonVariables{
Name: conf.Name,
StateDiskSizeGB: conf.StateDiskSizeGB,
// Ignore node count as their values are only being respected for creation
// See here: https://developer.hashicorp.com/terraform/language/meta-arguments/lifecycle#ignore_changes
}
switch conf.GetProvider() {
case cloudprovider.AWS:
vars := &terraform.AWSClusterVariables{
CommonVariables: commonVariables,
StateDiskType: conf.Provider.AWS.StateDiskType,
Name: conf.Name,
NodeGroups: map[string]terraform.AWSNodeGroup{
"control_plane_default": {
Role: role.ControlPlane.TFString(),
StateDiskSizeGB: conf.StateDiskSizeGB,
Zone: conf.Provider.AWS.Zone,
InstanceType: conf.Provider.AWS.InstanceType,
DiskType: conf.Provider.AWS.StateDiskType,
},
"worker_default": {
Role: role.Worker.TFString(),
StateDiskSizeGB: conf.StateDiskSizeGB,
Zone: conf.Provider.AWS.Zone,
InstanceType: conf.Provider.AWS.InstanceType,
DiskType: conf.Provider.AWS.StateDiskType,
},
},
Region: conf.Provider.AWS.Region,
Zone: conf.Provider.AWS.Zone,
InstanceType: conf.Provider.AWS.InstanceType,
AMIImageID: imageRef,
IAMProfileControlPlane: conf.Provider.AWS.IAMProfileControlPlane,
IAMProfileWorkerNodes: conf.Provider.AWS.IAMProfileWorkerNodes,

View File

@ -27,6 +27,25 @@ locals {
ports_verify = "30081"
ports_recovery = "9999"
ports_debugd = "4000"
target_group_arns = {
control-plane : flatten([
module.load_balancer_target_bootstrapper.target_group_arn,
module.load_balancer_target_kubernetes.target_group_arn,
module.load_balancer_target_verify.target_group_arn,
module.load_balancer_target_recovery.target_group_arn,
module.load_balancer_target_konnectivity.target_group_arn,
var.debug ? [module.load_balancer_target_debugd[0].target_group_arn] : [],
])
worker : []
}
iam_instance_profile = {
control-plane : var.iam_instance_profile_control_plane
worker : var.iam_instance_profile_worker_nodes
}
# zones are all availability zones that are used by the node groups
zones = distinct(sort([
for node_group in var.node_groups : node_group.zone
]))
tags = { constellation-uid = local.uid }
}
@ -50,15 +69,17 @@ module "public_private_subnet" {
source = "./modules/public_private_subnet"
name = local.name
vpc_id = aws_vpc.vpc.id
cidr_vpc_subnet_nodes = "192.168.178.0/24"
cidr_vpc_subnet_internet = "192.168.0.0/24"
cidr_vpc_subnet_nodes = "192.168.176.0/20"
cidr_vpc_subnet_internet = "192.168.0.0/20"
zone = var.zone
zones = local.zones
tags = local.tags
}
resource "aws_eip" "lb" {
domain = "vpc"
tags = local.tags
for_each = toset(module.public_private_subnet.all_zones)
domain = "vpc"
tags = local.tags
}
resource "aws_lb" "front_end" {
@ -67,9 +88,12 @@ resource "aws_lb" "front_end" {
load_balancer_type = "network"
tags = local.tags
subnet_mapping {
subnet_id = module.public_private_subnet.public_subnet_id
allocation_id = aws_eip.lb.id
dynamic "subnet_mapping" {
for_each = toset(module.public_private_subnet.all_zones)
content {
subnet_id = module.public_private_subnet.public_subnet_id[subnet_mapping.key]
allocation_id = aws_eip.lb[subnet_mapping.key].id
}
}
enable_cross_zone_load_balancing = true
}
@ -216,59 +240,42 @@ module "load_balancer_target_konnectivity" {
healthcheck_protocol = "TCP"
}
module "instance_group_control_plane" {
source = "./modules/instance_group"
name = local.name
role = "control-plane"
uid = local.uid
instance_type = var.instance_type
instance_count = var.control_plane_count
image_id = var.ami
state_disk_type = var.state_disk_type
state_disk_size = var.state_disk_size
target_group_arns = flatten([
module.load_balancer_target_bootstrapper.target_group_arn,
module.load_balancer_target_kubernetes.target_group_arn,
module.load_balancer_target_verify.target_group_arn,
module.load_balancer_target_recovery.target_group_arn,
module.load_balancer_target_konnectivity.target_group_arn,
var.debug ? [module.load_balancer_target_debugd[0].target_group_arn] : [],
])
module "instance_group" {
source = "./modules/instance_group"
for_each = var.node_groups
base_name = local.name
node_group_name = each.key
role = each.value.role
zone = each.value.zone
uid = local.uid
instance_type = each.value.instance_type
instance_count = each.value.instance_count
image_id = var.ami
state_disk_type = each.value.disk_type
state_disk_size = each.value.disk_size
target_group_arns = local.target_group_arns[each.value.role]
security_groups = [aws_security_group.security_group.id]
subnetwork = module.public_private_subnet.private_subnet_id
iam_instance_profile = var.iam_instance_profile_control_plane
subnetwork = module.public_private_subnet.private_subnet_id[each.value.zone]
iam_instance_profile = local.iam_instance_profile[each.value.role]
enable_snp = var.enable_snp
tags = merge(
local.tags,
{ Name = local.name },
{ constellation-role = "control-plane" },
{ constellation-role = each.value.role },
{ constellation-uid = local.uid },
{ constellation-init-secret-hash = local.initSecretHash },
{ "kubernetes.io/cluster/${local.name}" = "owned" }
)
}
module "instance_group_worker_nodes" {
source = "./modules/instance_group"
name = local.name
role = "worker"
uid = local.uid
instance_type = var.instance_type
instance_count = var.worker_count
image_id = var.ami
state_disk_type = var.state_disk_type
state_disk_size = var.state_disk_size
subnetwork = module.public_private_subnet.private_subnet_id
target_group_arns = []
security_groups = [aws_security_group.security_group.id]
iam_instance_profile = var.iam_instance_profile_worker_nodes
enable_snp = var.enable_snp
tags = merge(
local.tags,
{ Name = local.name },
{ constellation-role = "worker" },
{ constellation-uid = local.uid },
{ constellation-init-secret-hash = local.initSecretHash },
{ "kubernetes.io/cluster/${local.name}" = "owned" }
)
// TODO(AB#3248): Remove this migration after we can assume that all existing clusters have been migrated.
moved {
from = module.instance_group_control_plane
to = module.instance_group["control_plane_default"]
}
// TODO(AB#3248): Remove this migration after we can assume that all existing clusters have been migrated.
moved {
from = module.instance_group_worker_nodes
to = module.instance_group["worker_default"]
}

View File

@ -4,13 +4,21 @@ terraform {
source = "hashicorp/aws"
version = "5.1.0"
}
random = {
source = "hashicorp/random"
version = "3.5.1"
}
}
}
locals {
name = "${var.name}-${lower(var.role)}"
group_uid = random_id.uid.hex
name = "${var.base_name}-${lower(var.role)}-${local.group_uid}"
}
resource "random_id" "uid" {
byte_length = 4
}
resource "aws_launch_template" "launch_template" {
name_prefix = local.name
@ -37,18 +45,23 @@ resource "aws_launch_template" "launch_template" {
}
}
# See: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template#cpu-options
cpu_options {
# use "enabled" to enable SEV-SNP
# use "disabled" to disable SEV-SNP (but still require SNP-capable hardware)
# use null to leave the setting unset (allows non-SNP-capable hardware to be used)
amd_sev_snp = var.enable_snp ? "enabled" : null
}
lifecycle {
create_before_destroy = true
ignore_changes = [
cpu_options, # required. we cannot change the CPU options of a launch template
name_prefix, # required. Allow legacy scale sets to keep their old names
default_version, # required. update procedure creates new versions of the launch template
image_id, # required. update procedure modifies the image id externally
]
}
# See: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template#cpu-options
cpu_options {
amd_sev_snp = var.enable_snp ? "enabled" : "disabled"
}
}
resource "aws_autoscaling_group" "autoscaling_group" {
@ -74,6 +87,7 @@ resource "aws_autoscaling_group" "autoscaling_group" {
lifecycle {
create_before_destroy = true
ignore_changes = [
name, # required. Allow legacy scale sets to keep their old names
launch_template.0.version, # required. update procedure creates new versions of the launch template
min_size, # required. autoscaling modifies the instance count externally
max_size, # required. autoscaling modifies the instance count externally

View File

@ -1,8 +1,13 @@
variable "name" {
variable "base_name" {
type = string
description = "Base name of the instance group."
}
variable "node_group_name" {
type = string
description = "Constellation name for the node group (used for configuration and CSP-independent naming)."
}
variable "role" {
type = string
description = "The role of the instance group."
@ -72,3 +77,8 @@ variable "enable_snp" {
default = true
description = "Enable AMD SEV SNP. Setting this to true sets the cpu-option AmdSevSnp to enable."
}
variable "zone" {
type = string
description = "Zone to deploy the instance group in."
}

View File

@ -7,23 +7,70 @@ terraform {
}
}
locals {
# az_number is a stable mapping of az suffix to a number used for calculating the subnet cidr
az_number = {
# we start counting at 2 to have the legacy subnet before the first newly created networks
# the legacy subnet did not start at a /20 boundary
# 0 => 192.168.176.0/24 (unused private subnet cidr)
# 1 => 192.168.177.0/24 (unused private subnet cidr)
legacy = 2 # => 192.168.178.0/24 (legacy private subnet)
a = 3 # => 192.168.179.0/24 (first newly created zonal private subnet)
b = 4
c = 5
d = 6
e = 7
f = 8
g = 9
h = 10
i = 11
j = 12
k = 13
l = 14
m = 15 # => 192.168.191.0/24 (last reserved zonal private subnet cidr). In reality, AWS doesn't have that many zones in a region.
}
}
data "aws_availability_zones" "available" {
state = "available"
}
data "aws_availability_zone" "all" {
for_each = toset(data.aws_availability_zones.available.names)
name = each.key
}
resource "aws_eip" "nat" {
domain = "vpc"
tags = var.tags
for_each = toset(var.zones)
domain = "vpc"
tags = var.tags
}
resource "aws_subnet" "private" {
for_each = data.aws_availability_zone.all
vpc_id = var.vpc_id
cidr_block = var.cidr_vpc_subnet_nodes
availability_zone = var.zone
cidr_block = cidrsubnet(var.cidr_vpc_subnet_nodes, 4, local.az_number[each.value.name_suffix])
availability_zone = each.key
tags = merge(var.tags, { Name = "${var.name}-subnet-nodes" })
lifecycle {
ignore_changes = [
cidr_block, # required. Legacy subnets used fixed cidr blocks for the single zone that don't match the new scheme.
]
}
}
resource "aws_subnet" "public" {
for_each = data.aws_availability_zone.all
vpc_id = var.vpc_id
cidr_block = var.cidr_vpc_subnet_internet
availability_zone = var.zone
cidr_block = cidrsubnet(var.cidr_vpc_subnet_internet, 4, local.az_number[each.value.name_suffix])
availability_zone = each.key
tags = merge(var.tags, { Name = "${var.name}-subnet-internet" })
lifecycle {
ignore_changes = [
cidr_block, # required. Legacy subnets used fixed cidr blocks for the single zone that don't match the new scheme.
]
}
}
resource "aws_internet_gateway" "gw" {
@ -32,24 +79,27 @@ resource "aws_internet_gateway" "gw" {
}
resource "aws_nat_gateway" "gw" {
subnet_id = aws_subnet.public.id
allocation_id = aws_eip.nat.id
for_each = toset(var.zones)
subnet_id = aws_subnet.public[each.key].id
allocation_id = aws_eip.nat[each.key].id
tags = merge(var.tags, { Name = "${var.name}-nat-gateway" })
}
resource "aws_route_table" "private_nat" {
vpc_id = var.vpc_id
tags = merge(var.tags, { Name = "${var.name}-private-nat" })
for_each = toset(var.zones)
vpc_id = var.vpc_id
tags = merge(var.tags, { Name = "${var.name}-private-nat" })
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.gw.id
nat_gateway_id = aws_nat_gateway.gw[each.key].id
}
}
resource "aws_route_table" "public_igw" {
vpc_id = var.vpc_id
tags = merge(var.tags, { Name = "${var.name}-public-igw" })
for_each = toset(var.zones)
vpc_id = var.vpc_id
tags = merge(var.tags, { Name = "${var.name}-public-igw" })
route {
cidr_block = "0.0.0.0/0"
@ -57,12 +107,14 @@ resource "aws_route_table" "public_igw" {
}
}
resource "aws_route_table_association" "private-nat" {
subnet_id = aws_subnet.private.id
route_table_id = aws_route_table.private_nat.id
resource "aws_route_table_association" "private_nat" {
for_each = toset(var.zones)
subnet_id = aws_subnet.private[each.key].id
route_table_id = aws_route_table.private_nat[each.key].id
}
resource "aws_route_table_association" "route_to_internet" {
subnet_id = aws_subnet.public.id
route_table_id = aws_route_table.public_igw.id
for_each = toset(var.zones)
subnet_id = aws_subnet.public[each.key].id
route_table_id = aws_route_table.public_igw[each.key].id
}

View File

@ -1,7 +1,19 @@
output "private_subnet_id" {
value = aws_subnet.private.id
value = {
for az in data.aws_availability_zone.all :
az.name => aws_subnet.private[az.name].id
}
}
output "public_subnet_id" {
value = aws_subnet.public.id
value = {
for az in data.aws_availability_zone.all :
az.name => aws_subnet.public[az.name].id
}
}
# all_zones is a list of all availability zones in the region
# it also contains zones that are not currently used by node groups (but might be in the future)
output "all_zones" {
value = distinct(sort([for az in data.aws_availability_zone.all : az.name]))
}

View File

@ -10,7 +10,12 @@ variable "vpc_id" {
variable "zone" {
type = string
description = "Availability zone."
description = "Main availability zone. Only used for legacy reasons."
}
variable "zones" {
type = list(string)
description = "Availability zones."
}
variable "cidr_vpc_subnet_nodes" {

View File

@ -1,5 +1,5 @@
output "ip" {
value = aws_eip.lb.public_ip
value = aws_eip.lb[var.zone].public_ip
}
output "uid" {

View File

@ -1,6 +1,5 @@
variable "name" {
type = string
default = "constell"
description = "Name of your Constellation"
validation {
condition = length(var.name) < 10
@ -8,6 +7,22 @@ variable "name" {
}
}
variable "node_groups" {
type = map(object({
role = string
instance_count = optional(number)
instance_type = string
disk_size = number
disk_type = string
zone = string
}))
description = "A map of node group names to node group configurations."
validation {
condition = can([for group in var.node_groups : group.role == "control-plane" || group.role == "worker"])
error_message = "The role has to be 'control-plane' or 'worker'."
}
}
variable "iam_instance_profile_worker_nodes" {
type = string
description = "Name of the IAM instance profile for worker nodes"
@ -18,33 +33,6 @@ variable "iam_instance_profile_control_plane" {
description = "Name of the IAM instance profile for control plane nodes"
}
variable "instance_type" {
type = string
description = "Instance type for worker nodes"
}
variable "state_disk_type" {
type = string
default = "gp2"
description = "EBS disk type for the state disk of the nodes"
}
variable "state_disk_size" {
type = number
default = 30
description = "Disk size for the state disk of the nodes [GB]"
}
variable "control_plane_count" {
type = number
description = "Number of control plane nodes"
}
variable "worker_count" {
type = number
description = "Number of worker nodes"
}
variable "ami" {
type = string
description = "AMI ID"

View File

@ -44,42 +44,46 @@ func (v *CommonVariables) String() string {
// AWSClusterVariables is user configuration for creating a cluster with Terraform on AWS.
type AWSClusterVariables struct {
// CommonVariables contains common variables.
CommonVariables
// Name of the cluster.
Name string `hcl:"name" cty:"name"`
// Region is the AWS region to use.
Region string
Region string `hcl:"region" cty:"region"`
// Zone is the AWS zone to use in the given region.
Zone string
Zone string `hcl:"zone" cty:"zone"`
// AMIImageID is the ID of the AMI image to use.
AMIImageID string
// InstanceType is the type of the EC2 instance to use.
InstanceType string
// StateDiskType is the EBS disk type to use for the state disk.
StateDiskType string
AMIImageID string `hcl:"ami" cty:"ami"`
// IAMGroupControlPlane is the IAM group to use for the control-plane nodes.
IAMProfileControlPlane string
IAMProfileControlPlane string `hcl:"iam_instance_profile_control_plane" cty:"iam_instance_profile_control_plane"`
// IAMGroupWorkerNodes is the IAM group to use for the worker nodes.
IAMProfileWorkerNodes string
IAMProfileWorkerNodes string `hcl:"iam_instance_profile_worker_nodes" cty:"iam_instance_profile_worker_nodes"`
// Debug is true if debug mode is enabled.
Debug bool
Debug bool `hcl:"debug" cty:"debug"`
// EnableSNP controls enablement of the EC2 cpu-option "AmdSevSnp".
EnableSNP bool
EnableSNP bool `hcl:"enable_snp" cty:"enable_snp"`
// NodeGroups is a map of node groups to create.
NodeGroups map[string]AWSNodeGroup `hcl:"node_groups" cty:"node_groups"`
}
// AWSNodeGroup is a node group to create on AWS.
type AWSNodeGroup struct {
// Role is the role of the node group.
Role string `hcl:"role" cty:"role"`
// StateDiskSizeGB is the size of the state disk to allocate to each node, in GB.
StateDiskSizeGB int `hcl:"disk_size" cty:"disk_size"`
// InitialCount is the initial number of nodes to create in the node group.
InitialCount int `hcl:"initial_count" cty:"initial_count"`
// Zone is the AWS availability-zone to use in the given region.
Zone string `hcl:"zone" cty:"zone"`
// InstanceType is the type of the EC2 instance to use.
InstanceType string `hcl:"instance_type" cty:"instance_type"`
// DiskType is the EBS disk type to use for the state disk.
DiskType string `hcl:"disk_type" cty:"disk_type"`
}
func (v *AWSClusterVariables) String() string {
b := &strings.Builder{}
b.WriteString(v.CommonVariables.String())
writeLinef(b, "region = %q", v.Region)
writeLinef(b, "zone = %q", v.Zone)
writeLinef(b, "ami = %q", v.AMIImageID)
writeLinef(b, "instance_type = %q", v.InstanceType)
writeLinef(b, "state_disk_type = %q", v.StateDiskType)
writeLinef(b, "iam_instance_profile_control_plane = %q", v.IAMProfileControlPlane)
writeLinef(b, "iam_instance_profile_worker_nodes = %q", v.IAMProfileWorkerNodes)
writeLinef(b, "debug = %t", v.Debug)
writeLinef(b, "enable_snp = %t", v.EnableSNP)
return b.String()
f := hclwrite.NewEmptyFile()
gohcl.EncodeIntoBody(v, f.Body())
return string(f.Bytes())
}
// AWSIAMVariables is user configuration for creating the IAM configuration with Terraform on Microsoft Azure.

View File

@ -16,17 +16,28 @@ import (
func TestAWSClusterVariables(t *testing.T) {
vars := AWSClusterVariables{
CommonVariables: CommonVariables{
Name: "cluster-name",
CountControlPlanes: 1,
CountWorkers: 2,
StateDiskSizeGB: 30,
Name: "cluster-name",
NodeGroups: map[string]AWSNodeGroup{
"control_plane_default": {
Role: role.ControlPlane.TFString(),
StateDiskSizeGB: 30,
InitialCount: 1,
Zone: "eu-central-1b",
InstanceType: "x1.foo",
DiskType: "foodisk",
},
"worker_default": {
Role: role.Worker.TFString(),
StateDiskSizeGB: 30,
InitialCount: 2,
Zone: "eu-central-1c",
InstanceType: "x1.bar",
DiskType: "bardisk",
},
},
Region: "eu-central-1",
Zone: "eu-central-1a",
AMIImageID: "ami-0123456789abcdef",
InstanceType: "x1.foo",
StateDiskType: "bardisk",
IAMProfileControlPlane: "arn:aws:iam::123456789012:instance-profile/cluster-name-controlplane",
IAMProfileWorkerNodes: "arn:aws:iam::123456789012:instance-profile/cluster-name-worker",
Debug: true,
@ -34,19 +45,32 @@ func TestAWSClusterVariables(t *testing.T) {
}
// test that the variables are correctly rendered
want := `name = "cluster-name"
control_plane_count = 1
worker_count = 2
state_disk_size = 30
region = "eu-central-1"
zone = "eu-central-1a"
ami = "ami-0123456789abcdef"
instance_type = "x1.foo"
state_disk_type = "bardisk"
want := `name = "cluster-name"
region = "eu-central-1"
zone = "eu-central-1a"
ami = "ami-0123456789abcdef"
iam_instance_profile_control_plane = "arn:aws:iam::123456789012:instance-profile/cluster-name-controlplane"
iam_instance_profile_worker_nodes = "arn:aws:iam::123456789012:instance-profile/cluster-name-worker"
debug = true
enable_snp = true
iam_instance_profile_worker_nodes = "arn:aws:iam::123456789012:instance-profile/cluster-name-worker"
debug = true
enable_snp = true
node_groups = {
control_plane_default = {
disk_size = 30
disk_type = "foodisk"
initial_count = 1
instance_type = "x1.foo"
role = "control-plane"
zone = "eu-central-1b"
}
worker_default = {
disk_size = 30
disk_type = "bardisk"
initial_count = 2
instance_type = "x1.bar"
role = "worker"
zone = "eu-central-1c"
}
}
`
got := vars.String()
assert.Equal(t, want, got)