constellation/operators/constellation-node-operator/internal/deploy/deploy.go

241 lines
9.6 KiB
Go
Raw Normal View History

/*
Copyright (c) Edgeless Systems GmbH
SPDX-License-Identifier: AGPL-3.0-only
*/
// Package deploy provides functions to deploy initial resources for the node operator.
package deploy
import (
"context"
"errors"
"fmt"
"strings"
mainconstants "github.com/edgelesssys/constellation/v2/internal/constants"
2023-01-04 13:04:28 -05:00
updatev1alpha1 "github.com/edgelesssys/constellation/v2/operators/constellation-node-operator/v2/api/v1alpha1"
operators: infrastructure autodiscovery (#1958) * helm: configure GCP cloud controller manager to search in all zones of a region See also: https://github.com/kubernetes/cloud-provider-gcp/blob/d716fdd45233c59b10fb76d5b72964cb8ec6d398/providers/gce/gce.go#L376-L380 * operators: add nodeGroupName to ScalingGroup CRD NodeGroupName is the human friendly name of the node group that will be exposed to customers via the Constellation config in the future. * operators: support simple executor / scheduler to reconcile on non-k8s resources * operators: add new return type for ListScalingGroups to support arbitrary node groups * operators: ListScalingGroups should return additionally created node groups on AWS * operators: ListScalingGroups should return additionally created node groups on Azure * operators: ListScalingGroups should return additionally created node groups on GCP * operators: ListScalingGroups should return additionally created node groups on unsupported CSPs * operators: implement external scaling group reconciler This controller scans the cloud provider infrastructure and changes k8s resources accordingly. It creates ScaleSet resources when new node groups are created and deletes them if the node groups are removed. * operators: no longer create scale sets when the operator starts In the future, scale sets are created dynamically. * operators: watch for node join/leave events using a controller * operators: deploy new controllers * docs: update auto scaling documentation with support for node groups
2023-07-05 01:27:34 -04:00
cspapi "github.com/edgelesssys/constellation/v2/operators/constellation-node-operator/v2/internal/cloud/api"
2023-01-04 13:04:28 -05:00
"github.com/edgelesssys/constellation/v2/operators/constellation-node-operator/v2/internal/constants"
corev1 "k8s.io/api/core/v1"
k8sErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)
// InitialResources creates the initial resources for the node operator.
func InitialResources(ctx context.Context, k8sClient client.Client, imageInfo imageInfoGetter, scalingGroupGetter scalingGroupGetter, uid string) error {
logr := log.FromContext(ctx)
if err := cleanupPlaceholders(ctx, k8sClient); err != nil {
return fmt.Errorf("cleaning up placeholder node version: %w", err)
}
logr.Info("cleaned up placeholders")
operators: infrastructure autodiscovery (#1958) * helm: configure GCP cloud controller manager to search in all zones of a region See also: https://github.com/kubernetes/cloud-provider-gcp/blob/d716fdd45233c59b10fb76d5b72964cb8ec6d398/providers/gce/gce.go#L376-L380 * operators: add nodeGroupName to ScalingGroup CRD NodeGroupName is the human friendly name of the node group that will be exposed to customers via the Constellation config in the future. * operators: support simple executor / scheduler to reconcile on non-k8s resources * operators: add new return type for ListScalingGroups to support arbitrary node groups * operators: ListScalingGroups should return additionally created node groups on AWS * operators: ListScalingGroups should return additionally created node groups on Azure * operators: ListScalingGroups should return additionally created node groups on GCP * operators: ListScalingGroups should return additionally created node groups on unsupported CSPs * operators: implement external scaling group reconciler This controller scans the cloud provider infrastructure and changes k8s resources accordingly. It creates ScaleSet resources when new node groups are created and deletes them if the node groups are removed. * operators: no longer create scale sets when the operator starts In the future, scale sets are created dynamically. * operators: watch for node join/leave events using a controller * operators: deploy new controllers * docs: update auto scaling documentation with support for node groups
2023-07-05 01:27:34 -04:00
scalingGroups, err := scalingGroupGetter.ListScalingGroups(ctx, uid)
if err != nil {
return fmt.Errorf("listing scaling groups: %w", err)
}
operators: infrastructure autodiscovery (#1958) * helm: configure GCP cloud controller manager to search in all zones of a region See also: https://github.com/kubernetes/cloud-provider-gcp/blob/d716fdd45233c59b10fb76d5b72964cb8ec6d398/providers/gce/gce.go#L376-L380 * operators: add nodeGroupName to ScalingGroup CRD NodeGroupName is the human friendly name of the node group that will be exposed to customers via the Constellation config in the future. * operators: support simple executor / scheduler to reconcile on non-k8s resources * operators: add new return type for ListScalingGroups to support arbitrary node groups * operators: ListScalingGroups should return additionally created node groups on AWS * operators: ListScalingGroups should return additionally created node groups on Azure * operators: ListScalingGroups should return additionally created node groups on GCP * operators: ListScalingGroups should return additionally created node groups on unsupported CSPs * operators: implement external scaling group reconciler This controller scans the cloud provider infrastructure and changes k8s resources accordingly. It creates ScaleSet resources when new node groups are created and deletes them if the node groups are removed. * operators: no longer create scale sets when the operator starts In the future, scale sets are created dynamically. * operators: watch for node join/leave events using a controller * operators: deploy new controllers * docs: update auto scaling documentation with support for node groups
2023-07-05 01:27:34 -04:00
if len(scalingGroups) == 0 {
return errors.New("determining initial node image: no scaling group found")
}
if err := createAutoscalingStrategy(ctx, k8sClient, scalingGroupGetter.AutoscalingCloudProvider()); err != nil {
return fmt.Errorf("creating initial autoscaling strategy: %w", err)
}
operators: infrastructure autodiscovery (#1958) * helm: configure GCP cloud controller manager to search in all zones of a region See also: https://github.com/kubernetes/cloud-provider-gcp/blob/d716fdd45233c59b10fb76d5b72964cb8ec6d398/providers/gce/gce.go#L376-L380 * operators: add nodeGroupName to ScalingGroup CRD NodeGroupName is the human friendly name of the node group that will be exposed to customers via the Constellation config in the future. * operators: support simple executor / scheduler to reconcile on non-k8s resources * operators: add new return type for ListScalingGroups to support arbitrary node groups * operators: ListScalingGroups should return additionally created node groups on AWS * operators: ListScalingGroups should return additionally created node groups on Azure * operators: ListScalingGroups should return additionally created node groups on GCP * operators: ListScalingGroups should return additionally created node groups on unsupported CSPs * operators: implement external scaling group reconciler This controller scans the cloud provider infrastructure and changes k8s resources accordingly. It creates ScaleSet resources when new node groups are created and deletes them if the node groups are removed. * operators: no longer create scale sets when the operator starts In the future, scale sets are created dynamically. * operators: watch for node join/leave events using a controller * operators: deploy new controllers * docs: update auto scaling documentation with support for node groups
2023-07-05 01:27:34 -04:00
imageReference, err := scalingGroupGetter.GetScalingGroupImage(ctx, scalingGroups[0].GroupID)
if err != nil {
return fmt.Errorf("determining initial node image: %w", err)
}
imageVersion, err := imageInfo.ImageVersion()
if err != nil {
// do not fail if the image version cannot be determined
// this is important for backwards compatibility
logr.Error(err, "determining initial node image version")
imageVersion = ""
}
if err := createNodeVersion(ctx, k8sClient, imageReference, imageVersion); err != nil {
return fmt.Errorf("creating initial node version %q: %w", imageReference, err)
}
return nil
}
// createAutoscalingStrategy creates the autoscaling strategy resource if it does not exist yet.
func createAutoscalingStrategy(ctx context.Context, k8sClient client.Writer, provider string) error {
err := k8sClient.Create(ctx, &updatev1alpha1.AutoscalingStrategy{
TypeMeta: metav1.TypeMeta{APIVersion: "update.edgeless.systems/v1alpha1", Kind: "AutoscalingStrategy"},
ObjectMeta: metav1.ObjectMeta{
Name: constants.AutoscalingStrategyResourceName,
},
Spec: updatev1alpha1.AutoscalingStrategySpec{
Enabled: true,
DeploymentName: "constellation-cluster-autoscaler",
DeploymentNamespace: "kube-system",
AutoscalerExtraArgs: map[string]string{
"cloud-provider": provider,
"logtostderr": "true",
"stderrthreshold": "info",
"v": "2",
"namespace": "kube-system",
},
},
})
if k8sErrors.IsAlreadyExists(err) {
return nil
}
return err
}
// createNodeVersion creates the initial nodeversion resource if it does not exist yet.
func createNodeVersion(ctx context.Context, k8sClient client.Client, imageReference, imageVersion string) error {
latestComponentCM, err := findLatestK8sComponentsConfigMap(ctx, k8sClient)
if err != nil {
return fmt.Errorf("finding latest k8s-components configmap: %w", err)
}
err = k8sClient.Create(ctx, &updatev1alpha1.NodeVersion{
TypeMeta: metav1.TypeMeta{APIVersion: "update.edgeless.systems/v1alpha1", Kind: "NodeVersion"},
ObjectMeta: metav1.ObjectMeta{
Name: mainconstants.NodeVersionResourceName,
},
Spec: updatev1alpha1.NodeVersionSpec{
ImageReference: imageReference,
ImageVersion: imageVersion,
KubernetesComponentsReference: latestComponentCM.Name,
KubernetesClusterVersion: latestComponentCM.Data[mainconstants.K8sVersionFieldName],
},
})
if k8sErrors.IsAlreadyExists(err) {
return nil
} else if err != nil {
return err
}
return nil
}
// cleanupPlaceholders deletes the existing resources from older operator versions if they are placeholders.
func cleanupPlaceholders(ctx context.Context, k8sClient client.Client) error {
if err := cleanupPlaceholderAutoscalingStrategy(ctx, k8sClient); err != nil {
return err
}
if err := cleanupPlaceholderScalingGroups(ctx, k8sClient); err != nil {
return err
}
return cleanupPlaceholderNodeVersion(ctx, k8sClient)
}
func cleanupPlaceholderAutoscalingStrategy(ctx context.Context, k8sClient client.Client) error {
logr := log.FromContext(ctx)
autoscalingStrategy := &updatev1alpha1.AutoscalingStrategy{}
err := k8sClient.Get(ctx, types.NamespacedName{Name: constants.AutoscalingStrategyResourceName}, autoscalingStrategy)
if k8sErrors.IsNotFound(err) {
logr.Info("no old autoscalingstrategy resource found - skipping cleanup", "name", constants.AutoscalingStrategyResourceName)
return nil
} else if err != nil {
logr.Info("cleaning up old autoscalingstrategy resource", "name", constants.AutoscalingStrategyResourceName, "error", err)
return err
}
if autoscalingStrategy.Spec.AutoscalerExtraArgs["cloud-provider"] != constants.PlaceholderImageName {
logr.Info("old autoscalingstrategy resource is not a placeholder - skipping cleanup", "name", constants.AutoscalingStrategyResourceName)
return nil
}
logr.Info("deleting old autoscalingstrategy resource", "name", constants.AutoscalingStrategyResourceName)
return k8sClient.Delete(ctx, autoscalingStrategy)
}
// cleanupPlaceholderScalingGroups deletes the existing scalinggroup resource from older operator versions if they are placeholders.
func cleanupPlaceholderScalingGroups(ctx context.Context, k8sClient client.Client) error {
logr := log.FromContext(ctx)
names := []string{constants.PlaceholderControlPlaneScalingGroupName, constants.PlaceholderWorkerScalingGroupName}
for _, name := range names {
scalingGroup := &updatev1alpha1.ScalingGroup{}
err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, scalingGroup)
if k8sErrors.IsNotFound(err) {
logr.Info("no old scalinggroup resource found - skipping cleanup", "name", name)
continue
} else if err != nil {
logr.Info("cleaning up old scalinggroup resource", "name", name, "error", err)
return err
}
if scalingGroup.Spec.AutoscalerGroupName != name || scalingGroup.Spec.GroupID != name {
logr.Info("real scalinggroup resource found - skipping cleanup", "name", name)
continue
}
logr.Info("cleaning up old scalinggroup resource")
if err := k8sClient.Delete(ctx, scalingGroup); err != nil {
logr.Info("cleaning up old scalinggroup resource", "name", name, "error", err)
return err
}
}
return nil
}
// cleanupPlaceholder deletes the existing nodeversion resource from older operator versions if it was a placeholder.
func cleanupPlaceholderNodeVersion(ctx context.Context, k8sClient client.Client) error {
logr := log.FromContext(ctx)
nodeVersion := &updatev1alpha1.NodeVersion{}
err := k8sClient.Get(ctx, types.NamespacedName{Name: mainconstants.NodeVersionResourceName}, nodeVersion)
if k8sErrors.IsNotFound(err) {
logr.Info("no old nodeversion resource found - skipping cleanup")
return nil
} else if err != nil {
logr.Info("cleaning up old nodeversion resource", "error", err)
return err
}
if nodeVersion.Spec.ImageReference != constants.PlaceholderImageName {
logr.Info("real nodeversion resource found - skipping cleanup")
return nil
}
logr.Info("cleaning up old nodeversion resource")
return k8sClient.Delete(ctx, nodeVersion)
}
// findLatestK8sComponentsConfigMap finds most recently created k8s-components configmap in the kube-system namespace.
// It returns an error if there is no or multiple configmaps matching the prefix "k8s-components".
func findLatestK8sComponentsConfigMap(ctx context.Context, k8sClient client.Client) (corev1.ConfigMap, error) {
var configMaps corev1.ConfigMapList
err := k8sClient.List(ctx, &configMaps, client.InNamespace("kube-system"))
if err != nil {
return corev1.ConfigMap{}, fmt.Errorf("listing configmaps: %w", err)
}
// collect all k8s-components configmaps
componentConfigMaps := []corev1.ConfigMap{}
for _, configMap := range configMaps.Items {
if strings.HasPrefix(configMap.Name, "k8s-components") {
componentConfigMaps = append(componentConfigMaps, configMap)
}
}
if len(componentConfigMaps) == 0 {
return corev1.ConfigMap{}, fmt.Errorf("no configmaps found")
}
// find latest configmap
var latestConfigMap corev1.ConfigMap
for _, cm := range componentConfigMaps {
if cm.CreationTimestamp.After(latestConfigMap.CreationTimestamp.Time) {
latestConfigMap = cm
}
}
return latestConfigMap, nil
}
type imageInfoGetter interface {
ImageVersion() (string, error)
}
type scalingGroupGetter interface {
// GetScalingGroupImage retrieves the image currently used by a scaling group.
GetScalingGroupImage(ctx context.Context, scalingGroupID string) (string, error)
// GetScalingGroupName retrieves the name of a scaling group.
GetScalingGroupName(scalingGroupID string) (string, error)
// GetScalingGroupName retrieves the name of a scaling group as needed by the cluster-autoscaler.
GetAutoscalingGroupName(scalingGroupID string) (string, error)
// ListScalingGroups retrieves a list of scaling groups for the cluster.
operators: infrastructure autodiscovery (#1958) * helm: configure GCP cloud controller manager to search in all zones of a region See also: https://github.com/kubernetes/cloud-provider-gcp/blob/d716fdd45233c59b10fb76d5b72964cb8ec6d398/providers/gce/gce.go#L376-L380 * operators: add nodeGroupName to ScalingGroup CRD NodeGroupName is the human friendly name of the node group that will be exposed to customers via the Constellation config in the future. * operators: support simple executor / scheduler to reconcile on non-k8s resources * operators: add new return type for ListScalingGroups to support arbitrary node groups * operators: ListScalingGroups should return additionally created node groups on AWS * operators: ListScalingGroups should return additionally created node groups on Azure * operators: ListScalingGroups should return additionally created node groups on GCP * operators: ListScalingGroups should return additionally created node groups on unsupported CSPs * operators: implement external scaling group reconciler This controller scans the cloud provider infrastructure and changes k8s resources accordingly. It creates ScaleSet resources when new node groups are created and deletes them if the node groups are removed. * operators: no longer create scale sets when the operator starts In the future, scale sets are created dynamically. * operators: watch for node join/leave events using a controller * operators: deploy new controllers * docs: update auto scaling documentation with support for node groups
2023-07-05 01:27:34 -04:00
ListScalingGroups(ctx context.Context, uid string) ([]cspapi.ScalingGroup, error)
// AutoscalingCloudProvider returns the cloud-provider name as used by k8s cluster-autoscaler.
AutoscalingCloudProvider() string
}