Skip to content

Commit

Permalink
Fix: E2E failures in CI
Browse files Browse the repository at this point in the history
Removed:
- Metrics and pod logs collection. Crust gather collects logs for all
  resources.

Fixed:
- MachineDeployment checks for running machines. MachineSets were picked
  at random, because they belong to the same MachineDeployment and are
  indistinguishable based on labels. This caused flakes: the check expected
  the old MachineSet to scale accordingly, while the new one performed the
  scaling instead.
- Increased ClusterClass apply timeouts. CAPD webhooks may take longer
  to stand up.

Signed-off-by: Danil-Grigorev <[email protected]>
  • Loading branch information
Danil-Grigorev committed Sep 13, 2024
1 parent 6445575 commit a547119
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 52 deletions.
113 changes: 105 additions & 8 deletions test/e2e/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ import (
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/cmd/clusterctl/client/config"
"sigs.k8s.io/cluster-api/test/framework"
"sigs.k8s.io/cluster-api/test/framework/clusterctl"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/yaml"
)

Expand All @@ -55,16 +57,11 @@ func Byf(format string, a ...interface{}) {
By(fmt.Sprintf(format, a...))
}

func setupSpecNamespace(ctx context.Context, specName string, clusterProxy framework.ClusterProxy, artifactFolder string) (*corev1.Namespace, context.CancelFunc) {
func setupSpecNamespace(ctx context.Context, specName string, clusterProxy framework.ClusterProxy, _ string) (*corev1.Namespace, context.CancelFunc) {
Byf("Creating a namespace for hosting the %q test spec", specName)
namespace, cancelWatches := framework.CreateNamespaceAndWatchEvents(ctx, framework.CreateNamespaceAndWatchEventsInput{
Creator: clusterProxy.GetClient(),
ClientSet: clusterProxy.GetClientSet(),
Name: fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
LogFolder: filepath.Join(artifactFolder, "clusters", clusterProxy.GetName()),
})

return namespace, cancelWatches
_, cancelWatches := context.WithCancel(ctx)
return framework.CreateNamespace(ctx, framework.CreateNamespaceInput{Creator: clusterProxy.GetClient(), Name: fmt.Sprintf("%s-%s", specName, util.RandomString(6))}, "40s", "10s"), cancelWatches
}

func cleanupInstallation(ctx context.Context, clusterctlLogFolder, clusterctlConfigPath string, proxy framework.ClusterProxy) func() {
Expand Down Expand Up @@ -191,3 +188,103 @@ func localLoadE2EConfig(configPath string) *clusterctl.E2EConfig {

return config
}

// UpgradeManagementCluster upgrades the providers of a management cluster using clusterctl.
//
// The input is validated first: input.ClusterProxy must be set,
// input.ClusterctlConfigPath must point to an existing file, and exactly one of
// input.Contract or a custom provider selection (core, bootstrap, control
// plane, infrastructure, IPAM, runtime extension or addon providers) must be
// given. input.LogFolder is created if it does not exist yet.
//
// The function returns as soon as clusterctl.Upgrade returns. It does not wait
// for the upgraded provider controllers to become available and it does not
// collect metrics; callers that need readiness have to wait for it themselves.
func UpgradeManagementCluster(ctx context.Context, input clusterctl.UpgradeManagementClusterAndWaitInput) {
	Expect(ctx).NotTo(BeNil(), "ctx is required for UpgradeManagementCluster")
	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling UpgradeManagementCluster")
	Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling UpgradeManagementCluster")

	// Check if the user wants a custom upgrade, i.e. explicitly lists at least one provider.
	isCustomUpgrade := input.CoreProvider != "" ||
		len(input.BootstrapProviders) > 0 ||
		len(input.ControlPlaneProviders) > 0 ||
		len(input.InfrastructureProviders) > 0 ||
		len(input.IPAMProviders) > 0 ||
		len(input.RuntimeExtensionProviders) > 0 ||
		len(input.AddonProviders) > 0

	// A contract-based upgrade and a custom upgrade are mutually exclusive, and one of them is required.
	hasContract := input.Contract != ""
	Expect(hasContract != isCustomUpgrade).To(BeTrue(), `Invalid argument. Either the input.Contract parameter or at least one of the following providers has to be set:
input.CoreProvider, input.BootstrapProviders, input.ControlPlaneProviders, input.InfrastructureProviders, input.IPAMProviders, input.RuntimeExtensionProviders, input.AddonProviders`)

	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. input.LogFolder can't be created for UpgradeManagementCluster")

	upgradeInput := clusterctl.UpgradeInput{
		ClusterctlConfigPath:      input.ClusterctlConfigPath,
		ClusterctlVariables:       input.ClusterctlVariables,
		ClusterName:               input.ClusterProxy.GetName(),
		KubeconfigPath:            input.ClusterProxy.GetKubeconfigPath(),
		Contract:                  input.Contract,
		CoreProvider:              input.CoreProvider,
		BootstrapProviders:        input.BootstrapProviders,
		ControlPlaneProviders:     input.ControlPlaneProviders,
		InfrastructureProviders:   input.InfrastructureProviders,
		IPAMProviders:             input.IPAMProviders,
		RuntimeExtensionProviders: input.RuntimeExtensionProviders,
		AddonProviders:            input.AddonProviders,
		LogFolder:                 input.LogFolder,
	}

	clusterctl.Upgrade(ctx, upgradeInput)

	// We have to skip collecting metrics, as it causes failures in CI.
}

// InitManagementCluster initializes a management cluster using clusterctl and
// blocks until every provider controller deployment is available.
//
// When unset, input.CoreProvider, input.BootstrapProviders and
// input.ControlPlaneProviders fall back to config.ClusterAPIProviderName,
// config.KubeadmBootstrapProviderName and config.KubeadmControlPlaneProviderName.
// clusterctl.Init is only invoked when no controller deployments are found in
// the cluster yet. The optional intervals are forwarded to
// framework.WaitForDeploymentsAvailable for each discovered deployment.
func InitManagementCluster(ctx context.Context, input clusterctl.InitManagementClusterAndWatchControllerLogsInput, intervals ...interface{}) {
	Expect(ctx).NotTo(BeNil(), "ctx is required for InitManagementCluster")
	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling InitManagementCluster")
	Expect(input.ClusterctlConfigPath).To(BeAnExistingFile(), "Invalid argument. input.ClusterctlConfigPath must be an existing file when calling InitManagementCluster")
	Expect(input.InfrastructureProviders).ToNot(BeEmpty(), "Invalid argument. input.InfrastructureProviders can't be empty when calling InitManagementCluster")
	Expect(os.MkdirAll(input.LogFolder, 0750)).To(Succeed(), "Invalid argument. input.LogFolder can't be created for InitManagementCluster")

	// Fill in the default providers for anything the caller left empty.
	if input.CoreProvider == "" {
		input.CoreProvider = config.ClusterAPIProviderName
	}
	if len(input.BootstrapProviders) == 0 {
		input.BootstrapProviders = []string{config.KubeadmBootstrapProviderName}
	}
	if len(input.ControlPlaneProviders) == 0 {
		input.ControlPlaneProviders = []string{config.KubeadmControlPlaneProviderName}
	}

	mgmtClient := input.ClusterProxy.GetClient()
	listInput := framework.GetControllerDeploymentsInput{Lister: mgmtClient}

	// Only run clusterctl init when no provider controllers are present yet.
	if installed := framework.GetControllerDeployments(ctx, listInput); len(installed) == 0 {
		clusterctl.Init(ctx, clusterctl.InitInput{
			// pass reference to the management cluster hosting this test
			KubeconfigPath: input.ClusterProxy.GetKubeconfigPath(),
			// pass the clusterctl config file that points to the local provider repository created for this test
			ClusterctlConfigPath: input.ClusterctlConfigPath,
			// setup the desired list of providers for a single-tenant management cluster
			CoreProvider:              input.CoreProvider,
			BootstrapProviders:        input.BootstrapProviders,
			ControlPlaneProviders:     input.ControlPlaneProviders,
			InfrastructureProviders:   input.InfrastructureProviders,
			IPAMProviders:             input.IPAMProviders,
			RuntimeExtensionProviders: input.RuntimeExtensionProviders,
			AddonProviders:            input.AddonProviders,
			// setup clusterctl logs folder
			LogFolder: input.LogFolder,
		})
	}

	log.FromContext(ctx).Info("Waiting for provider controllers to be running")

	// List again so that deployments created by the init above are picked up.
	deployments := framework.GetControllerDeployments(ctx, listInput)
	Expect(deployments).ToNot(BeEmpty(), "The list of controller deployments should not be empty")
	for i := range deployments {
		framework.WaitForDeploymentsAvailable(ctx, framework.WaitForDeploymentsAvailableInput{
			Getter:     mgmtClient,
			Deployment: deployments[i],
		}, intervals...)
	}
}
4 changes: 3 additions & 1 deletion test/e2e/e2e_clusterclass_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,9 @@ var _ = Describe("Workload cluster creation", func() {
}
})
Expect(err).ToNot(HaveOccurred())
Expect(bootstrapClusterProxy.Apply(ctx, []byte(clusterClassConfig))).To(Succeed(), "Failed to apply ClusterClass definition")
Eventually(func() error {
return bootstrapClusterProxy.Apply(ctx, []byte(clusterClassConfig))
}, e2eConfig.GetIntervals(specName, "wait-cluster")...).Should(Succeed(), "Failed to apply ClusterClass definition")

By("Create a Docker Cluster from topology")

Expand Down
6 changes: 4 additions & 2 deletions test/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ func setupBootstrapCluster(config *clusterctl.E2EConfig, scheme *runtime.Scheme,

// initBootstrapCluster initializes a bootstrap cluster with the latest minor version.
func initBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *clusterctl.E2EConfig, clusterctlConfig, artifactFolder string) {
clusterctl.InitManagementClusterAndWatchControllerLogs(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
InitManagementCluster(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
ClusterProxy: bootstrapClusterProxy,
ClusterctlConfigPath: clusterctlConfig,
InfrastructureProviders: config.InfrastructureProviders(),
Expand All @@ -245,13 +245,14 @@ func initBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *
BootstrapProviders: []string{"rke2-bootstrap"},
ControlPlaneProviders: []string{"rke2-control-plane"},
LogFolder: filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()),
DisableMetricsCollection: true,
}, config.GetIntervals(bootstrapClusterProxy.GetName(), "wait-controllers")...)
}

// initUpgradableBootstrapCluster initializes a bootstrap cluster with the latest minor version N-1, which is then used to perform an upgrade to the latest version.
// Make sure to update the version in the providers list to the latest minor version N-1.
func initUpgradableBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy, config *clusterctl.E2EConfig, clusterctlConfig, artifactFolder string) {
clusterctl.InitManagementClusterAndWatchControllerLogs(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
InitManagementCluster(context.TODO(), clusterctl.InitManagementClusterAndWatchControllerLogsInput{
ClusterProxy: bootstrapClusterProxy,
ClusterctlConfigPath: clusterctlConfig,
InfrastructureProviders: config.InfrastructureProviders(),
Expand All @@ -260,6 +261,7 @@ func initUpgradableBootstrapCluster(bootstrapClusterProxy framework.ClusterProxy
BootstrapProviders: []string{"rke2-bootstrap:v0.6.0"},
ControlPlaneProviders: []string{"rke2-control-plane:v0.6.0"},
LogFolder: filepath.Join(artifactFolder, "clusters", bootstrapClusterProxy.GetName()),
DisableMetricsCollection: true,
}, config.GetIntervals(bootstrapClusterProxy.GetName(), "wait-controllers")...)
}

Expand Down
2 changes: 1 addition & 1 deletion test/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ var _ = Describe("Workload cluster creation", func() {
}, result)

WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{
Lister: bootstrapClusterProxy.GetClient(),
Reader: bootstrapClusterProxy.GetClient(),
ControlPlane: result.ControlPlane,
MachineDeployments: result.MachineDeployments,
VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo),
Expand Down
6 changes: 3 additions & 3 deletions test/e2e/e2e_upgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,13 @@ var _ = Describe("Workload cluster creation", func() {
}, e2eConfig.GetIntervals(specName, "wait-control-plane")...)

By("Upgrading to latest boostrap/controlplane provider version")
clusterctl.UpgradeManagementClusterAndWait(ctx, clusterctl.UpgradeManagementClusterAndWaitInput{
UpgradeManagementCluster(ctx, clusterctl.UpgradeManagementClusterAndWaitInput{
ClusterProxy: bootstrapClusterProxy,
ClusterctlConfigPath: clusterctlConfigPath,
BootstrapProviders: []string{"rke2-bootstrap:v0.7.99"},
ControlPlaneProviders: []string{"rke2-control-plane:v0.7.99"},
LogFolder: clusterctlLogFolder,
}, e2eConfig.GetIntervals(specName, "wait-controllers")...)
})

WaitForControlPlaneToBeReady(ctx, WaitForControlPlaneToBeReadyInput{
Getter: bootstrapClusterProxy.GetClient(),
Expand Down Expand Up @@ -174,7 +174,7 @@ var _ = Describe("Workload cluster creation", func() {
}, result)

WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{
Lister: bootstrapClusterProxy.GetClient(),
Reader: bootstrapClusterProxy.GetClient(),
ControlPlane: result.ControlPlane,
MachineDeployments: result.MachineDeployments,
VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo),
Expand Down
76 changes: 39 additions & 37 deletions test/e2e/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"

Expand Down Expand Up @@ -138,14 +137,6 @@ func ApplyClusterTemplateAndWait(ctx context.Context, input ApplyClusterTemplate
})
Expect(workloadClusterTemplate).ToNot(BeNil(), "Failed to get the cluster template")

// Ensure we have a Cluster for dump and cleanup steps in AfterEach even if ApplyClusterTemplateAndWait fails.
result.Cluster = &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: input.ConfigCluster.ClusterName,
Namespace: input.ConfigCluster.Namespace,
},
}

ApplyCustomClusterTemplateAndWait(ctx, ApplyCustomClusterTemplateAndWaitInput{
ClusterProxy: input.ClusterProxy,
CustomTemplateYAML: workloadClusterTemplate,
Expand Down Expand Up @@ -174,19 +165,10 @@ func ApplyCustomClusterTemplateAndWait(ctx context.Context, input ApplyCustomClu

Byf("Creating the workload cluster with name %q from the provided yaml", input.ClusterName)

// Ensure we have a Cluster for dump and cleanup steps in AfterEach even if ApplyClusterTemplateAndWait fails.
result.Cluster = &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: input.ClusterName,
Namespace: input.Namespace,
},
}

Byf("Applying the cluster template yaml of cluster %s", klog.KRef(input.Namespace, input.ClusterName))
Eventually(func() error {
return input.ClusterProxy.Apply(ctx, input.CustomTemplateYAML, input.Args...)
// return input.ClusterProxy.CreateOrUpdate(ctx, input.CustomTemplateYAML, input.CreateOrUpdateOpts...)
}, 1*time.Minute).Should(Succeed(), "Failed to apply the cluster template")
}, input.WaitForClusterIntervals...).Should(Succeed(), "Failed to apply the cluster template")

// Once we applied the cluster template we can run PreWaitForCluster.
// Note: This can e.g. be used to verify the BeforeClusterCreate lifecycle hook is executed
Expand Down Expand Up @@ -218,7 +200,7 @@ func ApplyCustomClusterTemplateAndWait(ctx context.Context, input ApplyCustomClu
input.WaitForControlPlaneMachinesReady(ctx, input, result)

Byf("Waiting for the machine deployments of cluster %s to be provisioned", klog.KRef(input.Namespace, input.ClusterName))
result.MachineDeployments = framework.DiscoveryAndWaitForMachineDeployments(ctx, framework.DiscoveryAndWaitForMachineDeploymentsInput{
result.MachineDeployments = DiscoveryAndWaitForMachineDeployments(ctx, framework.DiscoveryAndWaitForMachineDeploymentsInput{
Lister: input.ClusterProxy.GetClient(),
Cluster: result.Cluster,
}, input.WaitForMachineDeployments...)
Expand Down Expand Up @@ -285,7 +267,7 @@ func DiscoveryAndWaitForRKE2ControlPlaneInitialized(ctx context.Context, input D
Namespace: input.Cluster.Namespace,
})
g.Expect(controlPlane).ToNot(BeNil())
}, "10s", "1s").Should(Succeed(), "Couldn't get the control plane for the cluster %s", klog.KObj(input.Cluster))
}, "2m", "1s").Should(Succeed(), "Couldn't get the control plane for the cluster %s", klog.KObj(input.Cluster))

return controlPlane
}
Expand Down Expand Up @@ -445,7 +427,7 @@ func WaitForMachineConditions(ctx context.Context, input WaitForMachineCondition

// WaitForClusterToUpgradeInput is the input for WaitForClusterToUpgrade.
type WaitForClusterToUpgradeInput struct {
Lister framework.Lister
Reader framework.GetLister
ControlPlane *controlplanev1.RKE2ControlPlane
MachineDeployments []*clusterv1.MachineDeployment
VersionAfterUpgrade string
Expand All @@ -455,32 +437,52 @@ type WaitForClusterToUpgradeInput struct {
func WaitForClusterToUpgrade(ctx context.Context, input WaitForClusterToUpgradeInput, intervals ...interface{}) {
By("Waiting for machines to update")

var totalMachineCount int32
totalMachineCount = *input.ControlPlane.Spec.Replicas
Eventually(func() error {
cp := input.ControlPlane.DeepCopy()
if err := input.Reader.Get(ctx, client.ObjectKeyFromObject(input.ControlPlane), cp); err != nil {
return fmt.Errorf("failed to get control plane: %w", err)
}

for _, md := range input.MachineDeployments {
totalMachineCount += *md.Spec.Replicas
}
updatedDeployments := []*clusterv1.MachineDeployment{}
for _, md := range input.MachineDeployments {
copy := &clusterv1.MachineDeployment{}
if err := input.Reader.Get(ctx, client.ObjectKeyFromObject(md), copy); client.IgnoreNotFound(err) != nil {
return fmt.Errorf("failed to get updated machine deployment: %w", err)
}

Eventually(func() (bool, error) {
machineList := &clusterv1.MachineList{}
if err := input.Lister.List(ctx, machineList); err != nil {
return false, fmt.Errorf("failed to list machines: %w", err)
updatedDeployments = append(updatedDeployments, copy)
}

if len(machineList.Items) != int(totalMachineCount) { // not all replicas are created
return false, nil
machineList := &clusterv1.MachineList{}
if err := input.Reader.List(ctx, machineList); err != nil {
return fmt.Errorf("failed to list machines: %w", err)
}

for _, machine := range machineList.Items {
expectedVersion := input.VersionAfterUpgrade + "+rke2r1"
if machine.Spec.Version != nil && *machine.Spec.Version != expectedVersion {
return false, nil
if machine.Spec.Version == nil || *machine.Spec.Version != expectedVersion {
return fmt.Errorf("Expected machine version to match %s, got %v", expectedVersion, machine.Spec.Version)
}
}

return true, nil
}, intervals...).Should(BeTrue(), framework.PrettyPrint(input.ControlPlane))
ready := cp.Status.ReadyReplicas == cp.Status.Replicas
if !ready {
return fmt.Errorf("Control plane is not ready: %d ready from %d", cp.Status.ReadyReplicas, cp.Status.Replicas)
}

expected := cp.Spec.Replicas != nil && *cp.Spec.Replicas == cp.Status.Replicas
if !expected {
return fmt.Errorf("Control plane is not scaled: %d replicas from %d", cp.Spec.Replicas, cp.Status.Replicas)
}

for _, md := range updatedDeployments {
if md.Spec.Replicas == nil || *md.Spec.Replicas != md.Status.ReadyReplicas {
return fmt.Errorf("Not all machine deployments are updated yet expected %v!=%d", md.Spec.Replicas, md.Status.ReadyReplicas)
}
}

return nil
}, intervals...).Should(Succeed())
}

// setDefaults sets the default values for ApplyCustomClusterTemplateAndWaitInput if not set.
Expand Down

0 comments on commit a547119

Please sign in to comment.