From 939f02d18702bd6314e451e4d578f9748faaeaa9 Mon Sep 17 00:00:00 2001 From: "Weber.Yang" Date: Mon, 22 Apr 2024 16:41:27 +0800 Subject: [PATCH] SKS-2345: Add support for system disk expansion (#169) --- api/v1beta1/conditions_consts.go | 23 + api/v1beta1/elfmachine_types.go | 47 ++ api/v1beta1/types.go | 5 + api/v1beta1/zz_generated.deepcopy.go | 16 + ...tructure.cluster.x-k8s.io_elfmachines.yaml | 7 + .../bases/kubesmart.smtx.io_hostconfigs.yaml | 153 +++++ .../kubesmart.smtx.io_hostoperationjobs.yaml | 153 +++++ config/default/webhookcainjection_patch.yaml | 28 +- config/rbac/role.yaml | 26 + config/webhook/kustomizeconfig.yaml | 14 +- config/webhook/manifests.yaml | 46 ++ controllers/elfmachine_controller.go | 31 + .../elfmachine_controller_resources.go | 216 +++++++ .../elfmachine_controller_resources_test.go | 362 ++++++++++++ controllers/elfmachine_controller_test.go | 103 ++++ controllers/elfmachinetemplate_controller.go | 542 ++++++++++++++++++ .../elfmachinetemplate_controller_test.go | 507 ++++++++++++++++ controllers/suite_test.go | 22 +- go.mod | 1 + go.sum | 2 + main.go | 22 +- pkg/context/machine_template_context.go | 39 ++ pkg/hostagent/service.go | 74 +++ .../tasks/expand_root_partition.yaml | 39 ++ pkg/hostagent/tasks/tasks.go | 23 + pkg/manager/manager.go | 2 + pkg/service/errors.go | 5 + pkg/service/mock_services/vm_mock.go | 45 ++ pkg/service/util.go | 36 ++ pkg/service/util_test.go | 17 + pkg/service/vm.go | 76 ++- pkg/util/annotations/helpers.go | 20 + pkg/util/machine/machine.go | 76 +++ pkg/util/machine/machine_test.go | 77 +++ pkg/util/machine/md.go | 22 + pkg/util/machine/md_test.go | 18 + pkg/util/md/md.go | 95 +++ pkg/util/md/md_test.go | 254 ++++++++ templates/cluster-template.yaml | 64 +++ .../kubesmart.smtx.io_hostconfigs.yaml | 153 +++++ .../kubesmart.smtx.io_hostoperationjobs.yaml | 153 +++++ test/fake/controller_manager_context.go | 2 + test/fake/tower.go | 23 + test/fake/types.go | 28 + test/helpers/envtest.go | 12 + webhooks/elfmachine_webhook_validation.go | 104 ++++ .../elfmachine_webhook_validation_test.go | 136 +++++ .../elfmachinetemplate_webhook_validation.go | 90 +++ ...machinetemplate_webhook_validation_test.go | 143 +++++ webhooks/util.go | 35 ++ 50 files changed, 4142 insertions(+), 45 deletions(-) create mode 100644 config/crd/bases/kubesmart.smtx.io_hostconfigs.yaml create mode 100644 config/crd/bases/kubesmart.smtx.io_hostoperationjobs.yaml create mode 100644 controllers/elfmachine_controller_resources.go create mode 100644 controllers/elfmachine_controller_resources_test.go create mode 100644 controllers/elfmachinetemplate_controller.go create mode 100644 controllers/elfmachinetemplate_controller_test.go create mode 100644 pkg/context/machine_template_context.go create mode 100644 pkg/hostagent/service.go create mode 100644 pkg/hostagent/tasks/expand_root_partition.yaml create mode 100644 pkg/hostagent/tasks/tasks.go create mode 100644 pkg/util/md/md.go create mode 100644 pkg/util/md/md_test.go create mode 100644 test/config/host-agent/kubesmart.smtx.io_hostconfigs.yaml create mode 100644 test/config/host-agent/kubesmart.smtx.io_hostoperationjobs.yaml create mode 100644 webhooks/elfmachine_webhook_validation.go create mode 100644 webhooks/elfmachine_webhook_validation_test.go create mode 100644 webhooks/elfmachinetemplate_webhook_validation.go create mode 100644 webhooks/elfmachinetemplate_webhook_validation_test.go create mode 100644 webhooks/util.go diff --git a/api/v1beta1/conditions_consts.go b/api/v1beta1/conditions_consts.go index 
c54f149b..2a52625c 100644 --- a/api/v1beta1/conditions_consts.go +++ b/api/v1beta1/conditions_consts.go @@ -121,6 +121,29 @@ const ( // WaitingForAvailableHostWithEnoughGPUsReason (Severity=Info) documents an ElfMachine // waiting for an available host with enough GPUs to create VM. WaitingForAvailableHostWithEnoughGPUsReason = "WaitingForAvailableHostWithEnoughGPUs" + + // ResourcesHotUpdatedCondition documents the status of the hot updating resources of a VM. + ResourcesHotUpdatedCondition = "ResourceHotUpdated" + + // WaitingForResourcesHotUpdateReason (Severity=Info) documents an ElfMachine waiting for updating resources. + WaitingForResourcesHotUpdateReason = "WaitingForResourcesHotUpdate" + + // ExpandingVMDiskReason documents (Severity=Info) ElfMachine currently executing the expand disk operation. + ExpandingVMDiskReason = "ExpandingVMDisk" + + // ExpandingVMDiskFailedReason (Severity=Warning) documents an ElfMachine controller detecting + // an error while expanding disk; those kind of errors are usually transient and failed updating + // are automatically re-tried by the controller. + ExpandingVMDiskFailedReason = "ExpandingVMDiskFailed" + + // ExpandingRootPartitionReason documents (Severity=Info) ElfMachine currently executing the + // adding new disk capacity to root directory operation. + ExpandingRootPartitionReason = "ExpandingRootPartition" + + // ExpandingRootPartitionFailedReason (Severity=Warning) documents an ElfMachine controller + // detecting an error while adding new disk capacity to root directory; those kind of errors are + // usually transient and failed updating are automatically re-tried by the controller. + ExpandingRootPartitionFailedReason = "ExpandingRootPartitionFailed" ) // Conditions and Reasons related to make connections to a Tower. Can currently be used by ElfCluster and ElfMachine diff --git a/api/v1beta1/elfmachine_types.go b/api/v1beta1/elfmachine_types.go index a7214fdf..0a25429e 100644 --- a/api/v1beta1/elfmachine_types.go +++ b/api/v1beta1/elfmachine_types.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" capierrors "sigs.k8s.io/cluster-api/errors" + "sigs.k8s.io/cluster-api/util/conditions" ) const ( @@ -37,6 +38,9 @@ const ( // VMDisconnectionTimestampAnnotation is the annotation identifying the VM of ElfMachine disconnection time. VMDisconnectionTimestampAnnotation = "cape.infrastructure.cluster.x-k8s.io/vm-disconnection-timestamp" + + // VMFirstBootTimestampAnnotation is the annotation identifying the VM of ElfMachine first power on time. + VMFirstBootTimestampAnnotation = "cape.infrastructure.cluster.x-k8s.io/vm-first-boot-timestamp" ) // ElfMachineSpec defines the desired state of ElfMachine. @@ -124,6 +128,10 @@ type ElfMachineStatus struct { // +optional GPUDevices []GPUStatus `json:"gpuDevices,omitempty"` + // Resources records the resources allocated for the machine. + // +optional + Resources ResourcesStatus `json:"resources,omitempty"` + // FailureReason will be set in the event that there is a terminal problem // reconciling the Machine and will contain a succinct value suitable // for machine interpretation. @@ -241,6 +249,16 @@ func (m *ElfMachine) IsFailed() bool { return m.Status.FailureReason != nil || m.Status.FailureMessage != nil } +// IsHotUpdating returns whether the machine is being hot updated. 
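+// A hot update is in progress while the ResourcesHotUpdatedCondition exists with status False;
+// reconcileVMResources only marks the condition True after the VM volume has been expanded and
+// the root partition has picked up the new capacity.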
+func (m *ElfMachine) IsHotUpdating() bool { + if conditions.Has(m, ResourcesHotUpdatedCondition) && + conditions.IsFalse(m, ResourcesHotUpdatedCondition) { + return true + } + + return false +} + func (m *ElfMachine) SetVMDisconnectionTimestamp(timestamp *metav1.Time) { if m.Annotations == nil { m.Annotations = make(map[string]string) @@ -318,6 +336,35 @@ func (m *ElfMachine) GetVMDisconnectionTimestamp() *metav1.Time { return nil } +func (m *ElfMachine) SetVMFirstBootTimestamp(timestamp *metav1.Time) { + annotations := m.GetAnnotations() + if annotations == nil { + annotations = map[string]string{} + } + m.Annotations[VMFirstBootTimestampAnnotation] = timestamp.Format(time.RFC3339) + m.SetAnnotations(annotations) +} + +func (m *ElfMachine) GetVMFirstBootTimestamp() *metav1.Time { + if m.Annotations == nil { + return nil + } + + if _, ok := m.Annotations[VMFirstBootTimestampAnnotation]; ok { + timestampAnnotation := m.Annotations[VMFirstBootTimestampAnnotation] + timestamp, err := time.Parse(time.RFC3339, timestampAnnotation) + if err != nil { + return nil + } + + firstBootTimestamp := metav1.NewTime(timestamp) + + return &firstBootTimestamp + } + + return nil +} + func (m *ElfMachine) RequiresGPUDevices() bool { return m.RequiresPassThroughGPUDevices() || m.RequiresVGPUDevices() } diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index e18cf5ab..6c5154cb 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -196,6 +196,11 @@ type GPUStatus struct { Name string `json:"name,omitempty"` } +// ResourcesStatus records the resources allocated to the virtual machine. +type ResourcesStatus struct { + Disk int32 `json:"disk,omitempty"` +} + //+kubebuilder:object:generate=false // PatchStringValue is for patching resources. diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index ff53d5d5..7e292e8d 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -247,6 +247,7 @@ func (in *ElfMachineStatus) DeepCopyInto(out *ElfMachineStatus) { *out = make([]GPUStatus, len(*in)) copy(*out, *in) } + out.Resources = in.Resources if in.FailureReason != nil { in, out := &in.FailureReason, &out.FailureReason *out = new(errors.MachineStatusError) @@ -483,6 +484,21 @@ func (in *NetworkStatus) DeepCopy() *NetworkStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourcesStatus) DeepCopyInto(out *ResourcesStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourcesStatus. +func (in *ResourcesStatus) DeepCopy() *ResourcesStatus { + if in == nil { + return nil + } + out := new(ResourcesStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Tower) DeepCopyInto(out *Tower) { *out = *in diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml index 83e45c2e..44368eba 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_elfmachines.yaml @@ -403,6 +403,13 @@ spec: ready: description: Ready is true when the provider resource is ready. type: boolean + resources: + description: Resources records the resources allocated for the machine. 
+ properties: + disk: + format: int32 + type: integer + type: object taskRef: description: TaskRef is a managed object reference to a Task related to the machine. This value is set automatically at runtime and should diff --git a/config/crd/bases/kubesmart.smtx.io_hostconfigs.yaml b/config/crd/bases/kubesmart.smtx.io_hostconfigs.yaml new file mode 100644 index 00000000..cceab7c5 --- /dev/null +++ b/config/crd/bases/kubesmart.smtx.io_hostconfigs.yaml @@ -0,0 +1,153 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.0 + name: hostconfigs.kubesmart.smtx.io +spec: + group: kubesmart.smtx.io + names: + categories: + - kubesmart + kind: HostConfig + listKind: HostConfigList + plural: hostconfigs + shortNames: + - hc + singular: hostconfig + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: the current phase of HostConfig + jsonPath: .status.phase + name: Phase + type: string + - description: the last execution time + jsonPath: .status.lastExecutionTime + name: LastExecutionTime + type: string + - description: Time duration since creation of HostConfig + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: HostConfig is the Schema for the HostConfig API. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + config: + properties: + ansible: + description: Ansible 通过 ansible playbook 完成配置 + properties: + localPlaybook: + description: LocalPlaybook 本地的 playbook,单个 yaml 文件, secret + 引用或者 yaml 字符串 + properties: + content: + description: Content is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + type: object + remotePlaybook: + description: RemotePlaybook 在远端的 playbook,单个 .tar.gz 压缩包,内容可以是单个 + yaml 文件,也可以符合 ansible 要求的目录 + properties: + md5sum: + description: MD5sum 压缩包的 MD5,填写了会进行校验,已经下载过的 playbook + 校验通过后跳过重复下载 + type: string + name: + description: Name 要执行的 playbook 文件名,相对于压缩包顶层的位置 + type: string + url: + description: URL playbook 在远端的地址,支持 https + type: string + required: + - name + - url + type: object + values: + description: Values 执行 playbook 的参数,yaml 格式,可以是 secret 引用或者 + yaml 字符串 + properties: + content: + description: Content is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. 
+ properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + type: object + type: object + timeout: + description: Timeout 执行一次配置的超时时间 + type: string + type: object + nodeName: + type: string + required: + - config + - nodeName + type: object + status: + properties: + failureMessage: + type: string + failureReason: + type: string + lastExecutionTime: + description: LastExecutionTime 最后执行的时间戳 + format: date-time + type: string + phase: + description: Phase 当前状态 + type: string + required: + - phase + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/kubesmart.smtx.io_hostoperationjobs.yaml b/config/crd/bases/kubesmart.smtx.io_hostoperationjobs.yaml new file mode 100644 index 00000000..b177299d --- /dev/null +++ b/config/crd/bases/kubesmart.smtx.io_hostoperationjobs.yaml @@ -0,0 +1,153 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.0 + name: hostoperationjobs.kubesmart.smtx.io +spec: + group: kubesmart.smtx.io + names: + categories: + - kubesmart + kind: HostOperationJob + listKind: HostOperationJobList + plural: hostoperationjobs + shortNames: + - hoj + singular: hostoperationjob + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: the current phase of HostOperationJob + jsonPath: .status.phase + name: Phase + type: string + - description: the last execution time + jsonPath: .status.lastExecutionTime + name: LastExecutionTime + type: string + - description: Time duration since creation of HostOperationJob + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: HostOperationJob is the Schema for the HostOperationJob API. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + nodeName: + type: string + operation: + properties: + ansible: + description: Ansible 通过 ansible playbook 完成操作 + properties: + localPlaybook: + description: LocalPlaybook 本地的 playbook,单个 yaml 文件, secret + 引用或者 yaml 字符串 + properties: + content: + description: Content is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. 
+ type: string + type: object + x-kubernetes-map-type: atomic + type: object + remotePlaybook: + description: RemotePlaybook 在远端的 playbook,单个 .tar.gz 压缩包,内容可以是单个 + yaml 文件,也可以符合 ansible 要求的目录 + properties: + md5sum: + description: MD5sum 压缩包的 MD5,填写了会进行校验,已经下载过的 playbook + 校验通过后跳过重复下载 + type: string + name: + description: Name 要执行的 playbook 文件名,相对于压缩包顶层的位置 + type: string + url: + description: URL playbook 在远端的地址,支持 https + type: string + required: + - name + - url + type: object + values: + description: Values 执行 playbook 的参数,yaml 格式,可以是 secret 引用或者 + yaml 字符串 + properties: + content: + description: Content is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + type: object + type: object + timeout: + description: Timeout 执行一次操作的超时时间 + type: string + type: object + required: + - nodeName + - operation + type: object + status: + properties: + failureMessage: + type: string + failureReason: + type: string + lastExecutionTime: + description: LastExecutionTime 最后执行的时间戳 + format: date-time + type: string + phase: + description: Phase 当前阶段 + type: string + required: + - phase + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/default/webhookcainjection_patch.yaml b/config/default/webhookcainjection_patch.yaml index e334e162..f5651701 100644 --- a/config/default/webhookcainjection_patch.yaml +++ b/config/default/webhookcainjection_patch.yaml @@ -13,17 +13,17 @@ metadata: name: mutating-webhook-configuration annotations: cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) -#! --- -#! apiVersion: admissionregistration.k8s.io/v1 -#! kind: ValidatingWebhookConfiguration -#! metadata: -#! labels: -#! app.kubernetes.io/name: validatingwebhookconfiguration -#! app.kubernetes.io/instance: validating-webhook-configuration -#! app.kubernetes.io/component: webhook -#! app.kubernetes.io/created-by: cluster-api-provider-elf -#! app.kubernetes.io/part-of: cluster-api-provider-elf -#! app.kubernetes.io/managed-by: kustomize -#! name: validating-webhook-configuration -#! annotations: -#! 
cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + labels: + app.kubernetes.io/name: validatingwebhookconfiguration + app.kubernetes.io/instance: validating-webhook-configuration + app.kubernetes.io/component: webhook + app.kubernetes.io/created-by: cluster-api-provider-elf + app.kubernetes.io/part-of: cluster-api-provider-elf + app.kubernetes.io/managed-by: kustomize + name: validating-webhook-configuration + annotations: + cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 30280427..f6a7881d 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -129,3 +129,29 @@ rules: - get - patch - update +- apiGroups: + - infrastructure.cluster.x-k8s.io + resources: + - elfmachinetemplates + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - infrastructure.cluster.x-k8s.io + resources: + - elfmachinetemplates/finalizers + verbs: + - update +- apiGroups: + - infrastructure.cluster.x-k8s.io + resources: + - elfmachinetemplates/status + verbs: + - get + - patch + - update diff --git a/config/webhook/kustomizeconfig.yaml b/config/webhook/kustomizeconfig.yaml index 310c4817..25e21e3c 100644 --- a/config/webhook/kustomizeconfig.yaml +++ b/config/webhook/kustomizeconfig.yaml @@ -7,19 +7,19 @@ nameReference: - kind: MutatingWebhookConfiguration group: admissionregistration.k8s.io path: webhooks/clientConfig/service/name - #! - kind: ValidatingWebhookConfiguration - #! group: admissionregistration.k8s.io - #! path: webhooks/clientConfig/service/name + - kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name namespace: - kind: MutatingWebhookConfiguration group: admissionregistration.k8s.io path: webhooks/clientConfig/service/namespace create: true -#! - kind: ValidatingWebhookConfiguration -#! group: admissionregistration.k8s.io -#! path: webhooks/clientConfig/service/namespace -#! 
create: true +- kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true varReference: - path: metadata/annotations diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 2985df2a..8baf195d 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -43,3 +43,49 @@ webhooks: resources: - elfmachinetemplates sideEffects: None +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: validating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /validate-infrastructure-cluster-x-k8s-io-v1beta1-elfmachine + failurePolicy: Fail + name: validation.elfmachine.infrastructure.x-k8s.io + rules: + - apiGroups: + - infrastructure.cluster.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - elfmachines + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /validate-infrastructure-cluster-x-k8s-io-v1beta1-elfmachinetemplate + failurePolicy: Fail + name: validation.elfmachinetemplate.infrastructure.x-k8s.io + rules: + - apiGroups: + - infrastructure.cluster.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - elfmachinetemplates + sideEffects: None diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go index af04029e..17834aae 100644 --- a/controllers/elfmachine_controller.go +++ b/controllers/elfmachine_controller.go @@ -208,6 +208,7 @@ func (r *ElfMachineReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (r conditions.SetSummary(machineCtx.ElfMachine, conditions.WithConditions( infrav1.VMProvisionedCondition, + infrav1.ResourcesHotUpdatedCondition, infrav1.TowerAvailableCondition, ), ) @@ -664,6 +665,10 @@ func (r *ElfMachineReconciler) reconcileVM(ctx goctx.Context, machineCtx *contex return vm, false, err } + if ok, err := r.reconcileVMResources(ctx, machineCtx, vm); err != nil || !ok { + return vm, false, err + } + return vm, true, nil } @@ -755,6 +760,17 @@ func (r *ElfMachineReconciler) reconcileVMStatus(ctx goctx.Context, machineCtx * return false, r.updateVM(ctx, machineCtx, vm) } + // Before the virtual machine is started for the first time, if the + // current disk capacity of the virtual machine is smaller than expected, + // expand the disk capacity first and then start it. cloud-init will + // add the new disk capacity to root. 
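+	// A nil first-boot timestamp means the VM has never been powered on (the annotation is only
+	// written once the first power-on task succeeds in reconcileVMTask), so this pre-boot expansion
+	// runs at most once; later resizes go through the hot-update flow in reconcileVMResources.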
+ if machineCtx.ElfMachine.GetVMFirstBootTimestamp() == nil && + !machineCtx.ElfMachine.IsHotUpdating() { + if ok, err := r.reconcieVMVolume(ctx, machineCtx, vm, infrav1.VMProvisionedCondition); err != nil || !ok { + return ok, err + } + } + return false, r.powerOnVM(ctx, machineCtx, vm) case models.VMStatusSUSPENDED: // In some abnormal conditions, the VM will be in a suspended state, @@ -952,6 +968,12 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx goctx.Context, machineCtx *co unlockGPUDevicesLockedByVM(machineCtx.ElfCluster.Spec.Cluster, machineCtx.ElfMachine.Name) } + if service.IsPowerOnVMTask(task) && + machineCtx.ElfMachine.GetVMFirstBootTimestamp() == nil { + now := metav1.Now() + machineCtx.ElfMachine.SetVMFirstBootTimestamp(&now) + } + if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) { releaseTicketForCreateVM(machineCtx.ElfMachine.Name) recordElfClusterStorageInsufficient(machineCtx, false) @@ -993,9 +1015,18 @@ func (r *ElfMachineReconciler) reconcileVMFailedTask(ctx goctx.Context, machineC case service.IsCloneVMTask(task): releaseTicketForCreateVM(machineCtx.ElfMachine.Name) + if service.IsVMDuplicateError(errorMessage) { + setVMDuplicate(machineCtx.ElfMachine.Name) + } + if machineCtx.ElfMachine.RequiresGPUDevices() { unlockGPUDevicesLockedByVM(machineCtx.ElfCluster.Spec.Cluster, machineCtx.ElfMachine.Name) } + case service.IsUpdateVMDiskTask(task, machineCtx.ElfMachine.Name): + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == infrav1.ExpandingVMDiskReason || reason == infrav1.ExpandingVMDiskFailedReason { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityInfo, errorMessage) + } case service.IsPowerOnVMTask(task) || service.IsUpdateVMTask(task) || service.IsVMColdMigrationTask(task): if machineCtx.ElfMachine.RequiresGPUDevices() { unlockGPUDevicesLockedByVM(machineCtx.ElfCluster.Spec.Cluster, machineCtx.ElfMachine.Name) diff --git a/controllers/elfmachine_controller_resources.go b/controllers/elfmachine_controller_resources.go new file mode 100644 index 00000000..1e890c43 --- /dev/null +++ b/controllers/elfmachine_controller_resources.go @@ -0,0 +1,216 @@ +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controllers + +import ( + goctx "context" + "fmt" + "time" + + "github.com/pkg/errors" + "github.com/smartxworks/cloudtower-go-sdk/v2/models" + agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + capiremote "sigs.k8s.io/cluster-api/controllers/remote" + "sigs.k8s.io/cluster-api/util/conditions" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/context" + "github.com/smartxworks/cluster-api-provider-elf/pkg/hostagent" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" + machineutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/machine" +) + +func (r *ElfMachineReconciler) reconcileVMResources(ctx goctx.Context, machineCtx *context.MachineContext, vm *models.VM) (bool, error) { + log := ctrl.LoggerFrom(ctx) + + hotUpdatedCondition := conditions.Get(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if hotUpdatedCondition != nil && + hotUpdatedCondition.Reason == infrav1.WaitingForResourcesHotUpdateReason && + hotUpdatedCondition.Message != "" { + log.Info("Waiting for hot updating resources", "message", hotUpdatedCondition.Message) + + return false, nil + } + + if ok, err := r.reconcieVMVolume(ctx, machineCtx, vm, infrav1.ResourcesHotUpdatedCondition); err != nil || !ok { + return ok, err + } + + // Agent needs to wait for the node exists before it can run and execute commands. + if machineutil.IsUpdatingElfMachineResources(machineCtx.ElfMachine) && + machineCtx.Machine.Status.NodeInfo == nil { + log.Info("Waiting for node exists for host agent expand vm root partition") + + return false, nil + } + + if ok, err := r.expandVMRootPartition(ctx, machineCtx); err != nil || !ok { + return ok, err + } + + if machineutil.IsUpdatingElfMachineResources(machineCtx.ElfMachine) { + conditions.MarkTrue(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + } + + return true, nil +} + +// reconcieVMVolume ensures that the vm disk size is as expected. +// +// The conditionType param: VMProvisionedCondition/ResourcesHotUpdatedCondition. +func (r *ElfMachineReconciler) reconcieVMVolume(ctx goctx.Context, machineCtx *context.MachineContext, vm *models.VM, conditionType clusterv1.ConditionType) (bool, error) { + // If the capacity is 0, it means that the disk size has not changed and returns directly. 
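+	// Otherwise the system disk is located from the VM's disk list and the size of its backing
+	// volume is compared with Spec.DiskGiB; the volume is only ever expanded, never shrunk, so a
+	// volume that is already larger than requested is logged and left unchanged.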
+ if machineCtx.ElfMachine.Spec.DiskGiB == 0 { + return true, nil + } + + log := ctrl.LoggerFrom(ctx) + + vmDiskIDs := make([]string, len(vm.VMDisks)) + for i := 0; i < len(vm.VMDisks); i++ { + vmDiskIDs[i] = *vm.VMDisks[i].ID + } + + vmDisks, err := machineCtx.VMService.GetVMDisks(vmDiskIDs) + if err != nil { + return false, errors.Wrapf(err, "failed to get disks for vm %s/%s", *vm.ID, *vm.Name) + } else if len(vmDisks) == 0 { + return false, errors.Errorf("no disks found for vm %s/%s", *vm.ID, *vm.Name) + } + systemDisk := service.GetVMSystemDisk(vmDisks) + + vmVolume, err := machineCtx.VMService.GetVMVolume(*systemDisk.VMVolume.ID) + if err != nil { + return false, err + } + + diskSize := service.ByteToGiB(*vmVolume.Size) + machineCtx.ElfMachine.Status.Resources.Disk = diskSize + + if machineCtx.ElfMachine.Spec.DiskGiB > diskSize { + return false, r.resizeVMVolume(ctx, machineCtx, vmVolume, *service.TowerDisk(machineCtx.ElfMachine.Spec.DiskGiB), conditionType) + } else if machineCtx.ElfMachine.Spec.DiskGiB < diskSize { + log.V(3).Info(fmt.Sprintf("Current disk capacity is larger than expected, skipping expand vm volume %s/%s", *vmVolume.ID, *vmVolume.Name), "currentSize", diskSize, "expectedSize", machineCtx.ElfMachine.Spec.DiskGiB) + } + + return true, nil +} + +// resizeVMVolume sets the volume to the specified size. +func (r *ElfMachineReconciler) resizeVMVolume(ctx goctx.Context, machineCtx *context.MachineContext, vmVolume *models.VMVolume, diskSize int64, conditionType clusterv1.ConditionType) error { + log := ctrl.LoggerFrom(ctx) + + reason := conditions.GetReason(machineCtx.ElfMachine, conditionType) + if reason == "" || + (reason != infrav1.ExpandingVMDiskReason && reason != infrav1.ExpandingVMDiskFailedReason) { + conditions.MarkFalse(machineCtx.ElfMachine, conditionType, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + + // Save the conditionType first, and then expand the disk capacity. + // This prevents the disk expansion from succeeding but failing to save the + // conditionType, causing ElfMachine to not record the conditionType. + return nil + } + + if service.IsTowerResourcePerformingAnOperation(vmVolume.EntityAsyncStatus) { + log.Info("Waiting for vm volume task done", "volume", fmt.Sprintf("%s/%s", *vmVolume.ID, *vmVolume.Name)) + + return nil + } + + withTaskVMVolume, err := machineCtx.VMService.ResizeVMVolume(*vmVolume.ID, diskSize) + if err != nil { + conditions.MarkFalse(machineCtx.ElfMachine, conditionType, infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityWarning, err.Error()) + + return errors.Wrapf(err, "failed to trigger expand size from %d to %d for vm volume %s/%s", *vmVolume.Size, diskSize, *vmVolume.ID, *vmVolume.Name) + } + + machineCtx.ElfMachine.SetTask(*withTaskVMVolume.TaskID) + + log.Info(fmt.Sprintf("Waiting for the vm volume %s/%s to be expanded", *vmVolume.ID, *vmVolume.Name), "taskRef", machineCtx.ElfMachine.Status.TaskRef, "oldSize", *vmVolume.Size, "newSize", diskSize) + + return nil +} + +// expandVMRootPartition adds new disk capacity to root partition. 
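+// The expansion is delegated to the host-config agent: a HostOperationJob running the
+// expand_root_partition Ansible task is created in the workload cluster, and a failed job is
+// deleted and recreated three minutes after its last execution so the operation is retried.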
+func (r *ElfMachineReconciler) expandVMRootPartition(ctx goctx.Context, machineCtx *context.MachineContext) (bool, error) { + log := ctrl.LoggerFrom(ctx) + + reason := conditions.GetReason(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition) + if reason == "" { + return true, nil + } else if reason != infrav1.ExpandingVMDiskReason && + reason != infrav1.ExpandingVMDiskFailedReason && + reason != infrav1.ExpandingRootPartitionReason && + reason != infrav1.ExpandingRootPartitionFailedReason { + return true, nil + } + + if reason != infrav1.ExpandingRootPartitionFailedReason { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionReason, clusterv1.ConditionSeverityInfo, "") + } + + kubeClient, err := capiremote.NewClusterClient(ctx, "", r.Client, client.ObjectKey{Namespace: machineCtx.Cluster.Namespace, Name: machineCtx.Cluster.Name}) + if err != nil { + return false, err + } + + agentJob, err := hostagent.GetHostJob(ctx, kubeClient, machineCtx.ElfMachine.Namespace, hostagent.GetExpandRootPartitionJobName(machineCtx.ElfMachine)) + if err != nil && !apierrors.IsNotFound(err) { + return false, err + } + + if agentJob == nil { + agentJob, err = hostagent.ExpandRootPartition(ctx, kubeClient, machineCtx.ElfMachine) + if err != nil { + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionFailedReason, clusterv1.ConditionSeverityInfo, err.Error()) + + return false, err + } + + log.Info("Waiting for expanding root partition", "hostAgentJob", agentJob.Name) + + return false, nil + } + + switch agentJob.Status.Phase { + case agentv1.PhaseSucceeded: + log.Info("Expand root partition to root succeeded", "hostAgentJob", agentJob.Name) + case agentv1.PhaseFailed: + conditions.MarkFalse(machineCtx.ElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingRootPartitionFailedReason, clusterv1.ConditionSeverityWarning, agentJob.Status.FailureMessage) + log.Info("Expand root partition failed, will try again after three minutes", "hostAgentJob", agentJob.Name, "failureMessage", agentJob.Status.FailureMessage) + + lastExecutionTime := agentJob.Status.LastExecutionTime + if lastExecutionTime == nil { + lastExecutionTime = &agentJob.CreationTimestamp + } + // Three minutes after the job fails, delete the job and try again. + if time.Now().After(lastExecutionTime.Add(3 * time.Minute)) { + if err := kubeClient.Delete(ctx, agentJob); err != nil { + return false, errors.Wrapf(err, "failed to delete expand root partition job %s/%s for retry", agentJob.Namespace, agentJob.Name) + } + } + + return false, nil + default: + log.Info("Waiting for expanding root partition job done", "hostAgentJob", agentJob.Name, "jobStatus", agentJob.Status.Phase) + + return false, nil + } + + return true, nil +} diff --git a/controllers/elfmachine_controller_resources_test.go b/controllers/elfmachine_controller_resources_test.go new file mode 100644 index 00000000..3d6adf9a --- /dev/null +++ b/controllers/elfmachine_controller_resources_test.go @@ -0,0 +1,362 @@ +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "bytes" + goctx "context" + "time" + + "github.com/go-logr/logr" + "github.com/golang/mock/gomock" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/smartxworks/cloudtower-go-sdk/v2/models" + agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/controller-runtime/pkg/client" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/hostagent" + "github.com/smartxworks/cluster-api-provider-elf/pkg/hostagent/tasks" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service/mock_services" + "github.com/smartxworks/cluster-api-provider-elf/test/fake" + "github.com/smartxworks/cluster-api-provider-elf/test/helpers" +) + +var _ = Describe("ElfMachineReconciler", func() { + var ( + elfCluster *infrav1.ElfCluster + cluster *clusterv1.Cluster + elfMachine *infrav1.ElfMachine + machine *clusterv1.Machine + secret *corev1.Secret + kubeConfigSecret *corev1.Secret + logBuffer *bytes.Buffer + mockCtrl *gomock.Controller + mockVMService *mock_services.MockVMService + mockNewVMService service.NewVMServiceFunc + ) + + BeforeEach(func() { + logBuffer = new(bytes.Buffer) + klog.SetOutput(logBuffer) + + elfCluster, cluster, elfMachine, machine, secret = fake.NewClusterAndMachineObjects() + + // mock + mockCtrl = gomock.NewController(GinkgoT()) + mockVMService = mock_services.NewMockVMService(mockCtrl) + mockNewVMService = func(_ goctx.Context, _ infrav1.Tower, _ logr.Logger) (service.VMService, error) { + return mockVMService, nil + } + }) + + AfterEach(func() { + mockCtrl.Finish() + }) + + Context("reconcileVMResources", func() { + It("should reconcile when WaitingForResourcesHotUpdateReason is not empty", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for hot updating resources")) + }) + + It("should wait for node exists", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := 
fake.NewVMDisk(vmVolume) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for node exists for host agent expand vm root partition")) + }) + + It("should mark ResourcesHotUpdatedCondition to true", func() { + agentJob := newExpandRootPartitionJob(elfMachine) + Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseSucceeded + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + kubeConfigSecret, err := helpers.NewKubeConfigSecret(testEnv, cluster.Namespace, cluster.Name) + Expect(err).ShouldNot(HaveOccurred()) + machine.Status.NodeInfo = &corev1.NodeSystemInfo{} + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMResources(ctx, machineContext, vm) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{conditionType: infrav1.ResourcesHotUpdatedCondition, status: corev1.ConditionTrue}}) + }) + }) + + Context("reconcieVMVolume", func() { + It("should not reconcile when disk size is 0", func() { + elfMachine.Spec.DiskGiB = 0 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcieVMVolume(ctx, machineContext, vm, infrav1.VMProvisionedCondition) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should not expand the disk when size is up to date", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, 
secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcieVMVolume(ctx, machineContext, vm, infrav1.VMProvisionedCondition) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should expand the disk when size is not up to date", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + elfMachine.Spec.DiskGiB = 20 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + vmVolume.Size = service.TowerDisk(10) + vmDisk := fake.NewVMDisk(vmVolume) + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) + task := fake.NewTowerTask() + withTaskVMVolume := fake.NewWithTaskVMVolume(vmVolume, task) + mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, *service.TowerDisk(20)).Return(withTaskVMVolume, nil) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcieVMVolume(ctx, machineContext, vm, infrav1.ResourcesHotUpdatedCondition) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for the vm volume")) + Expect(elfMachine.Status.TaskRef).To(Equal(*withTaskVMVolume.TaskID)) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}}) + }) + }) + + Context("resizeVMVolume", func() { + It("should save the conditionType first", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + err := reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.VMProvisionedCondition) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}}) + + vmVolume.EntityAsyncStatus = models.NewEntityAsyncStatus(models.EntityAsyncStatusUPDATING) + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, 
infrav1.ExpandingVMDiskFailedReason, clusterv1.ConditionSeverityWarning, "") + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext = newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + err = reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for vm volume task done")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}}) + }) + + It("should wait task done", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + vmVolume := fake.NewVMVolume(elfMachine) + mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(nil, unexpectedError) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + err := reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to trigger expand size from")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingVMDiskFailedReason}}) + + task := fake.NewTowerTask() + withTaskVMVolume := fake.NewWithTaskVMVolume(vmVolume, task) + mockVMService.EXPECT().ResizeVMVolume(*vmVolume.ID, int64(10)).Return(withTaskVMVolume, nil) + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext = newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler = &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + err = reconciler.resizeVMVolume(ctx, machineContext, vmVolume, 10, infrav1.ResourcesHotUpdatedCondition) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for the vm volume")) + Expect(elfMachine.Status.TaskRef).To(Equal(*withTaskVMVolume.TaskID)) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskReason}}) + }) + }) + + Context("expandVMRootPartition", func() { + BeforeEach(func() { + var err error + kubeConfigSecret, err = helpers.NewKubeConfigSecret(testEnv, cluster.Namespace, cluster.Name) + Expect(err).ShouldNot(HaveOccurred()) + }) + + It("should not expand root partition without ResourcesHotUpdatedCondition", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + 
fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{}) + }) + + It("should create agent job to expand root partition", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for expanding root partition")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) + var agentJob *agentv1.HostOperationJob + Eventually(func() error { + var err error + agentJob, err = hostagent.GetHostJob(ctx, testEnv.Client, elfMachine.Namespace, hostagent.GetExpandRootPartitionJobName(elfMachine)) + return err + }, timeout).Should(BeNil()) + Expect(agentJob.Name).To(Equal(hostagent.GetExpandRootPartitionJobName(elfMachine))) + }) + + It("should retry when job failed", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + agentJob := newExpandRootPartitionJob(elfMachine) + Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for expanding root partition job done")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) + + logBuffer.Reset() + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseFailed + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ok, err = reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Expand root partition failed, will try again")) + 
expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityWarning, infrav1.ExpandingRootPartitionFailedReason}}) + + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource = agentJob.DeepCopy() + agentJob.Status.LastExecutionTime = &metav1.Time{Time: time.Now().Add(-3 * time.Minute).UTC()} + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ok, err = reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() bool { + err := testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue()) + }) + + It("should record job succeeded", func() { + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + agentJob := newExpandRootPartitionJob(elfMachine) + Expect(testEnv.CreateAndWait(ctx, agentJob)).NotTo(HaveOccurred()) + Expect(testEnv.Get(ctx, client.ObjectKey{Namespace: agentJob.Namespace, Name: agentJob.Name}, agentJob)).NotTo(HaveOccurred()) + agentJobPatchSource := agentJob.DeepCopy() + agentJob.Status.Phase = agentv1.PhaseSucceeded + Expect(testEnv.PatchAndWait(ctx, agentJob, agentJobPatchSource)).To(Succeed()) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kubeConfigSecret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.expandVMRootPartition(ctx, machineContext) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Expand root partition to root succeeded")) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingRootPartitionReason}}) + }) + }) +}) + +func newExpandRootPartitionJob(elfMachine *infrav1.ElfMachine) *agentv1.HostOperationJob { + return &agentv1.HostOperationJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: hostagent.GetExpandRootPartitionJobName(elfMachine), + Namespace: "default", + }, + Spec: agentv1.HostOperationJobSpec{ + NodeName: elfMachine.Name, + Operation: agentv1.Operation{ + Ansible: &agentv1.Ansible{ + LocalPlaybookText: &agentv1.YAMLText{ + Inline: tasks.ExpandRootPartitionTask, + }, + }, + }, + }, + } +} diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index 1332cba1..da75da69 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -456,6 +456,11 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().UpsertLabel(gomock.Any(), gomock.Any()).Times(3).Return(fake.NewTowerLabel(), nil) mockVMService.EXPECT().AddLabelsToVM(gomock.Any(), gomock.Any()).Times(1) mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + 
mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} elfMachineKey := capiutil.ObjectKey(elfMachine) @@ -590,6 +595,8 @@ var _ = Describe("ElfMachineReconciler", func() { elfMachine.Status.TaskRef = *task1.ID placementGroup := fake.NewVMPlacementGroup([]string{*vm.ID}) ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer) + now := metav1.Now() + elfMachine.SetVMFirstBootTimestamp(&now) ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) @@ -611,6 +618,37 @@ var _ = Describe("ElfMachineReconciler", func() { expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.PoweringOnReason}}) }) + It("should expand the disk before starting the virtual machine for the first time", func() { + vm := fake.NewTowerVMFromElfMachine(elfMachine) + vm.EntityAsyncStatus = nil + status := models.VMStatusSTOPPED + vm.Status = &status + task1 := fake.NewTowerTask() + taskStatus := models.TaskStatusSUCCESSED + task1.Status = &taskStatus + elfMachine.Status.VMRef = *vm.ID + elfMachine.Status.TaskRef = *task1.ID + placementGroup := fake.NewVMPlacementGroup([]string{*vm.ID}) + ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + + mockVMService.EXPECT().Get(elfMachine.Status.VMRef).Return(vm, nil) + mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) + mockVMService.EXPECT().GetTask(elfMachine.Status.TaskRef).Return(task1, nil) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return(nil, unexpectedError) + + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + elfMachineKey := capiutil.ObjectKey(elfMachine) + result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(result.RequeueAfter).To(BeZero()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get disks for vm")) + }) + It("should wait for the ELF virtual machine to be created", func() { vm := fake.NewTowerVM() placeholderID := fmt.Sprintf("placeholder-%s", *vm.LocalID) @@ -653,6 +691,8 @@ var _ = Describe("ElfMachineReconciler", func() { elfMachine.Status.TaskRef = *task1.ID placementGroup := fake.NewVMPlacementGroup([]string{*vm.ID}) ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer) + now := metav1.Now() + elfMachine.SetVMFirstBootTimestamp(&now) ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) @@ -686,6 +726,8 @@ var _ = Describe("ElfMachineReconciler", func() { elfMachine.Status.TaskRef = *task1.ID placementGroup := fake.NewVMPlacementGroup([]string{*vm.ID}) ctrlutil.AddFinalizer(elfMachine, infrav1.MachineFinalizer) + now := metav1.Now() + elfMachine.SetVMFirstBootTimestamp(&now) 
ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) @@ -810,6 +852,8 @@ var _ = Describe("ElfMachineReconciler", func() { vm := fake.NewTowerVMFromElfMachine(elfMachine) vm.Status = models.NewVMStatus(models.VMStatusSTOPPED) task := fake.NewTowerTask() + now := metav1.Now() + elfMachine.SetVMFirstBootTimestamp(&now) ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) @@ -1813,6 +1857,11 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().GetVMNics(*vm.ID).Return(nil, nil) mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Times(2).Return(placementGroup, nil) mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} elfMachineKey := capiutil.ObjectKey(elfMachine) @@ -1849,6 +1898,11 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().UpsertLabel(gomock.Any(), gomock.Any()).Times(60).Return(fake.NewTowerLabel(), nil) mockVMService.EXPECT().AddLabelsToVM(gomock.Any(), gomock.Any()).Times(20) mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Times(12).Return(nil, nil) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Times(20).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Times(20).Return(vmVolume, nil) // test elfMachine has one network device with DHCP type elfMachine.Spec.Network.Devices = []infrav1.NetworkDeviceSpec{ @@ -2278,6 +2332,11 @@ var _ = Describe("ElfMachineReconciler", func() { mockVMService.EXPECT().UpsertLabel(gomock.Any(), gomock.Any()).Times(3).Return(fake.NewTowerLabel(), nil) mockVMService.EXPECT().AddLabelsToVM(gomock.Any(), gomock.Any()).Times(1) mockVMService.EXPECT().FindVMsByName(elfMachine.Name).Return(nil, nil) + vmVolume := fake.NewVMVolume(elfMachine) + vmDisk := fake.NewVMDisk(vmVolume) + vm.VMDisks = []*models.NestedVMDisk{{ID: vmDisk.ID}} + mockVMService.EXPECT().GetVMDisks([]string{*vmDisk.ID}).Return([]*models.VMDisk{vmDisk}, nil) + mockVMService.EXPECT().GetVMVolume(*vmVolume.ID).Return(vmVolume, nil) reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} elfMachineKey := capiutil.ObjectKey(elfMachine) @@ -3235,6 +3294,32 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) }) + It("should set vm first boot timestamp", func() { + task := fake.NewTowerTask() + task.Status = models.NewTaskStatus(models.TaskStatusSUCCESSED) + task.Description = service.TowerString("Start VM") + elfMachine.Status.TaskRef = *task.ID + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctx, 
ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(elfCluster, cluster, elfMachine, machine, mockVMService) + machineContext.VMService = mockVMService + mockVMService.EXPECT().GetTask(elfMachine.Status.TaskRef).AnyTimes().Return(task, nil) + reconciler := &ElfMachineReconciler{ControllerManagerContext: ctrlMgrCtx, NewVMService: mockNewVMService} + ok, err := reconciler.reconcileVMTask(ctx, machineContext, nil) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + firstBootTimestamp := elfMachine.GetVMFirstBootTimestamp() + Expect(firstBootTimestamp).NotTo(BeNil()) + + elfMachine.Status.TaskRef = *task.ID + ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + firstBootTimestamp2 := elfMachine.GetVMFirstBootTimestamp() + Expect(firstBootTimestamp2).NotTo(BeNil()) + Expect(firstBootTimestamp2).To(Equal(firstBootTimestamp)) + }) + It("should handle failed/succeeded task", func() { elfMachine.Spec.GPUDevices = []infrav1.GPUPassthroughDeviceSpec{{Model: "A16", Count: 1}} @@ -3318,6 +3403,24 @@ var _ = Describe("ElfMachineReconciler", func() { ok, _ = acquireTicketForCreateVM(elfMachine.Name, true) Expect(ok).To(BeFalse()) + // Edit VM disk + task.Status = models.NewTaskStatus(models.TaskStatusFAILED) + task.Description = service.TowerString(fmt.Sprintf("Edit VM %s disk", *vm.Name)) + task.ErrorMessage = service.TowerString(service.VMDuplicateError) + elfMachine.Status.TaskRef = *task.ID + ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) + Expect(ok).Should(BeTrue()) + Expect(err).ShouldNot(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.TaskFailureReason}}) + + elfMachine.Status.TaskRef = *task.ID + elfMachine.Status.Conditions = nil + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.ExpandingVMDiskReason, clusterv1.ConditionSeverityInfo, "") + ok, err = reconciler.reconcileVMTask(ctx, machineContext, nil) + Expect(ok).Should(BeTrue()) + Expect(err).ShouldNot(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.ExpandingVMDiskFailedReason}}) + // GPU gpuDeviceInfo := &service.GPUDeviceInfo{ID: "gpu", AllocatedCount: 0, AvailableCount: 1} gpuDeviceInfos := []*service.GPUDeviceInfo{gpuDeviceInfo} diff --git a/controllers/elfmachinetemplate_controller.go b/controllers/elfmachinetemplate_controller.go new file mode 100644 index 00000000..558fe747 --- /dev/null +++ b/controllers/elfmachinetemplate_controller.go @@ -0,0 +1,542 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controllers + +import ( + goctx "context" + "fmt" + + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apitypes "k8s.io/apimachinery/pkg/types" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/klog/v2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + capiutil "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/annotations" + "sigs.k8s.io/cluster-api/util/collections" + "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/patch" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + ctrlmgr "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/config" + "github.com/smartxworks/cluster-api-provider-elf/pkg/context" + kcputil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/kcp" + machineutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/machine" + mdutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/md" +) + +const ( + anotherMachineHotUpdateInProgressMessage = "another machine resources hot updating is in progress" +) + +// ElfMachineTemplateReconciler reconciles a ElfMachineTemplate object. +type ElfMachineTemplateReconciler struct { + *context.ControllerManagerContext +} + +//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates/status,verbs=get;update;patch +//+kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates/finalizers,verbs=update + +// AddMachineTemplateControllerToManager adds the ElfMachineTemplate controller to the provided +// manager. +func AddMachineTemplateControllerToManager(ctx goctx.Context, ctrlMgrCtx *context.ControllerManagerContext, mgr ctrlmgr.Manager, options controller.Options) error { + var ( + controlledType = &infrav1.ElfMachineTemplate{} + ) + + reconciler := &ElfMachineTemplateReconciler{ + ControllerManagerContext: ctrlMgrCtx, + } + + return ctrl.NewControllerManagedBy(mgr). + // Watch the controlled, infrastructure resource. + For(controlledType). + WithOptions(options). + // WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(ctrl.LoggerFrom(ctx), emtCtx.WatchFilterValue)). + Complete(reconciler) +} + +func (r *ElfMachineTemplateReconciler) Reconcile(ctx goctx.Context, req ctrl.Request) (_ ctrl.Result, reterr error) { + log := ctrl.LoggerFrom(ctx) + + // Get the ElfMachineTemplate resource for this request. + var elfMachineTemplate infrav1.ElfMachineTemplate + if err := r.Client.Get(ctx, req.NamespacedName, &elfMachineTemplate); err != nil { + if apierrors.IsNotFound(err) { + log.Info("ElfMachineTemplate not found, won't reconcile", "key", req.NamespacedName) + + return reconcile.Result{}, nil + } + + return reconcile.Result{}, err + } + + // Fetch the CAPI Cluster. 
+	cluster, err := capiutil.GetOwnerCluster(ctx, r.Client, elfMachineTemplate.ObjectMeta)
+	if err != nil {
+		return reconcile.Result{}, err
+	}
+	if cluster == nil {
+		log.Info("Waiting for Cluster Controller to set OwnerRef on ElfMachineTemplate")
+
+		return reconcile.Result{}, nil
+	}
+	log = log.WithValues("Cluster", klog.KObj(cluster))
+	ctx = ctrl.LoggerInto(ctx, log)
+
+	if annotations.IsPaused(cluster, &elfMachineTemplate) {
+		log.V(4).Info("ElfMachineTemplate linked to a cluster that is paused")
+
+		return reconcile.Result{}, nil
+	}
+
+	// Fetch the ElfCluster
+	var elfCluster infrav1.ElfCluster
+	if err := r.Client.Get(ctx, client.ObjectKey{
+		Namespace: cluster.Namespace,
+		Name:      cluster.Spec.InfrastructureRef.Name,
+	}, &elfCluster); err != nil {
+		if apierrors.IsNotFound(err) {
+			log.Info("ElfMachineTemplate Waiting for ElfCluster")
+			return reconcile.Result{}, nil
+		}
+
+		return reconcile.Result{}, err
+	}
+	log = log.WithValues("ElfCluster", klog.KObj(&elfCluster))
+	ctx = ctrl.LoggerInto(ctx, log)
+
+	// Create the machine template context for this request.
+	emtCtx := &context.MachineTemplateContext{
+		Cluster:            cluster,
+		ElfCluster:         &elfCluster,
+		ElfMachineTemplate: &elfMachineTemplate,
+	}
+
+	// Handle deleted machine templates
+	if !elfMachineTemplate.ObjectMeta.DeletionTimestamp.IsZero() {
+		return ctrl.Result{}, nil
+	}
+
+	// Handle non-deleted machine templates
+	return r.reconcileMachineResources(ctx, emtCtx)
+}
+
+// reconcileMachineResources ensures that the resources(disk capacity) of the
+// virtual machines are the same as expected by ElfMachine.
+// TODO: CPU and memory will be supported in the future.
+func (r *ElfMachineTemplateReconciler) reconcileMachineResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (reconcile.Result, error) {
+	// A disk size of 0 means the disk size is the same as that of the virtual machine template.
+	// So if the capacity is 0, the disk size has not changed and we return directly.
+	if emtCtx.ElfMachineTemplate.Spec.Template.Spec.DiskGiB == 0 {
+		return reconcile.Result{}, nil
+	}
+
+	if ok, err := r.reconcileCPResources(ctx, emtCtx); err != nil {
+		return reconcile.Result{}, err
+	} else if !ok {
+		return reconcile.Result{RequeueAfter: config.DefaultRequeueTimeout}, nil
+	}
+
+	if ok, err := r.reconcileWorkerResources(ctx, emtCtx); err != nil {
+		return reconcile.Result{}, err
+	} else if !ok {
+		return reconcile.Result{RequeueAfter: config.DefaultRequeueTimeout}, nil
+	}
+
+	return reconcile.Result{}, nil
+}
+
+// reconcileCPResources ensures that the resources(disk capacity) of the
+// control plane virtual machines are the same as expected by ElfMachine.
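+// Control plane machines are hot updated one at a time: ElfMachines already marked
+// with the ResourcesHotUpdatedCondition are allowed to finish first, at most one new
+// ElfMachine is picked for updating once the preflight checks pass, and the remaining
+// ones are only marked as waiting for the hot update.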
+func (r *ElfMachineTemplateReconciler) reconcileCPResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (bool, error) {
+	log := ctrl.LoggerFrom(ctx)
+
+	var kcp controlplanev1.KubeadmControlPlane
+	if err := r.Client.Get(ctx, apitypes.NamespacedName{
+		Namespace: emtCtx.Cluster.Spec.ControlPlaneRef.Namespace,
+		Name:      emtCtx.Cluster.Spec.ControlPlaneRef.Name,
+	}, &kcp); err != nil {
+		return false, err
+	}
+
+	if kcp.Spec.MachineTemplate.InfrastructureRef.Namespace != emtCtx.ElfMachineTemplate.Namespace ||
+		kcp.Spec.MachineTemplate.InfrastructureRef.Name != emtCtx.ElfMachineTemplate.Name {
+		return true, nil
+	}
+
+	elfMachines, err := machineutil.GetControlPlaneElfMachinesInCluster(ctx, r.Client, emtCtx.Cluster.Namespace, emtCtx.Cluster.Name)
+	if err != nil {
+		return false, err
+	}
+
+	updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := r.selectResourcesNotUpToDateElfMachines(ctx, emtCtx.ElfMachineTemplate, elfMachines)
+	if err != nil {
+		return false, err
+	} else if len(updatingResourcesElfMachines) == 0 && len(needUpdatedResourcesElfMachines) == 0 {
+		log.V(4).Info(fmt.Sprintf("ElfMachines resources of kcp %s are up to date", klog.KObj(&kcp)))
+
+		return true, nil
+	}
+
+	// Only one CP ElfMachine is allowed to update resources at the same time.
+	if len(updatingResourcesElfMachines) > 0 {
+		log.V(1).Info("Waiting for control plane ElfMachines to be updated resources", "updatingCount", len(updatingResourcesElfMachines), "needUpdatedCount", len(needUpdatedResourcesElfMachines))
+
+		if err := r.markElfMachinesResourcesNotUpToDate(ctx, emtCtx.ElfMachineTemplate, needUpdatedResourcesElfMachines); err != nil {
+			return false, err
+		}
+
+		return false, nil
+	}
+
+	checksPassed, err := r.preflightChecksForCP(ctx, emtCtx, &kcp)
+	if err != nil {
+		return false, err
+	}
+
+	var toBeUpdatedElfMachine *infrav1.ElfMachine
+	if checksPassed {
+		toBeUpdatedElfMachine = needUpdatedResourcesElfMachines[0]
+		needUpdatedResourcesElfMachines = needUpdatedResourcesElfMachines[1:]
+	}
+
+	if err := r.markElfMachinesResourcesNotUpToDate(ctx, emtCtx.ElfMachineTemplate, needUpdatedResourcesElfMachines); err != nil {
+		return false, err
+	}
+
+	updatingCount := 0
+	if toBeUpdatedElfMachine != nil {
+		updatingCount = 1
+		if err := r.markElfMachinesToBeUpdatedResources(ctx, emtCtx.ElfMachineTemplate, []*infrav1.ElfMachine{toBeUpdatedElfMachine}); err != nil {
+			return false, err
+		}
+	}
+
+	log.V(1).Info("Waiting for control plane ElfMachines to be updated resources", "updatingCount", updatingCount, "needUpdatedCount", len(needUpdatedResourcesElfMachines))
+
+	return false, err
+}
+
+// preflightChecksForCP checks if the control plane is stable before proceeding with a resource update operation,
+// where stable means that:
+// - KCP is not in rolling update.
+// - There are no machine deletions in progress.
+// - All the health conditions on KCP are true.
+// - All the health conditions on the control plane machines are true.
+// If the control plane is not passing preflight checks, it requeues.
+func (r *ElfMachineTemplateReconciler) preflightChecksForCP(ctx goctx.Context, emtCtx *context.MachineTemplateContext, kcp *controlplanev1.KubeadmControlPlane) (bool, error) {
+	log := ctrl.LoggerFrom(ctx)
+	// During the rolling update process, it is impossible to determine which
+	// machines are new and which are old. Complete the rolling update
+	// first and then update the resources to avoid updating resources for old
+	// machines that are about to be deleted.
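+	// As exercised in the controller tests in this patch, the KCP is treated as being
+	// in a rolling update while status.updatedReplicas still lags spec.replicas.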
+ if kcputil.IsKCPInRollingUpdate(kcp) { + log.Info("KCP rolling update in progress, skip updating resources") + + return false, nil + } + + cpMachines, err := machineutil.GetControlPlaneMachinesForCluster(ctx, r.Client, emtCtx.Cluster) + if err != nil { + return false, err + } + + machines := collections.FromMachines(cpMachines...) + deletingMachines := machines.Filter(collections.HasDeletionTimestamp) + if len(deletingMachines) > 0 { + log.Info("Waiting for machines to be deleted", "machines", deletingMachines.Names()) + + return false, nil + } + + allMachineHealthConditions := []clusterv1.ConditionType{ + controlplanev1.MachineAPIServerPodHealthyCondition, + controlplanev1.MachineControllerManagerPodHealthyCondition, + controlplanev1.MachineSchedulerPodHealthyCondition, + controlplanev1.MachineEtcdPodHealthyCondition, + controlplanev1.MachineEtcdMemberHealthyCondition, + } + machineErrors := []error{} + for _, machine := range machines { + if machine.Status.NodeRef == nil { + // The conditions will only ever be set on a Machine if we're able to correlate a Machine to a Node. + // Correlating Machines to Nodes requires the nodeRef to be set. + // Instead of confusing users with errors about that the conditions are not set, let's point them + // towards the unset nodeRef (which is the root cause of the conditions not being there). + machineErrors = append(machineErrors, errors.Errorf("Machine %s does not have a corresponding Node yet (Machine.status.nodeRef not set)", machine.Name)) + } else { + for _, condition := range allMachineHealthConditions { + if err := preflightCheckCondition("Machine", machine, condition); err != nil { + machineErrors = append(machineErrors, err) + } + } + } + } + + if len(machineErrors) > 0 { + aggregatedError := kerrors.NewAggregate(machineErrors) + log.Info("Waiting for control plane to pass preflight checks", "failures", aggregatedError.Error()) + + return false, nil + } + + return true, nil +} + +func preflightCheckCondition(kind string, obj conditions.Getter, condition clusterv1.ConditionType) error { + c := conditions.Get(obj, condition) + if c == nil { + return errors.Errorf("%s %s does not have %s condition", kind, obj.GetName(), condition) + } + if c.Status == corev1.ConditionFalse { + return errors.Errorf("%s %s reports %s condition is false (%s, %s)", kind, obj.GetName(), condition, c.Severity, c.Message) + } + if c.Status == corev1.ConditionUnknown { + return errors.Errorf("%s %s reports %s condition is unknown (%s)", kind, obj.GetName(), condition, c.Message) + } + return nil +} + +// reconcileWorkerResources ensures that the resources(disk capacity) of the +// worker virtual machines are the same as expected by ElfMachine. +func (r *ElfMachineTemplateReconciler) reconcileWorkerResources(ctx goctx.Context, emtCtx *context.MachineTemplateContext) (bool, error) { + mds, err := machineutil.GetMDsForCluster(ctx, r.Client, emtCtx.Cluster.Namespace, emtCtx.Cluster.Name) + if err != nil { + return false, err + } + + allElfMachinesUpToDate := true + for i := 0; i < len(mds); i++ { + if emtCtx.ElfMachineTemplate.Name != mds[i].Spec.Template.Spec.InfrastructureRef.Name { + continue + } + + if ok, err := r.reconcileWorkerResourcesForMD(ctx, emtCtx, mds[i]); err != nil { + return false, err + } else if !ok { + allElfMachinesUpToDate = false + } + } + + return allElfMachinesUpToDate, nil +} + +// reconcileWorkerResourcesForMD ensures that the resources(disk capacity) of the +// worker virtual machines managed by the md are the same as expected by ElfMachine. 
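+// The number of ElfMachines hot updated concurrently is bounded by the MachineDeployment's
+// rolling update maxSurge (defaulted to 1 by getMaxSurge below), so a failure during the
+// update affects at most maxSurge machines at a time.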
+func (r *ElfMachineTemplateReconciler) reconcileWorkerResourcesForMD(ctx goctx.Context, emtCtx *context.MachineTemplateContext, md *clusterv1.MachineDeployment) (bool, error) {
+	log := ctrl.LoggerFrom(ctx)
+
+	elfMachines, err := machineutil.GetElfMachinesForMD(ctx, r.Client, emtCtx.Cluster, md)
+	if err != nil {
+		return false, err
+	}
+
+	updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := r.selectResourcesNotUpToDateElfMachines(ctx, emtCtx.ElfMachineTemplate, elfMachines)
+	if err != nil {
+		return false, err
+	} else if len(updatingResourcesElfMachines) == 0 && len(needUpdatedResourcesElfMachines) == 0 {
+		log.V(4).Info(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))
+
+		return true, nil
+	}
+
+	maxSurge := getMaxSurge(md)
+	checksPassed := r.preflightChecksForWorker(ctx, md, updatingResourcesElfMachines)
+
+	toBeUpdatedElfMachines, needUpdatedResourcesElfMachines := selectToBeUpdatedAndNeedUpdatedElfMachines(checksPassed, maxSurge, updatingResourcesElfMachines, needUpdatedResourcesElfMachines)
+
+	if err := r.markElfMachinesResourcesNotUpToDate(ctx, emtCtx.ElfMachineTemplate, needUpdatedResourcesElfMachines); err != nil {
+		return false, err
+	}
+
+	if err := r.markElfMachinesToBeUpdatedResources(ctx, emtCtx.ElfMachineTemplate, toBeUpdatedElfMachines); err != nil {
+		return false, err
+	}
+
+	log.V(1).Info("Waiting for worker ElfMachines to be updated resources", "md", md.Name, "updatingCount", len(updatingResourcesElfMachines)+len(toBeUpdatedElfMachines), "needUpdatedCount", len(needUpdatedResourcesElfMachines), "maxSurge", maxSurge)
+
+	return false, nil
+}
+
+func getMaxSurge(md *clusterv1.MachineDeployment) int {
+	maxSurge := mdutil.MaxSurge(*md)
+	if maxSurge <= 0 {
+		return 1
+	}
+
+	return int(maxSurge)
+}
+
+func selectToBeUpdatedAndNeedUpdatedElfMachines(
+	checksPassed bool, maxSurge int,
+	updatingResourcesElfMachines, needUpdatedResourcesElfMachines []*infrav1.ElfMachine,
+) ([]*infrav1.ElfMachine, []*infrav1.ElfMachine) {
+	var toBeUpdatedElfMachines, needUpdatedElfMachines []*infrav1.ElfMachine
+	if checksPassed {
+		toBeUpdatedCount := maxSurge - len(updatingResourcesElfMachines)
+		if toBeUpdatedCount > 0 {
+			if toBeUpdatedCount >= len(needUpdatedResourcesElfMachines) {
+				toBeUpdatedElfMachines = needUpdatedResourcesElfMachines
+				needUpdatedElfMachines = nil
+			} else {
+				toBeUpdatedElfMachines = needUpdatedResourcesElfMachines[:toBeUpdatedCount]
+				needUpdatedElfMachines = needUpdatedResourcesElfMachines[toBeUpdatedCount:]
+			}
+		} else {
+			needUpdatedElfMachines = needUpdatedResourcesElfMachines
+		}
+	} else {
+		needUpdatedElfMachines = needUpdatedResourcesElfMachines
+	}
+
+	return toBeUpdatedElfMachines, needUpdatedElfMachines
+}
+
+// preflightChecksForWorker checks if the worker is stable before proceeding with a resource update operation,
+// where stable means that:
+// - MD is not in rolling update.
+// - The number of machines updating resources is not greater than maxSurge.
+// - The number of unavailable machines is no greater than maxUnavailable.
+// If the worker is not passing preflight checks, it requeues.
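+// For example, with replicas=3, maxSurge=1 and maxUnavailable=1, at most one worker
+// ElfMachine is hot updated at a time, and updating pauses while more than one replica
+// is unavailable.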
+func (r *ElfMachineTemplateReconciler) preflightChecksForWorker(ctx goctx.Context, md *clusterv1.MachineDeployment, updatingResourcesElfMachines []*infrav1.ElfMachine) bool { + log := ctrl.LoggerFrom(ctx) + + if mdutil.IsMDInRollingUpdate(md) { + log.Info("MD rolling update in progress, skip updating resources", "md", md.Name) + + return false + } + + // Use maxSurge of rolling update to control the maximum number of concurrent + // update resources to avoid updating too many machines at the same time. + // If an exception occurs during the resource update process, all machines will + // not be affected. + if maxSurge := getMaxSurge(md); len(updatingResourcesElfMachines) >= maxSurge { + log.V(1).Info("Hot updated worker ElfMachine has reached the max number of concurrencies, so waiting for worker ElfMachines to be updated resources", "md", md.Name, "maxSurge", maxSurge, "updatingCount", len(updatingResourcesElfMachines)) + + return false + } + + maxUnavailable := mdutil.MaxUnavailable(*md) + if md.Status.UnavailableReplicas > maxUnavailable { + log.Info(fmt.Sprintf("MD unavailable replicas %d is greater than expected %d, skip updating resources", md.Status.UnavailableReplicas, maxUnavailable), "md", md.Name) + + return false + } + + return true +} + +// selectResourcesNotUpToDateElfMachines returns elfMachines whose resources are +// not as expected. +func (r *ElfMachineTemplateReconciler) selectResourcesNotUpToDateElfMachines(ctx goctx.Context, elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachines []*infrav1.ElfMachine) ([]*infrav1.ElfMachine, []*infrav1.ElfMachine, error) { + var updatingResourcesElfMachines []*infrav1.ElfMachine + var needUpdatedResourcesElfMachines []*infrav1.ElfMachine + for i := 0; i < len(elfMachines); i++ { + elfMachine := elfMachines[i] + + machine, err := capiutil.GetOwnerMachine(ctx, r.Client, elfMachine.ObjectMeta) + if err != nil { + return nil, nil, err + } + + // No need to update the resources of deleted and failed machines. + if machine == nil || + !machine.DeletionTimestamp.IsZero() || + clusterv1.MachinePhase(machine.Status.Phase) == clusterv1.MachinePhaseFailed { + continue + } + + if machineutil.IsUpdatingElfMachineResources(elfMachine) && + machineutil.IsResourcesUpToDate(elfMachineTemplate, elfMachine) { + updatingResourcesElfMachines = append(updatingResourcesElfMachines, elfMachine) + } else if machineutil.NeedUpdateElfMachineResources(elfMachineTemplate, elfMachine) { + needUpdatedResourcesElfMachines = append(needUpdatedResourcesElfMachines, elfMachine) + } + } + + return updatingResourcesElfMachines, needUpdatedResourcesElfMachines, nil +} + +// markElfMachinesToBeUpdatedResources synchronizes the expected resource values +// from the ElfMachineTemplate and marks the machines to be updated resources. +func (r *ElfMachineTemplateReconciler) markElfMachinesToBeUpdatedResources(ctx goctx.Context, elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachines []*infrav1.ElfMachine) error { + log := ctrl.LoggerFrom(ctx) + + for i := 0; i < len(elfMachines); i++ { + elfMachine := elfMachines[i] + + patchHelper, err := patch.NewHelper(elfMachine, r.Client) + if err != nil { + return err + } + + // Ensure resources are up to date. 
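+		// Copy the expected disk size from the ElfMachineTemplate into the ElfMachine spec;
+		// the ElfMachine controller then performs the actual VM disk and root partition
+		// expansion once it observes the condition set below.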
+ orignalDiskGiB := elfMachine.Spec.DiskGiB + elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") + + log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for updating resources(disk: %d -> %d)", orignalDiskGiB, elfMachine.Spec.DiskGiB), "elfMachine", elfMachine.Name) + + if err := patchHelper.Patch(ctx, elfMachine); err != nil { + return errors.Wrapf(err, "failed to patch ElfMachine %s to mark for updating resources", elfMachine.Name) + } + } + + return nil +} + +// markElfMachinesResourcesNotUpToDate synchronizes the expected resource values +// from the ElfMachineTemplate and marks the machines waiting for updated resources. +func (r *ElfMachineTemplateReconciler) markElfMachinesResourcesNotUpToDate(ctx goctx.Context, elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachines []*infrav1.ElfMachine) error { + log := ctrl.LoggerFrom(ctx) + + for i := 0; i < len(elfMachines); i++ { + elfMachine := elfMachines[i] + if machineutil.IsResourcesUpToDate(elfMachineTemplate, elfMachine) { + continue + } + + patchHelper, err := patch.NewHelper(elfMachine, r.Client) + if err != nil { + return err + } + + // Ensure resources are up to date. + orignalDiskGiB := elfMachine.Spec.DiskGiB + elfMachine.Spec.DiskGiB = elfMachineTemplate.Spec.Template.Spec.DiskGiB + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, anotherMachineHotUpdateInProgressMessage) + + log.Info(fmt.Sprintf("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources(disk: %d -> %d)", orignalDiskGiB, elfMachine.Spec.DiskGiB), "elfMachine", elfMachine.Name) + + if err := patchHelper.Patch(ctx, elfMachine); err != nil { + return errors.Wrapf(err, "failed to patch ElfMachine %s to mark for resources not up to date", elfMachine.Name) + } + } + + return nil +} diff --git a/controllers/elfmachinetemplate_controller_test.go b/controllers/elfmachinetemplate_controller_test.go new file mode 100644 index 00000000..e3b5605a --- /dev/null +++ b/controllers/elfmachinetemplate_controller_test.go @@ -0,0 +1,507 @@ +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "bytes" + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + "k8s.io/utils/pointer" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + capiutil "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/conditions" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + ctrlutil "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/test/fake" +) + +var _ = Describe("ElfMachineTemplateReconciler", func() { + var ( + elfCluster *infrav1.ElfCluster + cluster *clusterv1.Cluster + elfMachine *infrav1.ElfMachine + machine *clusterv1.Machine + secret *corev1.Secret + logBuffer *bytes.Buffer + ) + + BeforeEach(func() { + logBuffer = new(bytes.Buffer) + klog.SetOutput(logBuffer) + + elfCluster, cluster, elfMachine, machine, secret = fake.NewClusterAndMachineObjects() + }) + + AfterEach(func() { + }) + + Context("Reconcile a ElfMachineTemplate", func() { + It("Reconcile", func() { + emt := fake.NewElfMachineTemplate() + emt.OwnerReferences = append(emt.OwnerReferences, metav1.OwnerReference{Kind: fake.ClusterKind, APIVersion: clusterv1.GroupVersion.String(), Name: cluster.Name, UID: "blah"}) + kcp := fake.NewKCP() + kcp.Spec.MachineTemplate = controlplanev1.KubeadmControlPlaneMachineTemplate{ + InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, + } + md := fake.NewMD() + md.Labels = map[string]string{clusterv1.ClusterNameLabel: cluster.Name} + md.Spec.Template = clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, + }, + } + cluster.Spec.ControlPlaneRef = &corev1.ObjectReference{Namespace: kcp.Namespace, Name: kcp.Name} + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, emt, kcp, md) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + emtKey := capiutil.ObjectKey(emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: emtKey}) + Expect(result).To(BeZero()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of kcp %s are up to date", klog.KObj(kcp)))) + Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))) + + emt.Spec.Template.Spec.DiskGiB = 0 + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, emt, kcp, md) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + emtKey = capiutil.ObjectKey(emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: emtKey}) + Expect(result).To(BeZero()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of kcp %s are up to date", klog.KObj(kcp)))) + Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))) + }) + + It("should not error and not requeue the request without elfmachinetemplate", func() { + 
emt := fake.NewElfMachineTemplate() + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: capiutil.ObjectKey(emt)}) + Expect(result).To(BeZero()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("ElfMachineTemplate not found, won't reconcile")) + + emt.OwnerReferences = append(emt.OwnerReferences, metav1.OwnerReference{Kind: fake.ClusterKind, APIVersion: clusterv1.GroupVersion.String(), Name: cluster.Name, UID: "blah"}) + ctrlMgrCtx = fake.NewControllerManagerContext(cluster, elfMachine, machine, secret, emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: capiutil.ObjectKey(emt)}) + Expect(result).To(BeZero()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("ElfMachineTemplate Waiting for ElfCluster")) + }) + + It("should not error and not requeue the request when Cluster is paused", func() { + emt := fake.NewElfMachineTemplate() + emt.OwnerReferences = append(emt.OwnerReferences, metav1.OwnerReference{Kind: fake.ClusterKind, APIVersion: clusterv1.GroupVersion.String(), Name: cluster.Name, UID: "blah"}) + cluster.Spec.Paused = true + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, emt) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + emtKey := capiutil.ObjectKey(emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: emtKey}) + Expect(result).To(BeZero()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("ElfMachineTemplate linked to a cluster that is paused")) + }) + }) + + Context("reconcileWorkerResources", func() { + It("reconcileWorkerResources", func() { + emt := fake.NewElfMachineTemplate() + md := fake.NewMD() + md.Labels = map[string]string{clusterv1.ClusterNameLabel: cluster.Name} + md.Spec.Replicas = pointer.Int32(3) + md.Spec.Template = clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, + }, + } + md.Spec.Strategy = &clusterv1.MachineDeploymentStrategy{ + RollingUpdate: &clusterv1.MachineRollingUpdateDeployment{ + MaxSurge: intOrStrPtr(1), + MaxUnavailable: intOrStrPtr(1), + }, + } + fake.ToWorkerMachine(elfMachine, md) + fake.ToWorkerMachine(machine, md) + fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err := reconciler.reconcileWorkerResources(ctx, mtCtx) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring(fmt.Sprintf("ElfMachines resources of md %s are up to date", klog.KObj(md)))) + + logBuffer.Reset() + elfMachine.Spec.DiskGiB -= 1 + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, 
elfCluster, cluster, elfMachine, machine) + mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.reconcileWorkerResources(ctx, mtCtx) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).NotTo(ContainSubstring("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources")) + Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for worker ElfMachines to be updated resources")) + + // logBuffer.Reset() + // elfMachine.Spec.DiskGiB -= 1 + // updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster) + // fake.ToWorkerMachine(updatingElfMachine, md) + // fake.ToWorkerMachine(updatingMachine, md) + // fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt) + // ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, md, updatingElfMachine, updatingMachine) + // fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + // fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine) + // mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) + // reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + // ok, err = reconciler.reconcileWorkerResources(ctx, mtCtx) + // Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) + }) + + It("selectToBeUpdatedAndNeedUpdatedElfMachines", func() { + elfMachine1, _ := fake.NewMachineObjects(elfCluster, cluster) + elfMachine2, _ := fake.NewMachineObjects(elfCluster, cluster) + + toBeUpdated, needUpdated := selectToBeUpdatedAndNeedUpdatedElfMachines(false, 1, []*infrav1.ElfMachine{}, []*infrav1.ElfMachine{elfMachine1, elfMachine2}) + Expect(toBeUpdated).To(BeEmpty()) + Expect(needUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine1, elfMachine2})) + + toBeUpdated, needUpdated = selectToBeUpdatedAndNeedUpdatedElfMachines(true, 1, []*infrav1.ElfMachine{elfMachine1}, []*infrav1.ElfMachine{elfMachine2}) + Expect(toBeUpdated).To(BeEmpty()) + Expect(needUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine2})) + + toBeUpdated, needUpdated = selectToBeUpdatedAndNeedUpdatedElfMachines(true, 2, []*infrav1.ElfMachine{elfMachine1}, []*infrav1.ElfMachine{elfMachine2}) + Expect(toBeUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine2})) + Expect(needUpdated).To(BeEmpty()) + + toBeUpdated, needUpdated = selectToBeUpdatedAndNeedUpdatedElfMachines(true, 1, []*infrav1.ElfMachine{}, []*infrav1.ElfMachine{elfMachine1, elfMachine2}) + Expect(toBeUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine1})) + Expect(needUpdated).To(Equal([]*infrav1.ElfMachine{elfMachine2})) + }) + }) + + Context("reconcileCPResources", func() { + It("reconcileCPResources", func() { + emt := fake.NewElfMachineTemplate() + kcp := fake.NewKCP() + kcp.Spec.MachineTemplate = controlplanev1.KubeadmControlPlaneMachineTemplate{ + InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: "notfoud"}, + } + cluster.Spec.ControlPlaneRef = &corev1.ObjectReference{Namespace: kcp.Namespace, Name: kcp.Name} + elfMachine.Spec.DiskGiB -= 1 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, 
kcp) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err := reconciler.reconcileCPResources(ctx, mtCtx) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + + kcp.Spec.MachineTemplate = controlplanev1.KubeadmControlPlaneMachineTemplate{ + InfrastructureRef: corev1.ObjectReference{Namespace: emt.Namespace, Name: emt.Name}, + } + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.reconcileCPResources(ctx, mtCtx) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + + logBuffer.Reset() + updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster) + fake.ToControlPlaneMachine(updatingElfMachine, kcp) + fake.ToControlPlaneMachine(updatingMachine, kcp) + fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt) + conditions.MarkFalse(updatingElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp, + updatingElfMachine, updatingMachine, + ) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine) + mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.reconcileCPResources(ctx, mtCtx) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane ElfMachines to be updated resources")) + + logBuffer.Reset() + kcp.Spec.Replicas = pointer.Int32(3) + kcp.Status.Replicas = 3 + kcp.Status.UpdatedReplicas = 2 + fake.ToControlPlaneMachine(elfMachine, kcp) + fake.ToControlPlaneMachine(machine, kcp) + elfMachine.Spec.DiskGiB -= 1 + machine.Status.NodeRef = &corev1.ObjectReference{} + conditions.MarkTrue(machine, controlplanev1.MachineAPIServerPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineControllerManagerPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineSchedulerPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineEtcdPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.reconcileCPResources(ctx, mtCtx) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("KCP rolling update in progress, skip updating resources")) + Expect(logBuffer.String()).NotTo(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) + + logBuffer.Reset() + kcp.Status.UpdatedReplicas = 3 + ctrlMgrCtx = 
fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, kcp) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx = newMachineTemplateContext(elfCluster, cluster, emt) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.reconcileCPResources(ctx, mtCtx) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane ElfMachines to be updated resources")) + }) + }) + + Context("preflightChecksForCP", func() { + It("should return false if KCP rolling update in progress", func() { + emt := fake.NewElfMachineTemplate() + kcp := fake.NewKCP() + kcp.Spec.Replicas = pointer.Int32(3) + kcp.Status.Replicas = 3 + kcp.Status.UpdatedReplicas = 2 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err := reconciler.preflightChecksForCP(ctx, mtCtx, kcp) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("KCP rolling update in progress, skip updating resources")) + }) + + It("should return false if has deleting or failed machine", func() { + emt := fake.NewElfMachineTemplate() + kcp := fake.NewKCP() + kcp.Spec.Replicas = pointer.Int32(3) + kcp.Status.Replicas = 3 + kcp.Status.UpdatedReplicas = 3 + fake.ToControlPlaneMachine(elfMachine, kcp) + fake.ToControlPlaneMachine(machine, kcp) + ctrlutil.AddFinalizer(machine, infrav1.MachineFinalizer) + machine.DeletionTimestamp = &metav1.Time{Time: time.Now().UTC()} + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err := reconciler.preflightChecksForCP(ctx, mtCtx, kcp) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for machines to be deleted")) + + logBuffer.Reset() + machine.DeletionTimestamp = nil + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.preflightChecksForCP(ctx, mtCtx, kcp) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane to pass preflight checks")) + + logBuffer.Reset() + machine.Status.NodeRef = &corev1.ObjectReference{} + conditions.MarkFalse(machine, controlplanev1.MachineEtcdPodHealthyCondition, controlplanev1.PodInspectionFailedReason, clusterv1.ConditionSeverityInfo, "error") + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + reconciler = 
&ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err = reconciler.preflightChecksForCP(ctx, mtCtx, kcp) + Expect(ok).To(BeFalse()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Waiting for control plane to pass preflight checks")) + }) + + It("should return true", func() { + emt := fake.NewElfMachineTemplate() + kcp := fake.NewKCP() + kcp.Spec.Replicas = pointer.Int32(3) + kcp.Status.Replicas = 3 + kcp.Status.UpdatedReplicas = 3 + fake.ToControlPlaneMachine(elfMachine, kcp) + fake.ToControlPlaneMachine(machine, kcp) + machine.Status.NodeRef = &corev1.ObjectReference{} + conditions.MarkTrue(machine, controlplanev1.MachineAPIServerPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineControllerManagerPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineSchedulerPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineEtcdPodHealthyCondition) + conditions.MarkTrue(machine, controlplanev1.MachineEtcdMemberHealthyCondition) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + mtCtx := newMachineTemplateContext(elfCluster, cluster, emt) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok, err := reconciler.preflightChecksForCP(ctx, mtCtx, kcp) + Expect(ok).To(BeTrue()) + Expect(err).NotTo(HaveOccurred()) + }) + }) + + Context("preflightChecksForWorker", func() { + It("should return false if MD rolling update in progress", func() { + md := fake.NewMD() + fake.ToWorkerMachine(elfMachine, md) + fake.ToWorkerMachine(machine, md) + md.Spec.Replicas = pointer.Int32(3) + md.Status.Replicas = 3 + md.Status.UpdatedReplicas = 2 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok := reconciler.preflightChecksForWorker(ctx, md, nil) + Expect(ok).To(BeFalse()) + Expect(logBuffer.String()).To(ContainSubstring("MD rolling update in progress, skip updating resources")) + }) + + It("should check maxSurge", func() { + md := fake.NewMD() + fake.ToWorkerMachine(elfMachine, md) + fake.ToWorkerMachine(machine, md) + md.Spec.Strategy = &clusterv1.MachineDeploymentStrategy{ + RollingUpdate: &clusterv1.MachineRollingUpdateDeployment{MaxSurge: intOrStrPtr(1)}, + } + md.Spec.Replicas = pointer.Int32(3) + md.Status.Replicas = 3 + md.Status.UpdatedReplicas = 3 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok := reconciler.preflightChecksForWorker(ctx, md, []*infrav1.ElfMachine{elfMachine}) + Expect(ok).To(BeFalse()) + Expect(logBuffer.String()).To(ContainSubstring("Hot updated worker ElfMachine has reached the max number of concurrencies, so waiting for worker ElfMachines to be updated resources")) + + logBuffer.Reset() + md.Status.UnavailableReplicas = 3 + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler = 
&ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok = reconciler.preflightChecksForWorker(ctx, md, []*infrav1.ElfMachine{}) + Expect(ok).To(BeFalse()) + Expect(logBuffer.String()).To(ContainSubstring("MD unavailable replicas")) + + md.Status.UnavailableReplicas = 0 + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + ok = reconciler.preflightChecksForWorker(ctx, md, []*infrav1.ElfMachine{}) + Expect(ok).To(BeTrue()) + }) + }) + + Context("selectResourcesNotUpToDateElfMachines", func() { + It("should return updating/needUpdated resources elfMachines", func() { + emt := fake.NewElfMachineTemplate() + upToDateElfMachine, upToDateMachine := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(upToDateElfMachine, emt) + noUpToDateElfMachine, noUpToDateMachine := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(noUpToDateElfMachine, emt) + noUpToDateElfMachine.Spec.DiskGiB -= 1 + updatingElfMachine, updatingMachine := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(updatingElfMachine, emt) + conditions.MarkFalse(updatingElfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") + failedElfMachine, failedMachine := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(failedElfMachine, emt) + failedElfMachine.Spec.DiskGiB -= 1 + failedMachine.Status.Phase = string(clusterv1.MachinePhaseFailed) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret, + upToDateElfMachine, upToDateMachine, + noUpToDateElfMachine, noUpToDateMachine, + updatingElfMachine, updatingMachine, + failedElfMachine, failedMachine, + ) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, upToDateElfMachine, upToDateMachine) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, noUpToDateElfMachine, noUpToDateMachine) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, updatingElfMachine, updatingMachine) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, failedElfMachine, failedMachine) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + elfMachines := []*infrav1.ElfMachine{upToDateElfMachine, noUpToDateElfMachine, updatingElfMachine, failedElfMachine} + updatingResourcesElfMachines, needUpdatedResourcesElfMachines, err := reconciler.selectResourcesNotUpToDateElfMachines(ctx, emt, elfMachines) + Expect(err).NotTo(HaveOccurred()) + Expect(updatingResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{updatingElfMachine})) + Expect(needUpdatedResourcesElfMachines).To(Equal([]*infrav1.ElfMachine{noUpToDateElfMachine})) + }) + }) + + Context("markElfMachinesToBeUpdatedResources", func() { + It("should mark resources to be updated", func() { + emt := fake.NewElfMachineTemplate() + fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) + elfMachine.Spec.DiskGiB -= 1 + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler := 
&ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + err := reconciler.markElfMachinesToBeUpdatedResources(ctx, emt, []*infrav1.ElfMachine{elfMachine}) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for updating resources")) + elfMachineKey := client.ObjectKey{Namespace: elfMachine.Namespace, Name: elfMachine.Name} + Eventually(func() bool { + _ = reconciler.Client.Get(ctx, elfMachineKey, elfMachine) + return elfMachine.Spec.DiskGiB == emt.Spec.Template.Spec.DiskGiB + }, timeout).Should(BeTrue()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForResourcesHotUpdateReason}}) + }) + }) + + Context("markElfMachinesResourcesNotUpToDate", func() { + It("should mark resources not up to date", func() { + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + emt := fake.NewElfMachineTemplate() + fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler := &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + err := reconciler.markElfMachinesResourcesNotUpToDate(ctx, emt, []*infrav1.ElfMachine{elfMachine}) + Expect(err).NotTo(HaveOccurred()) + expectConditions(elfMachine, []conditionAssertion{}) + + logBuffer.Reset() + elfMachine.Spec.DiskGiB -= 1 + ctrlMgrCtx = fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, machine, secret) + fake.InitOwnerReferences(ctx, ctrlMgrCtx, elfCluster, cluster, elfMachine, machine) + reconciler = &ElfMachineTemplateReconciler{ControllerManagerContext: ctrlMgrCtx} + err = reconciler.markElfMachinesResourcesNotUpToDate(ctx, emt, []*infrav1.ElfMachine{elfMachine}) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Resources of ElfMachine is not up to date, marking for resources not up to date and waiting for hot updating resources")) + elfMachineKey := client.ObjectKey{Namespace: elfMachine.Namespace, Name: elfMachine.Name} + Eventually(func() bool { + _ = reconciler.Client.Get(ctx, elfMachineKey, elfMachine) + return elfMachine.Spec.DiskGiB == emt.Spec.Template.Spec.DiskGiB + }, timeout).Should(BeTrue()) + expectConditions(elfMachine, []conditionAssertion{{infrav1.ResourcesHotUpdatedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.WaitingForResourcesHotUpdateReason}}) + Expect(conditions.GetMessage(elfMachine, infrav1.ResourcesHotUpdatedCondition)).To(Equal(anotherMachineHotUpdateInProgressMessage)) + }) + }) +}) diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 2e03414e..a16dc9f0 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -25,8 +25,10 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" utilruntime "k8s.io/apimachinery/pkg/util/runtime" cgscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" @@ -47,8 +49,9 @@ const ( ) var ( - testEnv *helpers.TestEnvironment - ctx = ctrl.SetupSignalHandler() + testEnv *helpers.TestEnvironment + ctx = ctrl.SetupSignalHandler() + unexpectedError = errors.New("unexpected error") ) func TestControllers(t *testing.T) { @@ -136,6 +139,16 @@ func newMachineContext( } } +func newMachineTemplateContext( + elfCluster *infrav1.ElfCluster, cluster *clusterv1.Cluster, + emt *infrav1.ElfMachineTemplate) *context.MachineTemplateContext { + return &context.MachineTemplateContext{ + Cluster: cluster, + ElfCluster: elfCluster, + ElfMachineTemplate: emt, + } +} + type conditionAssertion struct { conditionType clusterv1.ConditionType status corev1.ConditionStatus @@ -154,3 +167,8 @@ func expectConditions(getter conditions.Getter, expected []conditionAssertion) { Expect(actual.Reason).To(Equal(c.reason)) } } + +func intOrStrPtr(i int32) *intstr.IntOrString { + res := intstr.FromInt(int(i)) + return &res +} diff --git a/go.mod b/go.mod index dbeab8a3..d7522f29 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pkg/errors v0.9.1 github.com/smartxworks/cloudtower-go-sdk/v2 v2.13.1-0.20231116110941-d411454388af + github.com/smartxworks/host-config-agent-api v0.0.0-20240410021405-be1517a07889 golang.org/x/mod v0.13.0 k8s.io/api v0.28.4 k8s.io/apiextensions-apiserver v0.28.4 diff --git a/go.sum b/go.sum index 1c599876..304afebf 100644 --- a/go.sum +++ b/go.sum @@ -517,6 +517,8 @@ github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0 github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/smartxworks/cloudtower-go-sdk/v2 v2.13.1-0.20231116110941-d411454388af h1:rV7FO8PZAzurE2zLEtXVrcKZkF544w98rXjvv2bDaqI= github.com/smartxworks/cloudtower-go-sdk/v2 v2.13.1-0.20231116110941-d411454388af/go.mod h1:X6R9+L438SMnLJXykSCV3fJ+AZul0hlyjITsZgrSRtM= +github.com/smartxworks/host-config-agent-api v0.0.0-20240410021405-be1517a07889 h1:kXlP3d9BEE6q31cR3qAehK0BKNVuCZ6jT2t+Jf5Gwrs= +github.com/smartxworks/host-config-agent-api v0.0.0-20240410021405-be1517a07889/go.mod h1:oexROTOhF0Ium/iTwRbwXwZy+eGkzYE5a5qe3HvvKug= github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= github.com/soheilhy/cmux v0.1.5/go.mod h1:T7TcVDs9LWfQgPlPsdngu6I6QIoyIFZDDC6sNE1GqG0= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= diff --git a/main.go b/main.go index 46318221..c84c5686 100644 --- a/main.go +++ b/main.go @@ -63,8 +63,9 @@ var ( webhookOpts webhook.Options watchNamespace string - elfClusterConcurrency int - elfMachineConcurrency int + elfClusterConcurrency int + elfMachineConcurrency int + elfMachineTemplateConcurrency int tlsOptions = capiflags.TLSOptions{} diagnosticsOptions = capiflags.DiagnosticsOptions{} @@ -93,6 +94,9 @@ func InitFlags(fs *pflag.FlagSet) { fs.IntVar(&elfMachineConcurrency, "max-elfmachine-concurrent-reconciles", 10, "Number of ELF machines to process simultaneously") + fs.IntVar(&elfMachineTemplateConcurrency, "max-elfmachinetemplate-concurrent-reconciles", 10, + "Number of ELF machine templates to process simultaneously") + fs.StringVar(&managerOpts.PodName, "pod-name", defaultPodName, "The name of the 
pod running the controller manager.") @@ -197,6 +201,16 @@ func main() { // Create a function that adds all of the controllers and webhooks to the manager. addToManager := func(ctx goctx.Context, ctrlMgrCtx *context.ControllerManagerContext, mgr ctrlmgr.Manager) error { if os.Getenv("ENABLE_WEBHOOKS") != "false" { + if err := (&webhooks.ElfMachineTemplateValidator{}).SetupWebhookWithManager(mgr); err != nil { + return err + } + + if err := (&webhooks.ElfMachineValidator{ + Client: mgr.GetClient(), + }).SetupWebhookWithManager(mgr); err != nil { + return err + } + if err := (&webhooks.ElfMachineMutation{ Client: mgr.GetClient(), Logger: mgr.GetLogger().WithName("ElfMachineMutation"), @@ -220,6 +234,10 @@ func main() { return err } + if err := controllers.AddMachineTemplateControllerToManager(ctx, ctrlMgrCtx, mgr, controller.Options{MaxConcurrentReconciles: elfMachineTemplateConcurrency}); err != nil { + return err + } + return nil } diff --git a/pkg/context/machine_template_context.go b/pkg/context/machine_template_context.go new file mode 100644 index 00000000..98daf60b --- /dev/null +++ b/pkg/context/machine_template_context.go @@ -0,0 +1,39 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package context + +import ( + "fmt" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/service" +) + +// MachineTemplateContext is a Go context used with an ElfMachineTemplate. +type MachineTemplateContext struct { + Cluster *clusterv1.Cluster + ElfCluster *infrav1.ElfCluster + ElfMachineTemplate *infrav1.ElfMachineTemplate + VMService service.VMService +} + +// String returns ElfMachineTemplateGroupVersionKindElfMachineTemplateNamespace/ElfMachineTemplateName. +func (c *MachineTemplateContext) String() string { + return fmt.Sprintf("%s %s/%s", c.ElfMachineTemplate.GroupVersionKind(), c.ElfMachineTemplate.Namespace, c.ElfMachineTemplate.Name) +} diff --git a/pkg/hostagent/service.go b/pkg/hostagent/service.go new file mode 100644 index 00000000..2518f230 --- /dev/null +++ b/pkg/hostagent/service.go @@ -0,0 +1,74 @@ +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package hostagent + +import ( + goctx "context" + "fmt" + "time" + + agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + apitypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" + "github.com/smartxworks/cluster-api-provider-elf/pkg/hostagent/tasks" +) + +const defaultTimeout = 1 * time.Minute + +func GetHostJob(ctx goctx.Context, c client.Client, namespace, name string) (*agentv1.HostOperationJob, error) { + var restartKubeletJob agentv1.HostOperationJob + if err := c.Get(ctx, apitypes.NamespacedName{ + Name: name, + Namespace: "default", + }, &restartKubeletJob); err != nil { + return nil, err + } + + return &restartKubeletJob, nil +} + +// GetExpandRootPartitionJobName return the expand root partition job name. +// The same disk expansion uses the same job name to reduce duplicate jobs. +func GetExpandRootPartitionJobName(elfMachine *infrav1.ElfMachine) string { + return fmt.Sprintf("cape-expand-root-partition-%s-%d", elfMachine.Name, elfMachine.Spec.DiskGiB) +} + +func ExpandRootPartition(ctx goctx.Context, c client.Client, elfMachine *infrav1.ElfMachine) (*agentv1.HostOperationJob, error) { + agentJob := &agentv1.HostOperationJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: GetExpandRootPartitionJobName(elfMachine), + Namespace: "default", + }, + Spec: agentv1.HostOperationJobSpec{ + NodeName: elfMachine.Name, + Operation: agentv1.Operation{ + Ansible: &agentv1.Ansible{ + LocalPlaybookText: &agentv1.YAMLText{ + Inline: tasks.ExpandRootPartitionTask, + }, + }, + Timeout: metav1.Duration{Duration: defaultTimeout}, + }, + }, + } + + if err := c.Create(ctx, agentJob); err != nil { + return nil, err + } + + return agentJob, nil +} diff --git a/pkg/hostagent/tasks/expand_root_partition.yaml b/pkg/hostagent/tasks/expand_root_partition.yaml new file mode 100644 index 00000000..5b500f8f --- /dev/null +++ b/pkg/hostagent/tasks/expand_root_partition.yaml @@ -0,0 +1,39 @@ +--- +- name: Expand root partition + hosts: all + become: true + gather_facts: false + tasks: + - name: Get root path + shell: | + . /etc/os-release + rootpath="/dev/mapper/rl-root" + if [[ $ID == 'openEuler' ]]; then + rootpath="/dev/mapper/openeuler-root" + fi + echo $rootpath + register: rootpath + - name: Grow vda2 + shell: | + result=$(growpart /dev/vda 2) + if [[ $? == 0 ]]; then + echo "$result" + elif [[ $result == NOCHANGE* ]]; then + echo "$result" + else + echo "$result" + exit 1 + fi + - name: Resize vda2 + shell: pvresize /dev/vda2 + - name: Extend root + shell: | + result=$(lvextend -r -l+100%FREE -n {{ rootpath.stdout }} 2>&1) + if [[ $? == 0 ]]; then + echo "$result" + elif [[ $result == *'matches existing size'* ]]; then + echo "$result" + else + echo "$result" + exit 1 + fi diff --git a/pkg/hostagent/tasks/tasks.go b/pkg/hostagent/tasks/tasks.go new file mode 100644 index 00000000..8a38d3df --- /dev/null +++ b/pkg/hostagent/tasks/tasks.go @@ -0,0 +1,23 @@ +/* +Copyright 2024. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tasks + +import ( + _ "embed" +) + +// ExpandRootPartitionTask is the task to add new disk capacity to root. +// +//go:embed expand_root_partition.yaml +var ExpandRootPartitionTask string diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 4531d0b0..856452d1 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -20,6 +20,7 @@ import ( goctx "context" "github.com/pkg/errors" + agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" cgscheme "k8s.io/client-go/kubernetes/scheme" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" @@ -48,6 +49,7 @@ func New(ctx goctx.Context, opts Options) (Manager, error) { _ = infrav1.AddToScheme(opts.Scheme) _ = bootstrapv1.AddToScheme(opts.Scheme) _ = controlplanev1.AddToScheme(opts.Scheme) + _ = agentv1.AddToScheme(opts.Scheme) // +kubebuilder:scaffold:scheme // Build the controller manager. diff --git a/pkg/service/errors.go b/pkg/service/errors.go index f783cf3b..d69ba8fc 100644 --- a/pkg/service/errors.go +++ b/pkg/service/errors.go @@ -27,6 +27,7 @@ const ( HostNotFound = "HOST_NOT_FOUND" VMTemplateNotFound = "VM_TEMPLATE_NOT_FOUND" VMNotFound = "VM_NOT_FOUND" + VMVolumeNotFound = "VM_VOLUME_NOT_FOUND" VMGPUInfoNotFound = "VM_GPU_INFO_NOT_FOUND" VMDuplicate = "VM_DUPLICATE" TaskNotFound = "TASK_NOT_FOUND" @@ -62,6 +63,10 @@ func IsShutDownTimeout(message string) bool { return strings.Contains(message, "JOB_VM_SHUTDOWN_TIMEOUT") } +func IsVMVolumeNotFound(err error) bool { + return strings.Contains(err.Error(), VMVolumeNotFound) +} + func IsGPUAssignFailed(message string) bool { return strings.Contains(message, GPUAssignFailed) } diff --git a/pkg/service/mock_services/vm_mock.go b/pkg/service/mock_services/vm_mock.go index 057af947..3e6fa765 100644 --- a/pkg/service/mock_services/vm_mock.go +++ b/pkg/service/mock_services/vm_mock.go @@ -338,6 +338,21 @@ func (mr *MockVMServiceMockRecorder) GetTask(id interface{}) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetTask", reflect.TypeOf((*MockVMService)(nil).GetTask), id) } +// GetVMDisks mocks base method. +func (m *MockVMService) GetVMDisks(vmDiskIDs []string) ([]*models.VMDisk, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetVMDisks", vmDiskIDs) + ret0, _ := ret[0].([]*models.VMDisk) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetVMDisks indicates an expected call of GetVMDisks. +func (mr *MockVMServiceMockRecorder) GetVMDisks(vmDiskIDs interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetVMDisks", reflect.TypeOf((*MockVMService)(nil).GetVMDisks), vmDiskIDs) +} + // GetVMGPUAllocationInfo mocks base method. func (m *MockVMService) GetVMGPUAllocationInfo(id string) (*models.VMGpuInfo, error) { m.ctrl.T.Helper() @@ -398,6 +413,21 @@ func (mr *MockVMServiceMockRecorder) GetVMTemplate(template interface{}) *gomock return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetVMTemplate", reflect.TypeOf((*MockVMService)(nil).GetVMTemplate), template) } +// GetVMVolume mocks base method. +func (m *MockVMService) GetVMVolume(vmVolumeID string) (*models.VMVolume, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetVMVolume", vmVolumeID) + ret0, _ := ret[0].(*models.VMVolume) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetVMVolume indicates an expected call of GetVMVolume. 
+func (mr *MockVMServiceMockRecorder) GetVMVolume(vmVolumeID interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetVMVolume", reflect.TypeOf((*MockVMService)(nil).GetVMVolume), vmVolumeID) +} + // GetVlan mocks base method. func (m *MockVMService) GetVlan(id string) (*models.Vlan, error) { m.ctrl.T.Helper() @@ -473,6 +503,21 @@ func (mr *MockVMServiceMockRecorder) RemoveGPUDevices(id, gpus interface{}) *gom return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveGPUDevices", reflect.TypeOf((*MockVMService)(nil).RemoveGPUDevices), id, gpus) } +// ResizeVMVolume mocks base method. +func (m *MockVMService) ResizeVMVolume(vmVolumeID string, size int64) (*models.WithTaskVMVolume, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ResizeVMVolume", vmVolumeID, size) + ret0, _ := ret[0].(*models.WithTaskVMVolume) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ResizeVMVolume indicates an expected call of ResizeVMVolume. +func (mr *MockVMServiceMockRecorder) ResizeVMVolume(vmVolumeID, size interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ResizeVMVolume", reflect.TypeOf((*MockVMService)(nil).ResizeVMVolume), vmVolumeID, size) +} + // ShutDown mocks base method. func (m *MockVMService) ShutDown(uuid string) (*models.Task, error) { m.ctrl.T.Helper() diff --git a/pkg/service/util.go b/pkg/service/util.go index 7d92da62..6eed2517 100644 --- a/pkg/service/util.go +++ b/pkg/service/util.go @@ -110,6 +110,10 @@ func TowerInt32(v int) *int32 { return &val } +func TowerInt64(v int64) *int64 { + return &v +} + func TowerFloat64(v int) *float64 { val := float64(v) @@ -146,6 +150,10 @@ func TowerCPUSockets(vCPU, cpuCores int32) *int32 { return &cpuSockets } +func ByteToGiB(bytes int64) int32 { + return int32(bytes / 1024 / 1024 / 1024) +} + func IsVMInRecycleBin(vm *models.VM) bool { return vm.InRecycleBin != nil && *vm.InRecycleBin } @@ -194,6 +202,10 @@ func IsUpdateVMTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "Edit VM") } +func IsUpdateVMDiskTask(task *models.Task, vmName string) bool { + return GetTowerString(task.Description) == fmt.Sprintf("Edit VM %s disk", vmName) +} + func IsVMColdMigrationTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "performing a cold migration") } @@ -206,6 +218,12 @@ func IsPlacementGroupTask(task *models.Task) bool { return strings.Contains(GetTowerString(task.Description), "VM placement group") // Update VM placement group } +// IsTowerResourcePerformingAnOperation returns whether the Tower resource is being operated on. +// Before operating on Tower resources, call this function first to avoid Tower resource lock conflicts. +func IsTowerResourcePerformingAnOperation(entityAsyncStatus *models.EntityAsyncStatus) bool { + return entityAsyncStatus != nil +} + // HasGPUsCanNotBeUsedForVM returns whether the specified GPUs contains GPU // that cannot be used by the specified VM. func HasGPUsCanNotBeUsedForVM(gpuVMInfos GPUVMInfos, elfMachine *infrav1.ElfMachine) bool { @@ -300,3 +318,21 @@ func parseOwnerFromCreatedByAnnotation(createdBy string) string { // last `@` replaced with `_`. return fmt.Sprintf("%s_%s", username, authConfigID) } + +// GetVMSystemDisk selects and returns the system disk from the disks mounted on +// the virtual machine. +// By default, the disk with the smallest boot value is the system disk. 
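+// If several disks share the smallest boot value, the first such disk in the slice is returned.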
+func GetVMSystemDisk(disks []*models.VMDisk) *models.VMDisk { + if len(disks) == 0 { + return nil + } + + systemDisk := disks[0] + for i := 0; i < len(disks); i++ { + if *disks[i].Boot < *systemDisk.Boot { + systemDisk = disks[i] + } + } + + return systemDisk +} diff --git a/pkg/service/util_test.go b/pkg/service/util_test.go index f00de112..3194f531 100644 --- a/pkg/service/util_test.go +++ b/pkg/service/util_test.go @@ -299,3 +299,20 @@ func TestParseOwnerFromCreatedByAnnotation(t *testing.T) { g.Expect(parseOwnerFromCreatedByAnnotation("root@d8dc20fc-e197-41da-83b6-c903c88663fd@")).To(gomega.Equal("root@d8dc20fc-e197-41da-83b6-c903c88663fd@")) }) } + +func TestGetVMSystemDisk(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + disk10 := &models.VMDisk{Boot: TowerInt32(1), ID: TowerString("10")} + disk11 := &models.VMDisk{Boot: TowerInt32(1), ID: TowerString("11")} + disk20 := &models.VMDisk{Boot: TowerInt32(2), ID: TowerString("20")} + + t.Run("GetVMSystemDisk", func(t *testing.T) { + g.Expect(GetVMSystemDisk(nil)).To(gomega.BeNil()) + g.Expect(GetVMSystemDisk([]*models.VMDisk{disk10})).To(gomega.Equal(disk10)) + g.Expect(GetVMSystemDisk([]*models.VMDisk{disk10, disk11})).To(gomega.Equal(disk10)) + g.Expect(GetVMSystemDisk([]*models.VMDisk{disk11, disk10})).To(gomega.Equal(disk11)) + g.Expect(GetVMSystemDisk([]*models.VMDisk{disk20, disk10})).To(gomega.Equal(disk10)) + g.Expect(GetVMSystemDisk([]*models.VMDisk{disk20, disk11, disk10})).To(gomega.Equal(disk11)) + }) +} diff --git a/pkg/service/vm.go b/pkg/service/vm.go index fd3cf82c..bb93fa30 100644 --- a/pkg/service/vm.go +++ b/pkg/service/vm.go @@ -31,8 +31,10 @@ import ( clienttask "github.com/smartxworks/cloudtower-go-sdk/v2/client/task" clientvlan "github.com/smartxworks/cloudtower-go-sdk/v2/client/vlan" clientvm "github.com/smartxworks/cloudtower-go-sdk/v2/client/vm" + clientvmdisk "github.com/smartxworks/cloudtower-go-sdk/v2/client/vm_disk" clientvmnic "github.com/smartxworks/cloudtower-go-sdk/v2/client/vm_nic" clientvmplacementgroup "github.com/smartxworks/cloudtower-go-sdk/v2/client/vm_placement_group" + clientvmvolume "github.com/smartxworks/cloudtower-go-sdk/v2/client/vm_volume" "github.com/smartxworks/cloudtower-go-sdk/v2/models" "k8s.io/apimachinery/pkg/util/wait" @@ -58,6 +60,9 @@ type VMService interface { FindByIDs(ids []string) ([]*models.VM, error) FindVMsByName(name string) ([]*models.VM, error) GetVMNics(vmID string) ([]*models.VMNic, error) + GetVMDisks(vmDiskIDs []string) ([]*models.VMDisk, error) + GetVMVolume(vmVolumeID string) (*models.VMVolume, error) + ResizeVMVolume(vmVolumeID string, size int64) (*models.WithTaskVMVolume, error) GetVMTemplate(template string) (*models.ContentLibraryVMTemplate, error) GetTask(id string) (*models.Task, error) WaitTask(ctx goctx.Context, id string, timeout, interval time.Duration) (*models.Task, error) @@ -118,6 +123,56 @@ func (svr *TowerVMService) UpdateVM(vm *models.VM, elfMachine *infrav1.ElfMachin return updateVMResp.Payload[0], nil } +func (svr *TowerVMService) GetVMDisks(vmDiskIDs []string) ([]*models.VMDisk, error) { + getVMDisksParams := clientvmdisk.NewGetVMDisksParams() + getVMDisksParams.RequestBody = &models.GetVMDisksRequestBody{ + Where: &models.VMDiskWhereInput{IDIn: vmDiskIDs}, + OrderBy: models.NewVMDiskOrderByInput(models.VMDiskOrderByInputBootASC), + } + + getVMDisksResp, err := svr.Session.VMDisk.GetVMDisks(getVMDisksParams) + if err != nil { + return nil, err + } + + return getVMDisksResp.Payload, nil +} + +func (svr *TowerVMService) GetVMVolume(volumeID 
string) (*models.VMVolume, error) { + getVMVolumesParams := clientvmvolume.NewGetVMVolumesParams() + getVMVolumesParams.RequestBody = &models.GetVMVolumesRequestBody{ + Where: &models.VMVolumeWhereInput{ID: TowerString(volumeID)}, + } + + getVMVolumesResp, err := svr.Session.VMVolume.GetVMVolumes(getVMVolumesParams) + if err != nil { + return nil, err + } + + if len(getVMVolumesResp.Payload) == 0 { + return nil, errors.New(VMVolumeNotFound) + } + + return getVMVolumesResp.Payload[0], nil +} + +// ResizeVMVolume resizes the virtual machine volume to the specified size. +// Can only increase the volume size, not reduce it. +func (svr *TowerVMService) ResizeVMVolume(vmVolumeID string, size int64) (*models.WithTaskVMVolume, error) { + updateVMVolumeParams := clientvmvolume.NewUpdateVMVolumeParams() + updateVMVolumeParams.RequestBody = &models.UpdateVMVolumeParams{ + Data: &models.UpdateVMVolumeParamsData{Size: TowerInt64(size)}, + Where: &models.VMVolumeWhereInput{ID: TowerString(vmVolumeID)}, + } + + updateVMVolumeResp, err := svr.Session.VMVolume.UpdateVMVolume(updateVMVolumeParams) + if err != nil { + return nil, err + } + + return updateVMVolumeResp.Payload[0], nil +} + // Clone kicks off a clone operation on Elf to create a new virtual machine using VM template. func (svr *TowerVMService) Clone( elfCluster *infrav1.ElfCluster, elfMachine *infrav1.ElfMachine, bootstrapData, @@ -150,21 +205,6 @@ func (svr *TowerVMService) Clone( ha = TowerBool(false) } - var mountDisks []*models.MountNewCreateDisksParams - if elfMachine.Spec.DiskGiB > 0 { - storagePolicy := models.VMVolumeElfStoragePolicyTypeREPLICA2THINPROVISION - bus := models.BusVIRTIO - mountDisks = append(mountDisks, &models.MountNewCreateDisksParams{ - Boot: TowerInt32(0), - Bus: &bus, - VMVolume: &models.MountNewCreateDisksParamsVMVolume{ - ElfStoragePolicy: &storagePolicy, - Name: TowerString(config.VMDiskName), - Size: TowerDisk(elfMachine.Spec.DiskGiB), - }, - }) - } - nics := make([]*models.VMNicParams, 0, len(elfMachine.Spec.Network.Devices)) networks := make([]*models.CloudInitNetWork, 0, len(elfMachine.Spec.Network.Devices)) for i := 0; i < len(elfMachine.Spec.Network.Devices); i++ { @@ -269,11 +309,7 @@ func (svr *TowerVMService) Clone( TemplateID: template.ID, GuestOsType: models.NewVMGuestsOperationSystem(models.VMGuestsOperationSystem(elfMachine.Spec.OSType)), VMNics: nics, - DiskOperate: &models.VMDiskOperate{ - NewDisks: &models.VMDiskParams{ - MountNewCreateDisks: mountDisks, - }, - }, + DiskOperate: &models.VMDiskOperate{}, CloudInit: &models.TemplateCloudInit{ Hostname: TowerString(elfMachine.Name), UserData: TowerString(bootstrapData), diff --git a/pkg/util/annotations/helpers.go b/pkg/util/annotations/helpers.go index 2534140f..9007c1fc 100644 --- a/pkg/util/annotations/helpers.go +++ b/pkg/util/annotations/helpers.go @@ -18,6 +18,7 @@ package annotations import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/annotations" infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" @@ -52,7 +53,26 @@ func GetCreatedBy(o metav1.Object) string { return annotations[infrav1.CreatedByAnnotation] } +func GetTemplateClonedFromName(o metav1.Object) string { + annotations := o.GetAnnotations() + if annotations == nil { + return "" + } + + return annotations[clusterv1.TemplateClonedFromNameAnnotation] +} + // AddAnnotations sets the desired annotations on the object and returns true if the annotations have changed. 
func AddAnnotations(o metav1.Object, desired map[string]string) bool { return annotations.AddAnnotations(o, desired) } + +// RemoveAnnotation deletes the desired annotation on the object. +func RemoveAnnotation(o metav1.Object, annotation string) { + annotations := o.GetAnnotations() + if annotations == nil { + return + } + delete(annotations, annotation) + o.SetAnnotations(annotations) +} diff --git a/pkg/util/machine/machine.go b/pkg/util/machine/machine.go index e4f380d4..53772b88 100644 --- a/pkg/util/machine/machine.go +++ b/pkg/util/machine/machine.go @@ -23,6 +23,7 @@ import ( "strings" "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" @@ -89,6 +90,47 @@ func GetControlPlaneElfMachinesInCluster(ctx goctx.Context, ctrlClient client.Cl return machines, nil } +func GetElfMachinesForMD( + ctx goctx.Context, + ctrlClient client.Client, + cluster *clusterv1.Cluster, + md *clusterv1.MachineDeployment) ([]*infrav1.ElfMachine, error) { + elfMachineList := &infrav1.ElfMachineList{} + labels := map[string]string{ + clusterv1.ClusterNameLabel: cluster.Name, + clusterv1.MachineDeploymentNameLabel: md.Name, + } + if err := ctrlClient.List(ctx, elfMachineList, client.InNamespace(md.Namespace), client.MatchingLabels(labels)); err != nil { + return nil, err + } + + elfMachines := make([]*infrav1.ElfMachine, len(elfMachineList.Items)) + for i := range elfMachineList.Items { + elfMachines[i] = &elfMachineList.Items[i] + } + + return elfMachines, nil +} + +func GetControlPlaneMachinesForCluster(ctx goctx.Context, ctrlClient client.Client, cluster *clusterv1.Cluster) ([]*clusterv1.Machine, error) { + ms := &clusterv1.MachineList{} + labels := map[string]string{ + clusterv1.ClusterNameLabel: cluster.Name, + clusterv1.MachineControlPlaneLabel: "", + } + + if err := ctrlClient.List(ctx, ms, client.InNamespace(cluster.Namespace), client.MatchingLabels(labels)); err != nil { + return nil, err + } + + machines := make([]*clusterv1.Machine, len(ms.Items)) + for i := range ms.Items { + machines[i] = &ms.Items[i] + } + + return machines, nil +} + // IsControlPlaneMachine returns true if the provided resource is // a member of the control plane. 
func IsControlPlaneMachine(machine metav1.Object) bool { @@ -129,6 +171,40 @@ func IsMachineFailed(machine *clusterv1.Machine) bool { return machine.Status.FailureReason != nil || machine.Status.FailureMessage != nil } +func IsUpdatingElfMachineResources(elfMachine *infrav1.ElfMachine) bool { + condition := conditions.Get(elfMachine, infrav1.ResourcesHotUpdatedCondition) + if condition != nil && + condition.Status == corev1.ConditionFalse { + if condition.Reason == infrav1.WaitingForResourcesHotUpdateReason && condition.Message != "" { + return false + } + + return true + } + + return false +} + +func IsResourcesUpToDate(elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachine *infrav1.ElfMachine) bool { + return elfMachineTemplate.Spec.Template.Spec.DiskGiB <= elfMachine.Spec.DiskGiB +} + +func NeedUpdateElfMachineResources(elfMachineTemplate *infrav1.ElfMachineTemplate, elfMachine *infrav1.ElfMachine) bool { + if !IsResourcesUpToDate(elfMachineTemplate, elfMachine) { + return true + } + + condition := conditions.Get(elfMachine, infrav1.ResourcesHotUpdatedCondition) + if condition != nil && + condition.Status == corev1.ConditionFalse { + if condition.Reason == infrav1.WaitingForResourcesHotUpdateReason && condition.Message != "" { + return true + } + } + + return false +} + func ConvertProviderIDToUUID(providerID *string) string { if providerID == nil || *providerID == "" { return "" diff --git a/pkg/util/machine/machine_test.go b/pkg/util/machine/machine_test.go index 9fd6b88f..1a338fa7 100644 --- a/pkg/util/machine/machine_test.go +++ b/pkg/util/machine/machine_test.go @@ -22,6 +22,8 @@ import ( "testing" "github.com/onsi/gomega" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/util/conditions" infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" "github.com/smartxworks/cluster-api-provider-elf/test/fake" @@ -231,6 +233,81 @@ func TestGetNetworkStatus(t *testing.T) { } } +func TestGetElfMachinesForMD(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + md := fake.NewMD() + elfCluster, cluster, elfMachine, _, _ := fake.NewClusterAndMachineObjects() + fake.ToWorkerMachine(elfMachine, md) + elfMachine2, _ := fake.NewMachineObjects(elfCluster, cluster) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, elfMachine, elfMachine2) + elfMachines, err := GetElfMachinesForMD(goctx.TODO(), ctrlMgrCtx.Client, cluster, md) + g.Expect(err).NotTo(gomega.HaveOccurred()) + g.Expect(elfMachines).To(gomega.HaveLen(1)) + g.Expect(elfMachines[0].Name).To(gomega.Equal(elfMachine.Name)) +} + +func TestGetControlPlaneMachinesForCluster(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + kcp := fake.NewKCP() + elfCluster, cluster, _, machine, _ := fake.NewClusterAndMachineObjects() + fake.ToControlPlaneMachine(machine, kcp) + _, machine2 := fake.NewMachineObjects(elfCluster, cluster) + ctrlMgrCtx := fake.NewControllerManagerContext(elfCluster, cluster, machine, machine2) + machines, err := GetControlPlaneMachinesForCluster(goctx.TODO(), ctrlMgrCtx.Client, cluster) + g.Expect(err).NotTo(gomega.HaveOccurred()) + g.Expect(machines).To(gomega.HaveLen(1)) + g.Expect(machines[0].Name).To(gomega.Equal(machine.Name)) +} + +func TestIsUpdatingElfMachineResources(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + elfCluster, cluster := fake.NewClusterObjects() + emt := fake.NewElfMachineTemplate() + elfMachine, _ := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) + 
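// With no ResourcesHotUpdated condition set yet, the ElfMachine is not considered to be hot updating resources. +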
g.Expect(IsUpdatingElfMachineResources(elfMachine)).To(gomega.BeFalse()) + + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") + g.Expect(IsUpdatingElfMachineResources(elfMachine)).To(gomega.BeTrue()) + + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx") + g.Expect(IsUpdatingElfMachineResources(elfMachine)).To(gomega.BeFalse()) +} + +func TestNeedUpdateElfMachineResources(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + elfCluster, cluster := fake.NewClusterObjects() + emt := fake.NewElfMachineTemplate() + elfMachine, _ := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) + g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeFalse()) + + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "") + g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeFalse()) + + conditions.MarkFalse(elfMachine, infrav1.ResourcesHotUpdatedCondition, infrav1.WaitingForResourcesHotUpdateReason, clusterv1.ConditionSeverityInfo, "xx") + g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeTrue()) + + elfMachine.Spec.DiskGiB -= 1 + g.Expect(NeedUpdateElfMachineResources(emt, elfMachine)).To(gomega.BeTrue()) +} + +func TestIsResourcesUpToDate(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + elfCluster, cluster := fake.NewClusterObjects() + emt := fake.NewElfMachineTemplate() + elfMachine, _ := fake.NewMachineObjects(elfCluster, cluster) + fake.SetElfMachineTemplateForElfMachine(elfMachine, emt) + g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeTrue()) + elfMachine.Spec.DiskGiB -= 1 + g.Expect(IsResourcesUpToDate(emt, elfMachine)).To(gomega.BeFalse()) +} + func toString(s string) *string { return &s } diff --git a/pkg/util/machine/md.go b/pkg/util/machine/md.go index 8cba6d68..c41879cf 100644 --- a/pkg/util/machine/md.go +++ b/pkg/util/machine/md.go @@ -34,3 +34,25 @@ func GetMDByMachine(ctx goctx.Context, ctrlClient client.Client, machine *cluste return &md, nil } + +func GetMDsForCluster( + ctx goctx.Context, + ctrlClient client.Client, + namespace, clusterName string) ([]*clusterv1.MachineDeployment, error) { + var mdList clusterv1.MachineDeploymentList + labels := map[string]string{clusterv1.ClusterNameLabel: clusterName} + + if err := ctrlClient.List( + ctx, &mdList, + client.InNamespace(namespace), + client.MatchingLabels(labels)); err != nil { + return nil, err + } + + mds := make([]*clusterv1.MachineDeployment, len(mdList.Items)) + for i := range mdList.Items { + mds[i] = &mdList.Items[i] + } + + return mds, nil +} diff --git a/pkg/util/machine/md_test.go b/pkg/util/machine/md_test.go index 123195a6..f0bfb34e 100644 --- a/pkg/util/machine/md_test.go +++ b/pkg/util/machine/md_test.go @@ -21,6 +21,7 @@ import ( "testing" "github.com/onsi/gomega" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "github.com/smartxworks/cluster-api-provider-elf/test/fake" ) @@ -40,3 +41,20 @@ func TestGetMDByMachine(t *testing.T) { g.Expect(md.Name).To(gomega.Equal(machineDeployment.Name)) }) } + +func TestGetMDsForCluster(t *testing.T) { + g := gomega.NewGomegaWithT(t) + ctx := goctx.TODO() + _, cluster := fake.NewClusterObjects() + md1 := fake.NewMD() + md1.Labels = map[string]string{clusterv1.ClusterNameLabel: 
cluster.Name} + md2 := fake.NewMD() + ctrlMgrCtx := fake.NewControllerManagerContext(md1, md2) + + t.Run("should return mds", func(t *testing.T) { + mds, err := GetMDsForCluster(ctx, ctrlMgrCtx.Client, cluster.Namespace, cluster.Name) + g.Expect(err).ToNot(gomega.HaveOccurred()) + g.Expect(mds).To(gomega.HaveLen(1)) + g.Expect(mds[0].Name).To(gomega.Equal(md1.Name)) + }) +} diff --git a/pkg/util/md/md.go b/pkg/util/md/md.go new file mode 100644 index 00000000..6d40bf2e --- /dev/null +++ b/pkg/util/md/md.go @@ -0,0 +1,95 @@ +package md + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +import ( + intstrutil "k8s.io/apimachinery/pkg/util/intstr" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +// IsMDInRollingUpdate returns whether MD is in rolling update. +// +// When *md.Spec.Replicas > md.Status.UpdatedReplicas, it must be in a MD rolling update process. +// When *md.Spec.Replicas == md.Status.UpdatedReplicas, it could be in one of the following cases: +// 1. It's not in a MD rolling update process. So md.Spec.Replicas == md.Status.Replicas. +// 2. It's at the end of a MD rolling update process, and the last MD replica (i.e the last MD ElfMachine) is created just now. +// There is still an old MD ElfMachine, so md.Spec.Replicas + 1 == md.Status.Replicas. +func IsMDInRollingUpdate(md *clusterv1.MachineDeployment) bool { + if (*md.Spec.Replicas > md.Status.UpdatedReplicas && *md.Spec.Replicas <= md.Status.Replicas) || + (*md.Spec.Replicas == md.Status.UpdatedReplicas && *md.Spec.Replicas < md.Status.Replicas) { + return true + } + + return false +} + +/* +Copy from CAPI: https://github.com/kubernetes-sigs/cluster-api/blob/release-1.5/internal/controllers/machinedeployment/mdutil/util.go +*/ + +// MaxUnavailable returns the maximum unavailable machines a rolling deployment can take. +func MaxUnavailable(deployment clusterv1.MachineDeployment) int32 { + if deployment.Spec.Strategy.Type != clusterv1.RollingUpdateMachineDeploymentStrategyType || *(deployment.Spec.Replicas) == 0 { + return int32(0) + } + // Error caught by validation + _, maxUnavailable, _ := ResolveFenceposts(deployment.Spec.Strategy.RollingUpdate.MaxSurge, deployment.Spec.Strategy.RollingUpdate.MaxUnavailable, *(deployment.Spec.Replicas)) + if maxUnavailable > *deployment.Spec.Replicas { + return *deployment.Spec.Replicas + } + return maxUnavailable +} + +// MaxSurge returns the maximum surge machines a rolling deployment can take. +func MaxSurge(deployment clusterv1.MachineDeployment) int32 { + if deployment.Spec.Strategy.Type != clusterv1.RollingUpdateMachineDeploymentStrategyType { + return int32(0) + } + // Error caught by validation + maxSurge, _, _ := ResolveFenceposts(deployment.Spec.Strategy.RollingUpdate.MaxSurge, deployment.Spec.Strategy.RollingUpdate.MaxUnavailable, *(deployment.Spec.Replicas)) + return maxSurge +} + +// ResolveFenceposts resolves both maxSurge and maxUnavailable. This needs to happen in one +// step. 
For example: +// +// 2 desired, max unavailable 1%, surge 0% - should scale old(-1), then new(+1), then old(-1), then new(+1) +// 1 desired, max unavailable 1%, surge 0% - should scale old(-1), then new(+1) +// 2 desired, max unavailable 25%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1) +// 1 desired, max unavailable 25%, surge 1% - should scale new(+1), then old(-1) +// 2 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1), then new(+1), then old(-1) +// 1 desired, max unavailable 0%, surge 1% - should scale new(+1), then old(-1). +func ResolveFenceposts(maxSurge, maxUnavailable *intstrutil.IntOrString, desired int32) (int32, int32, error) { + surge, err := intstrutil.GetScaledValueFromIntOrPercent(maxSurge, int(desired), true) + if err != nil { + return 0, 0, err + } + unavailable, err := intstrutil.GetScaledValueFromIntOrPercent(maxUnavailable, int(desired), false) + if err != nil { + return 0, 0, err + } + + if surge == 0 && unavailable == 0 { + // Validation should never allow the user to explicitly use zero values for both maxSurge + // maxUnavailable. Due to rounding down maxUnavailable though, it may resolve to zero. + // If both fenceposts resolve to zero, then we should set maxUnavailable to 1 on the + // theory that surge might not work due to quota. + unavailable = 1 + } + + return int32(surge), int32(unavailable), nil +} diff --git a/pkg/util/md/md_test.go b/pkg/util/md/md_test.go new file mode 100644 index 00000000..674bbbeb --- /dev/null +++ b/pkg/util/md/md_test.go @@ -0,0 +1,254 @@ +package md + +import ( + "fmt" + "testing" + + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/pointer" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +func TestMaxUnavailable(t *testing.T) { + deployment := func(replicas int32, maxUnavailable intstr.IntOrString) clusterv1.MachineDeployment { + return clusterv1.MachineDeployment{ + Spec: clusterv1.MachineDeploymentSpec{ + Replicas: func(i int32) *int32 { return &i }(replicas), + Strategy: &clusterv1.MachineDeploymentStrategy{ + RollingUpdate: &clusterv1.MachineRollingUpdateDeployment{ + MaxSurge: func(i int) *intstr.IntOrString { x := intstr.FromInt(i); return &x }(int(1)), + MaxUnavailable: &maxUnavailable, + }, + Type: clusterv1.RollingUpdateMachineDeploymentStrategyType, + }, + }, + } + } + tests := []struct { + name string + deployment clusterv1.MachineDeployment + expected int32 + }{ + { + name: "maxUnavailable less than replicas", + deployment: deployment(10, intstr.FromInt(5)), + expected: int32(5), + }, + { + name: "maxUnavailable equal replicas", + deployment: deployment(10, intstr.FromInt(10)), + expected: int32(10), + }, + { + name: "maxUnavailable greater than replicas", + deployment: deployment(5, intstr.FromInt(10)), + expected: int32(5), + }, + { + name: "maxUnavailable with replicas is 0", + deployment: deployment(0, intstr.FromInt(10)), + expected: int32(0), + }, + { + name: "maxUnavailable less than replicas with percents", + deployment: deployment(10, intstr.FromString("50%")), + expected: int32(5), + }, + { + name: "maxUnavailable equal replicas with percents", + deployment: deployment(10, intstr.FromString("100%")), + expected: int32(10), + }, + { + name: "maxUnavailable greater than replicas with percents", + deployment: deployment(5, intstr.FromString("100%")), + expected: int32(5), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + g := NewWithT(t) + + g.Expect(MaxUnavailable(test.deployment)).To(Equal(test.expected)) + }) + } +} + +func TestResolveFenceposts(t *testing.T) { + tests := []struct { + maxSurge string + maxUnavailable string + desired int32 + expectSurge int32 + expectUnavailable int32 + expectError bool + }{ + { + maxSurge: "0%", + maxUnavailable: "0%", + desired: 0, + expectSurge: 0, + expectUnavailable: 1, + expectError: false, + }, + { + maxSurge: "39%", + maxUnavailable: "39%", + desired: 10, + expectSurge: 4, + expectUnavailable: 3, + expectError: false, + }, + { + maxSurge: "oops", + maxUnavailable: "39%", + desired: 10, + expectSurge: 0, + expectUnavailable: 0, + expectError: true, + }, + { + maxSurge: "55%", + maxUnavailable: "urg", + desired: 10, + expectSurge: 0, + expectUnavailable: 0, + expectError: true, + }, + { + maxSurge: "5", + maxUnavailable: "1", + desired: 7, + expectSurge: 0, + expectUnavailable: 0, + expectError: true, + }, + } + + for _, test := range tests { + t.Run("maxSurge="+test.maxSurge, func(t *testing.T) { + g := NewWithT(t) + + maxSurge := intstr.FromString(test.maxSurge) + maxUnavail := intstr.FromString(test.maxUnavailable) + surge, unavail, err := ResolveFenceposts(&maxSurge, &maxUnavail, test.desired) + if test.expectError { + g.Expect(err).To(HaveOccurred()) + } else { + g.Expect(err).ToNot(HaveOccurred()) + } + g.Expect(surge).To(Equal(test.expectSurge)) + g.Expect(unavail).To(Equal(test.expectUnavailable)) + }) + } +} + +func TestMaxSurge(t *testing.T) { + maxSurge := intstr.FromInt(1) + maxUnavailable := intstr.FromInt(1) + tests := []struct { + strategy *clusterv1.MachineDeploymentStrategy + expectSurge int32 + }{ + { + strategy: &clusterv1.MachineDeploymentStrategy{ + Type: clusterv1.OnDeleteMachineDeploymentStrategyType, + }, + expectSurge: 0, + }, + { + strategy: 
&clusterv1.MachineDeploymentStrategy{ + Type: clusterv1.RollingUpdateMachineDeploymentStrategyType, + RollingUpdate: &clusterv1.MachineRollingUpdateDeployment{ + MaxSurge: &maxSurge, + MaxUnavailable: &maxUnavailable, + }, + }, + expectSurge: 1, + }, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("maxSurge=%d", tc.expectSurge), func(t *testing.T) { + g := NewWithT(t) + + deployment := clusterv1.MachineDeployment{ + Spec: clusterv1.MachineDeploymentSpec{ + Replicas: pointer.Int32(1), + Strategy: tc.strategy, + }, + } + surge := MaxSurge(deployment) + g.Expect(surge).To(Equal(tc.expectSurge)) + }) + } +} + +func TestIsMDInRollingUpdate(t *testing.T) { + tests := []struct { + specReplicas int32 + statusReplicas int32 + updatedReplicas int32 + isUpdated bool + }{ + { + specReplicas: 1, + statusReplicas: 1, + updatedReplicas: 1, + isUpdated: false, + }, + { + specReplicas: 3, + statusReplicas: 3, + updatedReplicas: 3, + isUpdated: false, + }, + { + specReplicas: 3, + statusReplicas: 3, + updatedReplicas: 2, + isUpdated: true, + }, + { + specReplicas: 3, + statusReplicas: 3, + updatedReplicas: 1, + isUpdated: true, + }, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("%v", tc), func(t *testing.T) { + g := NewWithT(t) + + deployment := clusterv1.MachineDeployment{ + Spec: clusterv1.MachineDeploymentSpec{ + Replicas: pointer.Int32(tc.specReplicas), + }, + Status: clusterv1.MachineDeploymentStatus{ + Replicas: tc.statusReplicas, + UpdatedReplicas: tc.updatedReplicas, + }, + } + g.Expect(IsMDInRollingUpdate(&deployment)).To(Equal(tc.isUpdated)) + }) + } +} diff --git a/templates/cluster-template.yaml b/templates/cluster-template.yaml index 3d915720..ed5c502e 100644 --- a/templates/cluster-template.yaml +++ b/templates/cluster-template.yaml @@ -98,6 +98,38 @@ spec: - echo "127.0.0.1 {{ ds.meta_data.hostname }}" >>/etc/hosts - echo "{{ ds.meta_data.hostname }}" >/etc/hostname - /etc/kube-vip-prepare.sh + #! If you need to expand the system disk capacity (based on the capacity of + #! the virtual machine template disk) when creating a node virtual machine, + #! you need to set the following commands. CAPE will add new disk capacity + #! to the system disk through Tower API. These commands will add the new + #! capacity of the system disk to the root partition. + #! - | + #! . /etc/os-release + #! rootpath="/dev/mapper/rl-root" + #! if [[ $ID == 'openEuler' ]]; then + #! rootpath="/dev/mapper/openeuler-root" + #! fi + #! - | + #! result=$(growpart /dev/vda 2) + #! if [[ $? == 0 ]]; then + #! echo "$result" + #! elif [[ $result == NOCHANGE* ]]; then + #! echo "$result" + #! else + #! echo "$result" + #! exit 1 + #! fi + #! - "pvresize /dev/vda2" + #! - | + #! result=$(lvextend -l+100%FREE -n $rootpath 2>&1) + #! if [[ $? == 0 ]]; then + #! echo "$result" + #! elif [[ $result == *'matches existing size'* ]]; then + #! echo "$result" + #! else + #! echo "$result" + #! exit 1 + #! fi useExperimentalRetryJoin: true files: - content: | @@ -247,6 +279,38 @@ spec: - echo "127.0.0.1 localhost" >>/etc/hosts - echo "127.0.0.1 {{ ds.meta_data.hostname }}" >>/etc/hosts - echo "{{ ds.meta_data.hostname }}" >/etc/hostname + #! If you need to expand the system disk capacity (based on the capacity of + #! the virtual machine template disk) when creating a node virtual machine, + #! you need to set the following commands. CAPE will add new disk capacity + #! to the system disk through Tower API. These commands will add the new + #! capacity of the system disk to the root partition. + #! - | + #! . 
/etc/os-release + #! rootpath="/dev/mapper/rl-root" + #! if [[ $ID == 'openEuler' ]]; then + #! rootpath="/dev/mapper/openeuler-root" + #! fi + #! - | + #! result=$(growpart /dev/vda 2) + #! if [[ $? == 0 ]]; then + #! echo "$result" + #! elif [[ $result == NOCHANGE* ]]; then + #! echo "$result" + #! else + #! echo "$result" + #! exit 1 + #! fi + #! - "pvresize /dev/vda2" + #! - | + #! result=$(lvextend -l+100%FREE -n $rootpath 2>&1) + #! if [[ $? == 0 ]]; then + #! echo "$result" + #! elif [[ $result == *'matches existing size'* ]]; then + #! echo "$result" + #! else + #! echo "$result" + #! exit 1 + #! fi --- apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 kind: ElfMachineTemplate diff --git a/test/config/host-agent/kubesmart.smtx.io_hostconfigs.yaml b/test/config/host-agent/kubesmart.smtx.io_hostconfigs.yaml new file mode 100644 index 00000000..a75109b8 --- /dev/null +++ b/test/config/host-agent/kubesmart.smtx.io_hostconfigs.yaml @@ -0,0 +1,153 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.0 + name: hostconfigs.kubesmart.smtx.io +spec: + group: kubesmart.smtx.io + names: + categories: + - kubesmart + kind: HostConfig + listKind: HostConfigList + plural: hostconfigs + shortNames: + - hc + singular: hostconfig + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: the current phase of HostConfig + jsonPath: .status.phase + name: Phase + type: string + - description: the last execution time + jsonPath: .status.lastExecutionTime + name: LastExecutionTime + type: string + - description: Time duration since creation of HostConfig + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: HostConfig is the Schema for the HostConfig API. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + config: + properties: + ansible: + description: Ansible 通过 ansible playbook 完成配置 + properties: + localPlaybookText: + description: LocalPlaybookText 本地的 playbook,单个 yaml 文件, secret + 引用或者 yaml 字符串 + properties: + inline: + description: Inline is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. 
+ type: string + type: object + x-kubernetes-map-type: atomic + type: object + remotePlaybook: + description: RemotePlaybook 在远端的 playbook,单个 .tar.gz 压缩包,内容可以是单个 + yaml 文件,也可以符合 ansible 要求的目录 + properties: + md5sum: + description: MD5sum 压缩包的 MD5,填写了会进行校验,已经下载过的 playbook + 校验通过后跳过重复下载 + type: string + name: + description: Name 要执行的 playbook 文件名,相对于压缩包顶层的位置 + type: string + url: + description: URL playbook 在远端的地址,支持 https + type: string + required: + - name + - url + type: object + values: + description: Values 执行 playbook 的参数,yaml 格式,可以是 secret 引用或者 + yaml 字符串 + properties: + inline: + description: Inline is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + type: object + type: object + timeout: + description: Timeout 执行一次配置的超时时间 + type: string + type: object + nodeName: + type: string + required: + - config + - nodeName + type: object + status: + properties: + failureMessage: + type: string + failureReason: + type: string + lastExecutionTime: + description: LastExecutionTime 最后执行的时间戳 + format: date-time + type: string + phase: + description: Phase 当前状态 + type: string + required: + - phase + type: object + type: object + served: true + storage: true + subresources: + status: {} \ No newline at end of file diff --git a/test/config/host-agent/kubesmart.smtx.io_hostoperationjobs.yaml b/test/config/host-agent/kubesmart.smtx.io_hostoperationjobs.yaml new file mode 100644 index 00000000..037d9db5 --- /dev/null +++ b/test/config/host-agent/kubesmart.smtx.io_hostoperationjobs.yaml @@ -0,0 +1,153 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.12.0 + name: hostoperationjobs.kubesmart.smtx.io +spec: + group: kubesmart.smtx.io + names: + categories: + - kubesmart + kind: HostOperationJob + listKind: HostOperationJobList + plural: hostoperationjobs + shortNames: + - hoj + singular: hostoperationjob + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: the current phase of HostOperationJob + jsonPath: .status.phase + name: Phase + type: string + - description: the last execution time + jsonPath: .status.lastExecutionTime + name: LastExecutionTime + type: string + - description: Time duration since creation of HostOperationJob + jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: HostOperationJob is the Schema for the HostOperationJob API. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + properties: + nodeName: + type: string + operation: + properties: + ansible: + description: Ansible 通过 ansible playbook 完成操作 + properties: + localPlaybookText: + description: LocalPlaybookText 本地的 playbook,单个 yaml 文件, secret + 引用或者 yaml 字符串 + properties: + inline: + description: Inline is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + type: object + remotePlaybook: + description: RemotePlaybook 在远端的 playbook,单个 .tar.gz 压缩包,内容可以是单个 + yaml 文件,也可以符合 ansible 要求的目录 + properties: + md5sum: + description: MD5sum 压缩包的 MD5,填写了会进行校验,已经下载过的 playbook + 校验通过后跳过重复下载 + type: string + name: + description: Name 要执行的 playbook 文件名,相对于压缩包顶层的位置 + type: string + url: + description: URL playbook 在远端的地址,支持 https + type: string + required: + - name + - url + type: object + values: + description: Values 执行 playbook 的参数,yaml 格式,可以是 secret 引用或者 + yaml 字符串 + properties: + inline: + description: Inline is the inline yaml text. + format: yaml + type: string + secretRef: + description: SecretRef specifies the secret which stores + yaml text. + properties: + name: + description: name is unique within a namespace to + reference a secret resource. + type: string + namespace: + description: namespace defines the space within which + the secret name must be unique. + type: string + type: object + x-kubernetes-map-type: atomic + type: object + type: object + timeout: + description: Timeout 执行一次操作的超时时间 + type: string + type: object + required: + - nodeName + - operation + type: object + status: + properties: + failureMessage: + type: string + failureReason: + type: string + lastExecutionTime: + description: LastExecutionTime 最后执行的时间戳 + format: date-time + type: string + phase: + description: Phase 当前阶段 + type: string + required: + - phase + type: object + type: object + served: true + storage: true + subresources: + status: {} \ No newline at end of file diff --git a/test/fake/controller_manager_context.go b/test/fake/controller_manager_context.go index 3b237ca5..8cf45680 100644 --- a/test/fake/controller_manager_context.go +++ b/test/fake/controller_manager_context.go @@ -17,6 +17,7 @@ limitations under the License. 
package fake import ( + agentv1 "github.com/smartxworks/host-config-agent-api/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" cgscheme "k8s.io/client-go/kubernetes/scheme" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -54,6 +55,7 @@ func NewControllerManagerContext(initObjects ...client.Object) *context.Controll _ = clusterv1.AddToScheme(scheme) _ = controlplanev1.AddToScheme(scheme) _ = infrav1.AddToScheme(scheme) + _ = agentv1.AddToScheme(scheme) clientWithObjects := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource( &infrav1.ElfCluster{}, diff --git a/test/fake/tower.go b/test/fake/tower.go index c19b6c1e..4c76575e 100644 --- a/test/fake/tower.go +++ b/test/fake/tower.go @@ -175,3 +175,26 @@ func NewTowerVGPUVMInfo(vGPUCount int32) *models.GpuVMInfo { Model: pointer.String(""), } } + +func NewVMVolume(elfMachine *infrav1.ElfMachine) *models.VMVolume { + return &models.VMVolume{ + ID: pointer.String(ID()), + Name: pointer.String(ID()), + Size: service.TowerDisk(elfMachine.Spec.DiskGiB), + } +} + +func NewWithTaskVMVolume(vmVolume *models.VMVolume, task *models.Task) *models.WithTaskVMVolume { + return &models.WithTaskVMVolume{ + Data: vmVolume, + TaskID: task.ID, + } +} + +func NewVMDisk(vmVolume *models.VMVolume) *models.VMDisk { + return &models.VMDisk{ + ID: pointer.String(ID()), + Boot: pointer.Int32(0), + VMVolume: &models.NestedVMVolume{ID: vmVolume.ID}, + } +} diff --git a/test/fake/types.go b/test/fake/types.go index 5d825f90..a556d5e2 100644 --- a/test/fake/types.go +++ b/test/fake/types.go @@ -48,6 +48,9 @@ const ( // ElfMachineKind is the fake elf machine kind. ElfMachineKind = "ElfMachine" + + // DiskGiB is the default disk size. + DiskGiB = 60 ) func NewClusterObjects() (*infrav1.ElfCluster, *clusterv1.Cluster) { @@ -117,6 +120,7 @@ func NewElfMachine(elfCluster *infrav1.ElfCluster) *infrav1.ElfMachine { NumCPUs: 1, NumCoresPerSocket: 1, MemoryMiB: 1, + DiskGiB: DiskGiB, Network: infrav1.NetworkSpec{ Devices: []infrav1.NetworkDeviceSpec{ { @@ -232,3 +236,27 @@ func ToWorkerMachine(machine metav1.Object, md *clusterv1.MachineDeployment) { machine.SetLabels(labels) } + +func NewElfMachineTemplate() *infrav1.ElfMachineTemplate { + return &infrav1.ElfMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: names.SimpleNameGenerator.GenerateName("elfMachineTemplate-"), + Namespace: Namespace, + }, + Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: DiskGiB, + }, + }, + }, + } +} + +func SetElfMachineTemplateForElfMachine(elfMachine *infrav1.ElfMachine, emt *infrav1.ElfMachineTemplate) { + if elfMachine.Annotations == nil { + elfMachine.Annotations = make(map[string]string) + } + elfMachine.Annotations[clusterv1.TemplateClonedFromNameAnnotation] = emt.Name + elfMachine.Spec = *emt.Spec.Template.Spec.DeepCopy() +} diff --git a/test/helpers/envtest.go b/test/helpers/envtest.go index 09b5795b..aecb1c81 100644 --- a/test/helpers/envtest.go +++ b/test/helpers/envtest.go @@ -98,6 +98,8 @@ func init() { if capiPath := getFilePathToCAPICRDs(root); capiPath != "" { crdPaths = append(crdPaths, capiPath) } + + crdPaths = append(crdPaths, filepath.Join(root, "test", "config", "host-agent")) } // TestEnvironment encapsulates a Kubernetes local test environment. 
@@ -152,6 +154,16 @@ func NewTestEnvironment(ctx goctx.Context) *TestEnvironment {
 		KubeConfig: env.Config,
 	}
 	managerOpts.AddToManager = func(ctx goctx.Context, ctrlMgrCtx *context.ControllerManagerContext, mgr ctrlmgr.Manager) error {
+		if err := (&webhooks.ElfMachineTemplateValidator{}).SetupWebhookWithManager(mgr); err != nil {
+			return err
+		}
+
+		if err := (&webhooks.ElfMachineValidator{
+			Client: mgr.GetClient(),
+		}).SetupWebhookWithManager(mgr); err != nil {
+			return err
+		}
+
 		if err := (&webhooks.ElfMachineMutation{
 			Client: mgr.GetClient(),
 			Logger: mgr.GetLogger().WithName("ElfMachineMutation"),
diff --git a/webhooks/elfmachine_webhook_validation.go b/webhooks/elfmachine_webhook_validation.go
new file mode 100644
index 00000000..959089da
--- /dev/null
+++ b/webhooks/elfmachine_webhook_validation.go
@@ -0,0 +1,104 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhooks
+
+import (
+	goctx "context"
+	"fmt"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/validation/field"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/webhook"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1"
+	annotationsutil "github.com/smartxworks/cluster-api-provider-elf/pkg/util/annotations"
+)
+
+// Error messages.
+const (
+	canOnlyModifiedThroughElfMachineTemplate = "virtual machine resources can only be modified through ElfMachineTemplate %s"
+)
+
+func (v *ElfMachineValidator) SetupWebhookWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewWebhookManagedBy(mgr).
+		For(&infrav1.ElfMachine{}).
+		WithValidator(v).
+		Complete()
+}
+
+//+kubebuilder:webhook:path=/validate-infrastructure-cluster-x-k8s-io-v1beta1-elfmachine,mutating=false,failurePolicy=fail,sideEffects=None,groups=infrastructure.cluster.x-k8s.io,resources=elfmachines,verbs=create;update,versions=v1beta1,name=validation.elfmachine.infrastructure.x-k8s.io,admissionReviewVersions=v1
+
+// ElfMachineValidator implements a validation webhook for ElfMachine.
+type ElfMachineValidator struct {
+	client.Client
+}
+
+var _ webhook.CustomValidator = &ElfMachineValidator{}
+
+// ValidateCreate implements webhook.Validator so a webhook will be registered for the type.
+func (v *ElfMachineValidator) ValidateCreate(ctx goctx.Context, obj runtime.Object) (admission.Warnings, error) {
+	return nil, nil
+}
+
+// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type.
+func (v *ElfMachineValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
+	oldElfMachine, ok := oldObj.(*infrav1.ElfMachine) //nolint:forcetypeassert
+	if !ok {
+		return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachine but got a %T", oldObj))
+	}
+	elfMachine, ok := newObj.(*infrav1.ElfMachine) //nolint:forcetypeassert
+	if !ok {
+		return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachine but got a %T", newObj))
+	}
+
+	var allErrs field.ErrorList
+
+	elfMachineTemplateName := annotationsutil.GetTemplateClonedFromName(elfMachine)
+	if elfMachineTemplateName == "" {
+		if elfMachine.Spec.DiskGiB < oldElfMachine.Spec.DiskGiB {
+			allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, diskCapacityCanOnlyBeExpanded))
+		}
+
+		return nil, aggregateObjErrors(elfMachine.GroupVersionKind().GroupKind(), elfMachine.Name, allErrs)
+	}
+
+	// If the ElfMachine was created from an ElfMachineTemplate, its resources
+	// can only be modified through that ElfMachineTemplate.
+
+	var elfMachineTemplate infrav1.ElfMachineTemplate
+	if err := v.Client.Get(ctx, client.ObjectKey{
+		Namespace: elfMachine.Namespace,
+		Name:      elfMachineTemplateName,
+	}, &elfMachineTemplate); err != nil {
+		return nil, apierrors.NewInternalError(err)
+	}
+
+	if elfMachine.Spec.DiskGiB != elfMachineTemplate.Spec.Template.Spec.DiskGiB {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "diskGiB"), elfMachine.Spec.DiskGiB, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplateName)))
+	}
+
+	return nil, aggregateObjErrors(elfMachine.GroupVersionKind().GroupKind(), elfMachine.Name, allErrs)
+}
+
+// ValidateDelete implements webhook.Validator so a webhook will be registered for the type.
+func (v *ElfMachineValidator) ValidateDelete(ctx goctx.Context, obj runtime.Object) (admission.Warnings, error) {
+	return nil, nil
+}
diff --git a/webhooks/elfmachine_webhook_validation_test.go b/webhooks/elfmachine_webhook_validation_test.go
new file mode 100644
index 00000000..9ffa3217
--- /dev/null
+++ b/webhooks/elfmachine_webhook_validation_test.go
@@ -0,0 +1,136 @@
+/*
+Copyright 2024.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhooks
+
+import (
+	goctx "context"
+	"fmt"
+	"testing"
+
+	.
"github.com/onsi/gomega" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/validation/field" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" +) + +func TestElfMachineValidatorValidateUpdate(t *testing.T) { + g := NewWithT(t) + + var tests []elfMachineTestCase + scheme := newScheme(g) + + elfMachineTemplate := &infrav1.ElfMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + }, + }, + }, + } + + tests = append(tests, elfMachineTestCase{ + Name: "Cannot reduce disk capacity", + OldEM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 2, + }, + }, + EM: &infrav1.ElfMachine{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + }, + }, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpanded), + }, + }) + + tests = append(tests, elfMachineTestCase{ + Name: "Disk cannot be modified directly", + OldEM: nil, + EM: &infrav1.ElfMachine{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + clusterv1.TemplateClonedFromNameAnnotation: elfMachineTemplate.Name, + }, + }, + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 2, + }, + }, + Objs: []client.Object{elfMachineTemplate}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "diskGiB"), 2, fmt.Sprintf(canOnlyModifiedThroughElfMachineTemplate, elfMachineTemplate.Name)), + }, + }) + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + validator := &ElfMachineValidator{ + Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(tc.Objs...).Build(), + } + warnings, err := validator.ValidateUpdate(goctx.Background(), tc.OldEM, tc.EM) + g.Expect(warnings).To(BeEmpty()) + expectElfMachineTestCase(g, tc, err) + }) + } +} + +func newScheme(g Gomega) *runtime.Scheme { + scheme := runtime.NewScheme() + g.Expect(infrav1.AddToScheme(scheme)).To(Succeed()) + + return scheme +} + +func expectElfMachineTestCase(g Gomega, tc elfMachineTestCase, err error) { + if tc.Errs != nil { + g.Expect(err).To(HaveOccurred()) + statusErr, ok := err.(*apierrors.StatusError) + g.Expect(ok).To(BeTrue()) + g.Expect(statusErr.ErrStatus.Details.Group).To(Equal(tc.EM.GroupVersionKind().Group)) + g.Expect(statusErr.ErrStatus.Details.Kind).To(Equal(tc.EM.GroupVersionKind().Kind)) + g.Expect(statusErr.ErrStatus.Details.Name).To(Equal(tc.EM.Name)) + causes := make([]metav1.StatusCause, 0, len(tc.Errs)) + for i := 0; i < len(tc.Errs); i++ { + causes = append(causes, metav1.StatusCause{ + Type: metav1.CauseType(tc.Errs[i].Type), + Message: tc.Errs[i].ErrorBody(), + Field: tc.Errs[i].Field, + }) + } + g.Expect(statusErr.ErrStatus.Details.Causes).To(Equal(causes)) + } else { + g.Expect(err).NotTo(HaveOccurred()) + } +} + +type elfMachineTestCase struct { + Name string + EM *infrav1.ElfMachine + OldEM *infrav1.ElfMachine + Objs []client.Object + Errs field.ErrorList +} diff --git a/webhooks/elfmachinetemplate_webhook_validation.go b/webhooks/elfmachinetemplate_webhook_validation.go new file mode 100644 index 00000000..2597b2ac --- /dev/null +++ b/webhooks/elfmachinetemplate_webhook_validation.go @@ -0,0 +1,90 @@ +/* +Copyright 2024. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package webhooks
+
+import (
+	goctx "context"
+	"fmt"
+
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/validation/field"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/webhook"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+
+	infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1"
+)
+
+// Error messages.
+const (
+	diskCapacityCannotLessThanZeroMsg = "the disk capacity can only be greater than or equal to 0"
+	diskCapacityCanOnlyBeExpanded     = "the disk capacity can only be expanded"
+)
+
+func (v *ElfMachineTemplateValidator) SetupWebhookWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewWebhookManagedBy(mgr).
+		For(&infrav1.ElfMachineTemplate{}).
+		WithValidator(v).
+		Complete()
+}
+
+//+kubebuilder:webhook:path=/validate-infrastructure-cluster-x-k8s-io-v1beta1-elfmachinetemplate,mutating=false,failurePolicy=fail,sideEffects=None,groups=infrastructure.cluster.x-k8s.io,resources=elfmachinetemplates,verbs=create;update,versions=v1beta1,name=validation.elfmachinetemplate.infrastructure.x-k8s.io,admissionReviewVersions=v1
+
+// ElfMachineTemplateValidator implements a validation webhook for ElfMachineTemplate.
+type ElfMachineTemplateValidator struct{}
+
+var _ webhook.CustomValidator = &ElfMachineTemplateValidator{}
+
+// ValidateCreate implements webhook.Validator so a webhook will be registered for the type.
+func (v *ElfMachineTemplateValidator) ValidateCreate(ctx goctx.Context, obj runtime.Object) (admission.Warnings, error) {
+	elfMachineTemplate, ok := obj.(*infrav1.ElfMachineTemplate)
+	if !ok {
+		return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachineTemplate but got a %T", obj))
+	}
+
+	var allErrs field.ErrorList
+	if elfMachineTemplate.Spec.Template.Spec.DiskGiB < 0 {
+		allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCannotLessThanZeroMsg))
+	}
+
+	return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs)
+}
+
+// ValidateUpdate implements webhook.Validator so a webhook will be registered for the type.
+func (v *ElfMachineTemplateValidator) ValidateUpdate(ctx goctx.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + oldElfMachineTemplate, ok := oldObj.(*infrav1.ElfMachineTemplate) //nolint:forcetypeassert + if !ok { + return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachineTemplate but got a %T", oldObj)) + } + elfMachineTemplate, ok := newObj.(*infrav1.ElfMachineTemplate) //nolint:forcetypeassert + if !ok { + return nil, apierrors.NewBadRequest(fmt.Sprintf("expected an ElfMachineTemplate but got a %T", newObj)) + } + + var allErrs field.ErrorList + if elfMachineTemplate.Spec.Template.Spec.DiskGiB < oldElfMachineTemplate.Spec.Template.Spec.DiskGiB { + allErrs = append(allErrs, field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), elfMachineTemplate.Spec.Template.Spec.DiskGiB, diskCapacityCanOnlyBeExpanded)) + } + + return nil, aggregateObjErrors(elfMachineTemplate.GroupVersionKind().GroupKind(), elfMachineTemplate.Name, allErrs) +} + +// ValidateDelete implements webhook.Validator so a webhook will be registered for the type. +func (v *ElfMachineTemplateValidator) ValidateDelete(ctx goctx.Context, obj runtime.Object) (admission.Warnings, error) { + return nil, nil +} diff --git a/webhooks/elfmachinetemplate_webhook_validation_test.go b/webhooks/elfmachinetemplate_webhook_validation_test.go new file mode 100644 index 00000000..1ae4eb99 --- /dev/null +++ b/webhooks/elfmachinetemplate_webhook_validation_test.go @@ -0,0 +1,143 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package webhooks + +import ( + goctx "context" + "testing" + + . 
"github.com/onsi/gomega" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/validation/field" + + infrav1 "github.com/smartxworks/cluster-api-provider-elf/api/v1beta1" +) + +func TestElfMachineTemplateValidatorValidateCreate(t *testing.T) { + g := NewWithT(t) + + var tests []testCaseEMT + tests = append(tests, testCaseEMT{ + Name: "disk capacity cannot be less than 0", + EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: -1, + }, + }, + }}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), -1, diskCapacityCannotLessThanZeroMsg), + }, + }, testCaseEMT{ + Name: "disk capacity can be 0", + EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 0, + }, + }, + }}, + Errs: nil, + }, testCaseEMT{ + Name: "disk capacity can > 0", + EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 100, + }, + }, + }}, + Errs: nil, + }) + + validator := &ElfMachineTemplateValidator{} + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + warnings, err := validator.ValidateCreate(goctx.Background(), tc.EMT) + g.Expect(warnings).To(BeEmpty()) + expectTestCase(g, tc, err) + }) + } +} + +func TestElfMachineTemplateValidatorValidateUpdate(t *testing.T) { + g := NewWithT(t) + + var tests []testCaseEMT + tests = append(tests, testCaseEMT{ + Name: "Cannot reduce disk capacity", + OldEMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 2, + }, + }, + }}, + EMT: &infrav1.ElfMachineTemplate{Spec: infrav1.ElfMachineTemplateSpec{ + Template: infrav1.ElfMachineTemplateResource{ + Spec: infrav1.ElfMachineSpec{ + DiskGiB: 1, + }, + }, + }}, + Errs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "template", "spec", "diskGiB"), 1, diskCapacityCanOnlyBeExpanded), + }, + }) + + validator := &ElfMachineTemplateValidator{} + + for _, tc := range tests { + t.Run(tc.Name, func(t *testing.T) { + warnings, err := validator.ValidateUpdate(goctx.Background(), tc.OldEMT, tc.EMT) + g.Expect(warnings).To(BeEmpty()) + expectTestCase(g, tc, err) + }) + } +} + +func expectTestCase(g Gomega, tc testCaseEMT, err error) { + if tc.Errs != nil { + g.Expect(err).To(HaveOccurred()) + statusErr, ok := err.(*apierrors.StatusError) + g.Expect(ok).To(BeTrue()) + g.Expect(statusErr.ErrStatus.Details.Group).To(Equal(tc.EMT.GroupVersionKind().Group)) + g.Expect(statusErr.ErrStatus.Details.Kind).To(Equal(tc.EMT.GroupVersionKind().Kind)) + g.Expect(statusErr.ErrStatus.Details.Name).To(Equal(tc.EMT.Name)) + causes := make([]metav1.StatusCause, 0, len(tc.Errs)) + for i := 0; i < len(tc.Errs); i++ { + causes = append(causes, metav1.StatusCause{ + Type: metav1.CauseType(tc.Errs[i].Type), + Message: tc.Errs[i].ErrorBody(), + Field: tc.Errs[i].Field, + }) + } + g.Expect(statusErr.ErrStatus.Details.Causes).To(Equal(causes)) + } else { + g.Expect(err).NotTo(HaveOccurred()) + } +} + +type testCaseEMT struct { + Name string + EMT *infrav1.ElfMachineTemplate + OldEMT *infrav1.ElfMachineTemplate + Errs field.ErrorList +} diff --git a/webhooks/util.go b/webhooks/util.go new file mode 100644 index 
00000000..cc485089 --- /dev/null +++ b/webhooks/util.go @@ -0,0 +1,35 @@ +/* +Copyright 2021 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package webhooks + +import ( + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation/field" +) + +func aggregateObjErrors(gk schema.GroupKind, name string, allErrs field.ErrorList) error { + if len(allErrs) == 0 { + return nil + } + + return apierrors.NewInvalid( + gk, + name, + allErrs, + ) +}
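
Editor's note, not part of the patch: the short Go sketch below illustrates how the validation webhooks above surface a rejected diskGiB change as a single StatusError. aggregateObjErrors (webhooks/util.go) simply wraps the field.ErrorList in apierrors.NewInvalid; that wrapping is reproduced inline here because this standalone sketch lives outside the webhooks package. The GroupKind literal, the object name, and the main package are assumptions made for illustration only.

package main

import (
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/validation/field"
)

func main() {
	// One field error, as produced when spec.diskGiB is reduced on an ElfMachine
	// that is not managed through an ElfMachineTemplate.
	allErrs := field.ErrorList{
		field.Invalid(field.NewPath("spec", "diskGiB"), 1, "the disk capacity can only be expanded"),
	}

	// Equivalent to aggregateObjErrors(gk, name, allErrs): aggregate the list
	// into a single Invalid admission error.
	gk := schema.GroupKind{Group: "infrastructure.cluster.x-k8s.io", Kind: "ElfMachine"}
	err := apierrors.NewInvalid(gk, "example-elfmachine", allErrs)

	// Prints roughly:
	// ElfMachine.infrastructure.cluster.x-k8s.io "example-elfmachine" is invalid:
	// spec.diskGiB: Invalid value: 1: the disk capacity can only be expanded
	fmt.Println(err)
}

This is the same shape of error the webhook tests assert against via statusErr.ErrStatus.Details.Causes.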