From e4485c0e9830daada369587bf8d5d931d2a76cbc Mon Sep 17 00:00:00 2001 From: Suvro Ghosh Date: Mon, 2 Mar 2026 11:28:32 -0500 Subject: [PATCH] feat: add vanilla Kubernetes support The checkup currently assumes OpenShift (cluster-reader ClusterRole, ClusterVersion API, CNV-managed golden images). This breaks on vanilla Kubernetes with five distinct blockers. Changes: - Make checkVersions() tolerate missing OpenShift ClusterVersion API instead of treating it as fatal - Add storage_checkup_cluster_role.yaml manifest defining the read-only ClusterRole that OpenShift provides as cluster-reader - Add golden_image_dataimportcron.yaml manifest for clusters without CNV-managed DataImportCrons - Add docs/vanilla-k8s-guide.md covering RBAC, KubeVirt feature gates, masquerade networking, golden images, and timeout tuning - Add vanilla K8s section to README linking to the guide Signed-off-by: Suvro Ghosh --- README.md | 4 + docs/vanilla-k8s-guide.md | 179 ++++++++++++++++++++ manifests/golden_image_dataimportcron.yaml | 28 +++ manifests/storage_checkup_cluster_role.yaml | 24 +++ pkg/internal/checkup/checkup.go | 16 +- 5 files changed, 243 insertions(+), 8 deletions(-) create mode 100644 docs/vanilla-k8s-guide.md create mode 100644 manifests/golden_image_dataimportcron.yaml create mode 100644 manifests/storage_checkup_cluster_role.yaml diff --git a/README.md b/README.md index 9ce8119f..70a48eb3 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,10 @@ Cluster admin should create the following cluster-reader permissions for dedicat namespace: ``` +## Non-OpenShift / Vanilla Kubernetes + +If you are running this checkup on vanilla Kubernetes (kubeadm, kOps, etc.) rather than OpenShift, additional setup is required for RBAC, KubeVirt feature gates, networking, and golden images. See [docs/vanilla-k8s-guide.md](docs/vanilla-k8s-guide.md) for the full guide. 
+ ## Configuration |Key|Description|Is Mandatory|Remarks| diff --git a/docs/vanilla-k8s-guide.md b/docs/vanilla-k8s-guide.md new file mode 100644 index 00000000..a7bd8f53 --- /dev/null +++ b/docs/vanilla-k8s-guide.md @@ -0,0 +1,179 @@ +# Running kubevirt-storage-checkup on Non-OpenShift Kubernetes + +This guide covers the additional setup required to run the storage checkup +on vanilla Kubernetes clusters (i.e., clusters without OpenShift / CNV). + +On OpenShift, the CNV operator handles RBAC, feature gates, networking +defaults, and golden image provisioning automatically. On vanilla Kubernetes +these must be configured manually. + +## Prerequisites + +- Kubernetes cluster with [KubeVirt](https://kubevirt.io/user-guide/cluster_admin/installation/) installed +- [CDI](https://github.com/kubevirt/containerized-data-importer) (Containerized Data Importer) installed +- A CSI-based StorageClass with a default set (e.g., Ceph, Longhorn, etc.) +- [VolumeSnapshot CRDs and controller](https://github.com/kubernetes-csi/external-snapshotter) installed (for snapshot-based clone tests) + +## 1. KubeVirt Configuration + +The checkup tests volume hotplug and live migration, which require specific +KubeVirt settings that are not enabled by default. 
+ +### Feature Gates + +Enable the `HotplugVolumes` feature gate: + +```bash +kubectl patch kubevirt kubevirt -n kubevirt --type=merge \ + -p '{"spec":{"configuration":{"developerConfiguration":{"featureGates":["HotplugVolumes"]}}}}' +``` + +Without this, the hotplug volume check will fail with: +`Enable DeclarativeHotplugVolumes or HotplugVolumes feature gate to use this API.` + +### Default Network Interface + +Set masquerade as the default network interface to allow live migration: + +```bash +kubectl patch kubevirt kubevirt -n kubevirt --type=merge \ + -p '{"spec":{"configuration":{"network":{"defaultNetworkInterface":"masquerade"}}}}' +``` + +Without this, KubeVirt creates VMs with bridge networking by default, which +is not live-migratable. The live migration check will be skipped with: +`cannot migrate VMI which does not use masquerade [...]` + +Both settings can be applied in a single patch: + +```bash +kubectl patch kubevirt kubevirt -n kubevirt --type=merge -p '{ + "spec": { + "configuration": { + "developerConfiguration": { + "featureGates": ["HotplugVolumes"] + }, + "network": { + "defaultNetworkInterface": "masquerade" + } + } + } +}' +``` + +## 2. RBAC + +The upstream instructions reference `cluster-reader`, an OpenShift-only +ClusterRole. On vanilla Kubernetes, use the provided ClusterRole manifest +instead. + +Apply the namespace-scoped RBAC: + +```bash +kubectl apply -n <target-namespace> -f manifests/storage_checkup_permissions.yaml +``` + +Apply the ClusterRole and bind it to the checkup ServiceAccount: + +```bash +kubectl apply -f manifests/storage_checkup_cluster_role.yaml + +kubectl create clusterrolebinding kubevirt-storage-checkup-clustereader \ + --clusterrole=kubevirt-storage-checkup-reader \ + --serviceaccount=<target-namespace>:storage-checkup-sa +``` + +## 3. Golden Images + +On OpenShift, the CNV operator automatically creates DataImportCrons that +import OS images (Fedora, CentOS, RHEL, etc.) for use as VM boot disks. On +vanilla Kubernetes, you must create one manually. 
+ +Apply the provided DataImportCron manifest: + +```bash +kubectl apply -n <target-namespace> -f manifests/golden_image_dataimportcron.yaml +``` + +Wait for the initial image import to complete (typically 2-5 minutes): + +```bash +kubectl wait -n <target-namespace> dataimportcron/fedora-golden \ + --for=condition=UpToDate --timeout=600s +``` + +Without a golden image, the following checks are silently skipped: +- VM boot from golden image +- VM live migration +- VM volume hotplug +- Concurrent VM boot + +The golden image persists across checkup runs. You only need to create it +once per namespace. + +## 4. Running the Checkup + +Once the prerequisites above are in place, create the ConfigMap and Job: + +```bash +export CHECKUP_NAMESPACE=<target-namespace> + +envsubst < manifests/storage_checkup.yaml | kubectl apply -f - +``` + +### Tuning for Smaller Clusters + +The default configuration boots 10 VMs concurrently with a 3-minute per-VM +timeout. On smaller clusters (2-3 nodes), this may not be enough. Adjust the +ConfigMap before deploying: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: storage-checkup-config +data: + spec.timeout: 20m + spec.param.vmiTimeout: 8m + spec.param.numOfVMs: "3" +``` + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `spec.timeout` | 10m | Overall checkup job timeout | +| `spec.param.vmiTimeout` | 3m | Timeout per individual VM operation (boot, migration, hotplug) | +| `spec.param.numOfVMs` | 10 | Number of VMs for the concurrent boot test | +| `spec.param.storageClass` | (default SC) | Override the StorageClass used for tests | + +## 5. 
Checking Results + +```bash +kubectl get configmap storage-checkup-config -n <target-namespace> -o yaml +``` + +Key result fields: + +| Field | Description | +|-------|-------------| +| `status.succeeded` | `true` if all checks passed | +| `status.failureReason` | Error details if any check failed | +| `status.result.ocpVersion` | Empty on non-OpenShift clusters (expected) | +| `status.result.cnvVersion` | May be empty if CDI lacks the `app.kubernetes.io/version` label | +| `status.result.pvcBound` | PVC creation and binding test result | +| `status.result.vmBootFromGoldenImage` | VM boot from cloned golden image | +| `status.result.vmLiveMigration` | VM live migration result | +| `status.result.vmHotplugVolume` | Volume hotplug attach/detach result | +| `status.result.concurrentVMBoot` | Concurrent VM boot result | + +## 6. Cleanup + +```bash +export CHECKUP_NAMESPACE=<target-namespace> + +envsubst < manifests/storage_checkup.yaml | kubectl delete -f - +kubectl delete -n $CHECKUP_NAMESPACE -f manifests/golden_image_dataimportcron.yaml +kubectl delete clusterrolebinding kubevirt-storage-checkup-clustereader +kubectl delete -f manifests/storage_checkup_cluster_role.yaml +kubectl delete -n $CHECKUP_NAMESPACE -f manifests/storage_checkup_permissions.yaml +``` + diff --git a/manifests/golden_image_dataimportcron.yaml b/manifests/golden_image_dataimportcron.yaml new file mode 100644 index 00000000..c82bdb5f --- /dev/null +++ b/manifests/golden_image_dataimportcron.yaml @@ -0,0 +1,28 @@ +--- +# Golden image DataImportCron for non-OpenShift clusters. 
+# +# Usage: +# kubectl apply -n <target-namespace> -f manifests/golden_image_dataimportcron.yaml +# # Wait for the initial import to complete (~2-5 minutes): +# kubectl wait -n <target-namespace> dataimportcron/fedora-golden \ +# --for=condition=UpToDate --timeout=600s +# +apiVersion: cdi.kubevirt.io/v1beta1 +kind: DataImportCron +metadata: + name: fedora-golden +spec: + managedDataSource: fedora-golden + schedule: "0 */12 * * *" + garbageCollect: Outdated + importsToKeep: 1 + template: + spec: + source: + registry: + url: "docker://quay.io/containerdisks/fedora:latest" + pullMethod: node + storage: + resources: + requests: + storage: 10Gi diff --git a/manifests/storage_checkup_cluster_role.yaml b/manifests/storage_checkup_cluster_role.yaml new file mode 100644 index 00000000..bf139a41 --- /dev/null +++ b/manifests/storage_checkup_cluster_role.yaml @@ -0,0 +1,24 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kubevirt-storage-checkup-reader +rules: + - apiGroups: [ "" ] + resources: [ "pods", "namespaces", "nodes", "persistentvolumeclaims", "persistentvolumes" ] + verbs: [ "get", "list" ] + - apiGroups: [ "storage.k8s.io" ] + resources: [ "storageclasses", "csidrivers" ] + verbs: [ "get", "list" ] + - apiGroups: [ "snapshot.storage.k8s.io" ] + resources: [ "volumesnapshots", "volumesnapshotclasses" ] + verbs: [ "get", "list" ] + - apiGroups: [ "cdi.kubevirt.io" ] + resources: [ "storageprofiles", "dataimportcrons", "datasources", "cdis" ] + verbs: [ "get", "list" ] + - apiGroups: [ "kubevirt.io" ] + resources: [ "virtualmachineinstances" ] + verbs: [ "get", "list" ] + - apiGroups: [ "config.openshift.io" ] + resources: [ "clusterversions" ] + verbs: [ "get" ] diff --git a/pkg/internal/checkup/checkup.go b/pkg/internal/checkup/checkup.go index f080849d..6f2dbb5f 100644 --- a/pkg/internal/checkup/checkup.go +++ b/pkg/internal/checkup/checkup.go @@ -214,16 +214,16 @@ func (c *Checkup) Run(ctx context.Context) error { func (c *Checkup) checkVersions(ctx 
context.Context) error { log.Print("checkVersions") + ocpVersion := "" ver, err := c.client.GetClusterVersion(ctx, "version") if err != nil { - return err - } - ocpVersion := "" - for _, update := range ver.Status.History { - if update.State == configv1.CompletedUpdate { - // obtain the version from the last completed update - ocpVersion = update.Version - break + log.Printf("OpenShift ClusterVersion not available (non-OpenShift cluster): %v", err) + } else { + for _, update := range ver.Status.History { + if update.State == configv1.CompletedUpdate { + ocpVersion = update.Version + break + } } }