Review notes (free text ahead of the first "diff --git" header; patch tools skip leading text):
  * One diff record per line has been restored. Leading whitespace inside the YAML lines
    was rebuilt from the YAML structure; verify context and "-" lines against the target
    tree before applying (TODO confirm).
  * Added ("+") lines were adjusted without changing any hunk line count:
      - default('true') -> default(true), matching the boolean default(false) next to it
      - "{{ habana_gaudi_image_tag}}" -> "{{ habana_gaudi_image_tag }}"
      - task name "Setup Habana Gaudi Cluster policy setup" -> "Set up Habana Gaudi cluster policy"
      - both templates now end with a newline (the "\ No newline at end of file" markers are gone)
  * The "index" blob ids are carried over from the original patch and do not describe the
    adjusted post-images.

diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml
index eded3852dea..c1e658df40e 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml
+++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/defaults/main.yml
@@ -21,12 +21,13 @@ kmm_ignition_version: "3.2.0"
 nfd_operator_namespace: "openshift-nfd"
 nfd_operator_channel: "stable"
 nfd_operator_automatic_install_plan_approval: true
-nfd_operator_starting_csv: "nfd.4.12.0-202307182142"
+nfd_operator_starting_csv: ""
 nfd_operator_wait_for_deploy: true
 nfd_operator_use_catalog_snapshot: false
 nfd_operator_catalogsource_name: ""
 nfd_operator_catalog_snapshot_image: ""
 nfd_operator_catalog_snapshot_image_tag: ""
+nfd_operator_operand_image: "registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.17"
 
 # ------------------------------------------------
 # Habana Gaudi Operator
@@ -40,6 +41,7 @@ habana_gaudi_operator_use_catalog_snapshot: false
 habana_gaudi_operator_catalogsource_name: ""
 habana_gaudi_operator_catalog_snapshot_image: ""
 habana_gaudi_operator_catalog_snapshot_image_tag: ""
+habana_gaudi_image_tag: "1.19.1-26"
 habana_gaudi_image_version: "1.10.0"
 habana_gaudi_deviceplugin_version: "1.10.0"
 habana_gaudi_driver_version: "1.10.0-494"
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml
index 9e5a73a3b15..a30ba2dfab6 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml
+++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/habana_gaudi_operator.yml
@@ -10,6 +10,10 @@
   retries: 10
   delay: 5
 
+- name: 60 second pause for Habana Gaudi GPU namespace check
+  pause:
+    seconds: 60
+
 - name: Create Habana Gaudi GPU operatorgroup
   kubernetes.core.k8s:
     state: present
@@ -19,6 +23,10 @@
   retries: 25
   delay: 5
 
+- name: 60 second pause for Habana Gaudi GPU operator setup
+  pause:
+    seconds: 60
+
 - name: Create Habana Gaudi GPU subscription
   kubernetes.core.k8s:
     state: present
@@ -28,15 +36,15 @@
   retries: 50
   delay: 5
 
-- name: 120 second pause for Habana Gaudi GPU operator setup
+- name: 120 second pause for Habana Gaudi GPU subscription
   pause:
     seconds: 120
 
-- name: Setup Habana Gaudi Device Config
+- name: Set up Habana Gaudi cluster policy
   kubernetes.core.k8s:
     state: present
-    definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_deviceconfig.yaml.j2') | from_yaml }}"
-  register: devconfig_result
-  until: devconfig_result is successful
-  retries: 30
+    definition: "{{ lookup('template', 'habana-gaudi/habana_gpu_clusterpolicy.yaml.j2') | from_yaml }}"
+  register: devconfig_clusterpolicy_result
+  until: devconfig_clusterpolicy_result is successful
+  retries: 300
   delay: 5
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml
index 5d28b01bbad..9ff05436d9a 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml
+++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/kmm_operator.yml
@@ -16,6 +16,10 @@
   retries: 25
   delay: 5
 
+- name: 60 second pause for KMM operatorgroup
+  pause:
+    seconds: 60
+
 - name: Create KMM subscription
   kubernetes.core.k8s:
     state: present
@@ -25,7 +29,7 @@
   retries: 25
   delay: 5
 
-- name: 60 second pause for Habana Gaudi GPU operator setup
+- name: 60 second pause for KMM subscription
   pause:
     seconds: 60
 
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml
index 52765a1501a..e62721580ec 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml
+++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/tasks/nfd_operator.yml
@@ -1,29 +1,22 @@
 ---
-- name: "Ensure nfd namespace exists"
-  kubernetes.core.k8s:
-    state: present
-    api_version: v1
-    kind: Namespace
-    name: "{{ nfd_operator_namespace }}"
-  delay: 5
-
-- name: Create NodeFeatureDiscovery operatorgroup
-  kubernetes.core.k8s:
-    state: present
-    definition: "{{ lookup('template', 'nfd/nodefeature_discovery_operatorgroup.yaml.j2') | from_yaml }}"
-  register: operatorgroup_result
-  until: operatorgroup_result is not failed
-  retries: 25
-  delay: 5
-
-- name: Create NodeFeaturEDiscovery subscription
-  kubernetes.core.k8s:
-    state: present
-    definition: "{{ lookup('template', 'nfd/nodefeature_discovery_sub.yaml.j2') | from_yaml }}"
-  register: subscription_result
-  until: subscription_result is not failed
-  retries: 25
-  delay: 5
+- name: Install NFD operator
+  ansible.builtin.include_role:
+    name: install_operator
+  vars:
+    install_operator_action: install
+    install_operator_name: nfd
+    install_operator_namespace: "{{ nfd_operator_namespace }}"
+    install_operator_channel: "{{ nfd_operator_channel }}"
+    install_operator_catalog: redhat-operators
+    install_operator_automatic_install_plan_approval: "{{ nfd_operator_automatic_install_plan_approval | default(true) }}"
+    install_operator_starting_csv: "{{ nfd_operator_starting_csv | default('') }}"
+    install_operator_catalogsource_setup: "{{ nfd_operator_use_catalog_snapshot | default(false) }}"
+    install_operator_catalogsource_name: "{{ nfd_operator_catalogsource_name }}"
+    install_operator_catalogsource_namespace: "{{ nfd_operator_namespace }}"
+    install_operator_catalogsource_image: "{{ nfd_operator_catalog_snapshot_image | default('') }}"
+    install_operator_catalogsource_image_tag: "{{ nfd_operator_catalog_snapshot_image_tag | default('') }}"
+    install_operator_manage_namespaces:
+      - "{{ nfd_operator_namespace }}"
 
 - name: Create NodeFeatureDiscovery Custom Resource
   kubernetes.core.k8s:
@@ -31,5 +24,5 @@
     definition: "{{ lookup('template', 'nfd/nodefeature_discovery_cr.yaml.j2') | from_yaml }}"
   register: result
   until: result is not failed
-  retries: 25
-  delay: 5
+  retries: 30
+  delay: 20
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2
new file mode 100644
index 00000000000..1cb11bd1e36
--- /dev/null
+++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/habana-gaudi/habana_gpu_clusterpolicy.yaml.j2
@@ -0,0 +1,109 @@
+kind: ClusterPolicy
+apiVersion: habanalabs.habana.ai/v1
+metadata:
+  name: habana-ai
+spec:
+  bmc_monitoring:
+    image:
+      repository: vault.habana.ai/habana-bmc-exporter/bmc-exporter
+      tag: "{{ habana_gaudi_image_tag }}"
+    resources:
+      limits:
+        cpu: 250m
+        memory: 250Mi
+      requests:
+        cpu: 150m
+        memory: 100Mi
+  device_plugin:
+    image:
+      repository: vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin
+      tag: "{{ habana_gaudi_image_tag }}"
+    resources:
+      limits:
+        cpu: 20m
+        memory: 64Mi
+      requests:
+        cpu: 10m
+        memory: 32Mi
+  driver:
+    driver_loader:
+      images:
+        rhel_8.6:
+          repository: vault.habana.ai/habana-ai-operator/driver/rhel8.6/driver-installer
+          tag: "{{ habana_gaudi_image_tag }}"
+        rhel_9.2:
+          repository: vault.habana.ai/habana-ai-operator/driver/rhel9.2/driver-installer
+          tag: "{{ habana_gaudi_image_tag }}"
+        rhel_9.4:
+          repository: vault.habana.ai/habana-ai-operator/driver/rhel9.4/driver-installer
+          tag: "{{ habana_gaudi_image_tag }}"
+        tencentos_3.1:
+          repository: vault.habana.ai/habana-ai-operator/driver/tencentos3.1/driver-installer
+          tag: "{{ habana_gaudi_image_tag }}"
+        ubuntu_22.04:
+          repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
+          tag: "{{ habana_gaudi_image_tag }}"
+      mlnx_ofed_repo_path: artifactory/gaudi-installer/deps
+      mlnx_ofed_version: mlnx-ofed-5.8-2.0.3.0-rhel8.4-x86_64.tar.gz
+      repo_path: artifactory/gaudi-installer/repos
+      repo_server: vault.habana.ai
+      resources:
+        limits:
+          cpu: 4000m
+          memory: 16Gi
+        requests:
+          cpu: 2000m
+          memory: 8Gi
+    driver_runner:
+      image:
+        repository: vault.habana.ai/habana-ai-operator/driver/ubuntu22.04/driver-installer
+        tag: "{{ habana_gaudi_image_tag }}"
+      resources:
+        limits:
+          cpu: 20m
+          memory: 64Mi
+        requests:
+          cpu: 10m
+          memory: 32Mi
+  feature_discovery:
+    nfd_plugin: false
+    runner:
+      image:
+        repository: vault.habana.ai/habana-ai-operator/habanalabs-feature-discovery
+        tag: "{{ habana_gaudi_image_tag }}"
+      resources:
+        limits:
+          cpu: 20m
+          memory: 64Mi
+        requests:
+          cpu: 10m
+          memory: 32Mi
+  image_registry: vault.habana.ai
+  metric_exporter:
+    interval: 20
+    port: 41611
+    runner:
+      image:
+        repository: vault.habana.ai/gaudi-metric-exporter/metric-exporter
+        tag: "{{ habana_gaudi_image_tag }}"
+      resources:
+        limits:
+          cpu: 150m
+          memory: 120Mi
+        requests:
+          cpu: 100m
+          memory: 100Mi
+  runtime:
+    configuration:
+      container_engine: crio
+    runner:
+      image:
+        repository: vault.habana.ai/habana-ai-operator/habana-container-runtime
+        tag: "{{ habana_gaudi_image_tag }}"
+      resources:
+        limits:
+          cpu: 20m
+          memory: 64Mi
+        requests:
+          cpu: 10m
+          memory: 32Mi
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2
index 2f356cc045b..0e0c581d602 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2
+++ b/ansible/roles_ocp_workloads/ocp4_workload_habana_gaudi_gpu_setup/templates/nfd/nodefeature_discovery_cr.yaml.j2
@@ -1,37 +1,124 @@
----
-apiVersion: nfd.openshift.io/v1
 kind: NodeFeatureDiscovery
+apiVersion: nfd.openshift.io/v1
 metadata:
   name: nfd-instance
-  namespace: {{ nfd_operator_namespace }}
+  namespace: "{{ nfd_operator_namespace }}"
 spec:
-  customConfig:
-    configData: |
-      #    - name: "more.kernel.features"
-      #      matchOn:
-      #      - loadedKMod: ["example_kmod3"]
-      #    - name: "more.features.by.nodename"
-      #      value: customValue
-      #      matchOn:
-      #      - nodename: ["special-.*-node-.*"]
+  enableTaints: false
   extraLabelNs:
     - habana.ai
   instance: ''
   operand:
-    image: >-
-      registry.redhat.io/openshift4/ose-node-feature-discovery:v4.12
+    image: '{{ nfd_operator_operand_image }}'
+    imagePullPolicy: IfNotPresent
     servicePort: 12000
-  topologyupdater: false
+  prunerOnDelete: false
+  topologyUpdater: false
   workerConfig:
     configData: |
       core:
+      #  labelWhiteList:
+      #  noPublish: false
         sleepInterval: 60s
+      #  sources: [all]
+      #  klog:
+      #    addDirHeader: false
+      #    alsologtostderr: false
+      #    logBacktraceAt:
+      #    logtostderr: true
+      #    skipHeaders: false
+      #    stderrthreshold: 2
+      #    v: 0
+      #    vmodule:
+      ##   NOTE: the following options are not dynamically run-time
+      ##          configurable and require a nfd-worker restart to take effect
+      ##          after being changed
+      #    logDir:
+      #    logFile:
+      #    logFileMaxSize: 1800
+      #    skipLogHeaders: false
       sources:
+      #  cpu:
+      #    cpuid:
+      ##     NOTE: whitelist has priority over blacklist
+      #      attributeBlacklist:
+      #        - "BMI1"
+      #        - "BMI2"
+      #        - "CLMUL"
+      #        - "CMOV"
+      #        - "CX16"
+      #        - "ERMS"
+      #        - "F16C"
+      #        - "HTT"
+      #        - "LZCNT"
+      #        - "MMX"
+      #        - "MMXEXT"
+      #        - "NX"
+      #        - "POPCNT"
+      #        - "RDRAND"
+      #        - "RDSEED"
+      #        - "RDTSCP"
+      #        - "SGX"
+      #        - "SSE"
+      #        - "SSE2"
+      #        - "SSE3"
+      #        - "SSE4.1"
+      #        - "SSE4.2"
+      #        - "SSSE3"
+      #      attributeWhitelist:
+      #  kernel:
+      #    kconfigFile: "/path/to/kconfig"
+      #    configOpts:
+      #      - "NO_HZ"
+      #      - "X86"
+      #      - "DMI"
         pci:
           deviceClassWhitelist:
             - "0200"
             - "03"
             - "12"
           deviceLabelFields:
+      #      - "class"
             - "vendor"
-
+      #      - "device"
+      #      - "subsystem_vendor"
+      #      - "subsystem_device"
+      #  usb:
+      #    deviceClassWhitelist:
+      #      - "0e"
+      #      - "ef"
+      #      - "fe"
+      #      - "ff"
+      #    deviceLabelFields:
+      #      - "class"
+      #      - "vendor"
+      #      - "device"
+      #  custom:
+      #    - name: "my.kernel.feature"
+      #      matchOn:
+      #        - loadedKMod: ["example_kmod1", "example_kmod2"]
+      #    - name: "my.pci.feature"
+      #      matchOn:
+      #        - pciId:
+      #            class: ["0200"]
+      #            vendor: ["15b3"]
+      #            device: ["1014", "1017"]
+      #        - pciId :
+      #            vendor: ["8086"]
+      #            device: ["1000", "1100"]
+      #    - name: "my.usb.feature"
+      #      matchOn:
+      #        - usbId:
+      #          class: ["ff"]
+      #          vendor: ["03e7"]
+      #          device: ["2485"]
+      #        - usbId:
+      #          class: ["fe"]
+      #          vendor: ["1a6e"]
+      #          device: ["089a"]
+      #    - name: "my.combined.feature"
+      #      matchOn:
+      #        - pciId:
+      #            vendor: ["15b3"]
+      #            device: ["1014", "1017"]
+      #          loadedKMod : ["vendor_kmod1", "vendor_kmod2"]