diff --git a/kolla/defaults.yml b/kolla/defaults.yml index 156de1aa..0c030a78 100644 --- a/kolla/defaults.yml +++ b/kolla/defaults.yml @@ -345,6 +345,13 @@ enable_k3s: no enable_zun: no enable_zun_compute_k8s: "{{ enable_zun | bool and enable_k3s | bool }}" blazar_enable_device_plugin_k8s: "{{ enable_k3s | bool }}" +# K8S worker tainting +zun_tolerate_worker_taint: false +doni_enable_worker_taint: false +worker_taint: + key: "{{ k8s_worker_taint.key | default('worker-node') }}" + value: "{{ k8s_worker_taint.value | default('true') }}" + effect: "{{ k8s_worker_taint.effect | default('NoSchedule') }}" # If not running K3s, can run standard Zun deploy w/ etcd and kuryr enable_etcd: "{{ enable_zun | bool and not enable_k3s | bool }}" enable_kuryr: "{{ enable_zun | bool and not enable_k3s | bool }}" diff --git a/kolla/node_custom_config/doni.conf b/kolla/node_custom_config/doni.conf index 9a656610..d02e400f 100644 --- a/kolla/node_custom_config/doni.conf +++ b/kolla/node_custom_config/doni.conf @@ -37,6 +37,10 @@ device_fleet_mapping = raspberrypi3-64:chi-edge-workers,raspberrypi4-64:chi-edge {% if enable_k3s | bool %} [k8s] +enable_worker_taint = {{ doni_enable_worker_taint }} +worker_taint_key = {{ worker_taint.key }} +worker_taint_value = {{ worker_taint.value }} +worker_taint_effect = {{ worker_taint.effect }} expected_labels_index_property = machine_name expected_labels = raspberrypi4-64:smarter-device-manager=enabled|smarter-device-manager/configmap=rpi, raspberrypi3-64:smarter-device-manager=enabled|smarter-device-manager/configmap=rpi, diff --git a/kolla/node_custom_config/zun.conf b/kolla/node_custom_config/zun.conf index e119ecd6..83e4c322 100644 --- a/kolla/node_custom_config/zun.conf +++ b/kolla/node_custom_config/zun.conf @@ -37,6 +37,10 @@ allow_without_reservation = False nvidia_require_jetpack = csv-mounts=all nvidia_visible_devices = all nvidia_driver_capabilities = all +enable_worker_taint = {{ zun_tolerate_worker_taint }} +worker_taint_key = {{ worker_taint.key }} +worker_taint_value = {{ worker_taint.value }} +worker_taint_effect = {{ worker_taint.effect }} {% if enable_neutron | bool %} neutron_network = caliconet {% endif %} diff --git a/roles/k3s/defaults/main.yml b/roles/k3s/defaults/main.yml index 8271af80..9ab4e955 100644 --- a/roles/k3s/defaults/main.yml +++ b/roles/k3s/defaults/main.yml @@ -5,6 +5,7 @@ k3s_server_location: /var/lib/rancher/k3s k3s_conf_location: /etc/rancher/k3s k3s_systemd_dir: /etc/systemd/system k3s_server_ip: "{{ kolla_external_vip_address }}" +k3s_worker_taint: "{{ worker_taint }}" k3s_extra_server_args: "" k3s_dry_run: no diff --git a/roles/k3s/tasks/config-device-plugins.yml b/roles/k3s/tasks/config-device-plugins.yml index 967ab234..2a9f7c12 100644 --- a/roles/k3s/tasks/config-device-plugins.yml +++ b/roles/k3s/tasks/config-device-plugins.yml @@ -6,7 +6,7 @@ delegate_to: "{{ groups['deployment'][0] }}" kubernetes.core.k8s: state: present - src: nvidia-device-plugin.yaml + template: "nvidia-device-plugin.yaml.j2" apply: yes when: - not (k3s_dry_run | bool) diff --git a/roles/k3s/templates/nvidia-device-plugin.yaml.j2 b/roles/k3s/templates/nvidia-device-plugin.yaml.j2 new file mode 100644 index 00000000..0b2075a8 --- /dev/null +++ b/roles/k3s/templates/nvidia-device-plugin.yaml.j2 @@ -0,0 +1,74 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + updateStrategy: + type: RollingUpdate + template: + metadata: + # This annotation is deprecated. Kept here for backward compatibility + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: nvidia-device-plugin-ds + spec: + nodeSelector: + "nvidia-device-plugin/enabled": "true" + tolerations: + # This toleration is deprecated. Kept here for backward compatibility + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + - key: CriticalAddonsOnly + operator: Exists + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + - key: "{{ k3s_worker_taint.key }}" + operator: Equal + value: "{{ k3s_worker_taint.value }}" + effect: "{{ k3s_worker_taint.effect }}" + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. + # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + containers: + - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.1 + name: nvidia-device-plugin-ctr + env: + - name: FAIL_ON_INIT_ERROR + value: "false" + - name: NVIDIA_VISIBLE_DEVICES + value: all + - name: NVIDIA_DRIVER_CAPABILITIES + value: all + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2 b/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2 index 878fb883..9f6f4ae6 100644 --- a/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2 +++ b/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2 @@ -69,6 +69,11 @@ spec: nodeSelector: smarter-device-manager: enabled smarter-device-manager/configmap: jetson + tolerations: + - key: "{{ k3s_worker_taint.key }}" + operator: Equal + value: "{{ k3s_worker_taint.value }}" + effect: "{{ k3s_worker_taint.effect }}" priorityClassName: "system-node-critical" hostname: smarter-device-management hostNetwork: true diff --git a/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2 b/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2 index f90551ef..13f765f9 100644 --- a/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2 +++ b/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2 @@ -67,6 +67,11 @@ spec: nodeSelector: smarter-device-manager: enabled smarter-device-manager/configmap: rpi + tolerations: + - key: "{{ k3s_worker_taint.key }}" + operator: Equal + value: "{{ k3s_worker_taint.value }}" + effect: "{{ k3s_worker_taint.effect }}" priorityClassName: "system-node-critical" hostname: smarter-device-management hostNetwork: true