From 93cfd5d9e07cd47fa1c27aa0369119341b119152 Mon Sep 17 00:00:00 2001
From: Soufiane Jounaid
Date: Mon, 24 Jun 2024 18:01:19 -0400
Subject: [PATCH] Added taint tolerations to core deployments in k3s

The k3s role defaults (roles/k3s/defaults/main.yml) take the value of
k3s_worker_taint from worker_taint, which is defined under
kolla/defaults.yml. That file defines the defaults for the option and
takes in site values from the k8s_worker_taint option that can be
specified through the site config.

Added the worker node taint toleration to the smarter device manager
daemonsets. Furthermore, templated the nvidia device plugin daemonset
and added the toleration there as well. The device plugin task now loads
the manifest through the k8s module's `template` option in place of
`src`, since the module treats the two options as mutually exclusive.
---
Review notes:
* config-device-plugins.yml: `src` is replaced by `template` instead of
  being kept next to it; kubernetes.core.k8s rejects a task that sets both.
* The index hashes for config-device-plugins.yml are those of the original
  commit; regenerate the patch with `git format-patch` if they are needed
  (for example for a three-way apply).

 roles/k3s/defaults/main.yml                   |  1 +
 roles/k3s/tasks/config-device-plugins.yml     |  2 +-
 .../templates/nvidia-device-plugin.yaml.j2    | 74 +++++++++++++++++++
 .../smarter-device-manager-ds-jetson.yaml.j2  |  5 ++
 .../smarter-device-manager-ds-rpi.yaml.j2     |  5 ++
 5 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 roles/k3s/templates/nvidia-device-plugin.yaml.j2

diff --git a/roles/k3s/defaults/main.yml b/roles/k3s/defaults/main.yml
index 8271af80..9ab4e955 100644
--- a/roles/k3s/defaults/main.yml
+++ b/roles/k3s/defaults/main.yml
@@ -5,6 +5,7 @@ k3s_server_location: /var/lib/rancher/k3s
 k3s_conf_location: /etc/rancher/k3s
 k3s_systemd_dir: /etc/systemd/system
 k3s_server_ip: "{{ kolla_external_vip_address }}"
+k3s_worker_taint: "{{ worker_taint }}"
 k3s_extra_server_args: ""
 
 k3s_dry_run: no
diff --git a/roles/k3s/tasks/config-device-plugins.yml b/roles/k3s/tasks/config-device-plugins.yml
index 967ab234..285574c4 100644
--- a/roles/k3s/tasks/config-device-plugins.yml
+++ b/roles/k3s/tasks/config-device-plugins.yml
@@ -7,6 +7,6 @@
   kubernetes.core.k8s:
     state: present
-    src: nvidia-device-plugin.yaml
+    template: "nvidia-device-plugin.yaml.j2"
     apply: yes
   when:
     - not (k3s_dry_run | bool)
diff --git a/roles/k3s/templates/nvidia-device-plugin.yaml.j2 b/roles/k3s/templates/nvidia-device-plugin.yaml.j2
new file mode 100644
index 00000000..ce5b6b87
--- /dev/null
+++ b/roles/k3s/templates/nvidia-device-plugin.yaml.j2
@@ -0,0 +1,74 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      # This annotation is deprecated. Kept here for backward compatibility
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      nodeSelector:
+        "nvidia-device-plugin/enabled": "true"
+      tolerations:
+      # This toleration is deprecated. Kept here for backward compatibility
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      - key: CriticalAddonsOnly
+        operator: Exists
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      - key: "{{ k3s_worker_taint.key }}"
+        operator: "Equal"
+        value: "{{ k3s_worker_taint.value }}"
+        effect: "{{ k3s_worker_taint.effect }}"
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      containers:
+      - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.1
+        name: nvidia-device-plugin-ctr
+        env:
+          - name: FAIL_ON_INIT_ERROR
+            value: "false"
+          - name: NVIDIA_VISIBLE_DEVICES
+            value: all
+          - name: NVIDIA_DRIVER_CAPABILITIES
+            value: all
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop: ["ALL"]
+        volumeMounts:
+        - name: device-plugin
+          mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+      - name: device-plugin
+        hostPath:
+          path: /var/lib/kubelet/device-plugins
diff --git a/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2 b/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2
index 878fb883..905d6926 100644
--- a/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2
+++ b/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2
@@ -69,6 +69,11 @@ spec:
       nodeSelector:
         smarter-device-manager: enabled
         smarter-device-manager/configmap: jetson
+      tolerations:
+        - key: "{{ k3s_worker_taint.key }}"
+          operator: "Equal"
+          value: "{{ k3s_worker_taint.value }}"
+          effect: "{{ k3s_worker_taint.effect }}"
       priorityClassName: "system-node-critical"
       hostname: smarter-device-management
       hostNetwork: true
diff --git a/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2 b/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2
index f90551ef..ac6aa125 100644
--- a/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2
+++ b/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2
@@ -67,6 +67,11 @@ spec:
       nodeSelector:
         smarter-device-manager: enabled
         smarter-device-manager/configmap: rpi
+      tolerations:
+        - key: "{{ k3s_worker_taint.key }}"
+          operator: "Equal"
+          value: "{{ k3s_worker_taint.value }}"
+          effect: "{{ k3s_worker_taint.effect }}"
       priorityClassName: "system-node-critical"
       hostname: smarter-device-management
       hostNetwork: true