k3s: tolerate the worker-node taint in the device-plugin DaemonSets.

Adds k3s_worker_taint (taken from worker_taint) and a matching toleration to
the NVIDIA device plugin and to both smarter-device-manager DaemonSets. The
NVIDIA manifest is added as a Jinja2 template so the toleration can be
rendered from that variable.

The device-plugin task switches from `src:` to `template:`. kubernetes.core.k8s
treats the two options as mutually exclusive, so the old `src:` line is
removed rather than kept next to the new `template:` line.

NOTE(review): leading whitespace and blank context lines in the hunks below
were reconstructed from a whitespace-collapsed copy of this patch. Confirm
they match the target files (or apply with `git apply --ignore-whitespace`).
The post-image blob hash on the config-device-plugins.yml `index` line
predates the `src:` removal and is therefore stale.

diff --git a/roles/k3s/defaults/main.yml b/roles/k3s/defaults/main.yml
index 8271af80..9ab4e955 100644
--- a/roles/k3s/defaults/main.yml
+++ b/roles/k3s/defaults/main.yml
@@ -5,6 +5,7 @@ k3s_server_location: /var/lib/rancher/k3s
 k3s_conf_location: /etc/rancher/k3s
 k3s_systemd_dir: /etc/systemd/system
 k3s_server_ip: "{{ kolla_external_vip_address }}"
+k3s_worker_taint: "{{ worker_taint }}"
 k3s_extra_server_args: ""

 k3s_dry_run: no
diff --git a/roles/k3s/tasks/config-device-plugins.yml b/roles/k3s/tasks/config-device-plugins.yml
index 967ab234..285574c4 100644
--- a/roles/k3s/tasks/config-device-plugins.yml
+++ b/roles/k3s/tasks/config-device-plugins.yml
@@ -7,6 +7,6 @@
   kubernetes.core.k8s:
     state: present
-    src: nvidia-device-plugin.yaml
+    template: "nvidia-device-plugin.yaml.j2"
     apply: yes
   when:
     - not (k3s_dry_run | bool)
diff --git a/roles/k3s/templates/nvidia-device-plugin.yaml.j2 b/roles/k3s/templates/nvidia-device-plugin.yaml.j2
new file mode 100644
index 00000000..ce5b6b87
--- /dev/null
+++ b/roles/k3s/templates/nvidia-device-plugin.yaml.j2
@@ -0,0 +1,74 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      # This annotation is deprecated. Kept here for backward compatibility
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      nodeSelector:
+        "nvidia-device-plugin/enabled": "true"
+      tolerations:
+        # This toleration is deprecated. Kept here for backward compatibility
+        # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+        - key: CriticalAddonsOnly
+          operator: Exists
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+        - key: "{{ k3s_worker_taint.key }}"
+          operator: "Equal"
+          value: "{{ k3s_worker_taint.value }}"
+          effect: "{{ k3s_worker_taint.effect }}"
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      containers:
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.1
+          name: nvidia-device-plugin-ctr
+          env:
+            - name: FAIL_ON_INIT_ERROR
+              value: "false"
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: all
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2 b/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2
index 878fb883..905d6926 100644
--- a/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2
+++ b/roles/k3s/templates/smarter-device-manager-ds-jetson.yaml.j2
@@ -69,6 +69,11 @@ spec:
       nodeSelector:
         smarter-device-manager: enabled
         smarter-device-manager/configmap: jetson
+      tolerations:
+        - key: "{{ k3s_worker_taint.key }}"
+          operator: "Equal"
+          value: "{{ k3s_worker_taint.value }}"
+          effect: "{{ k3s_worker_taint.effect }}"
       priorityClassName: "system-node-critical"
       hostname: smarter-device-management
       hostNetwork: true
diff --git a/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2 b/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2
index f90551ef..ac6aa125 100644
--- a/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2
+++ b/roles/k3s/templates/smarter-device-manager-ds-rpi.yaml.j2
@@ -67,6 +67,11 @@ spec:
       nodeSelector:
         smarter-device-manager: enabled
         smarter-device-manager/configmap: rpi
+      tolerations:
+        - key: "{{ k3s_worker_taint.key }}"
+          operator: "Equal"
+          value: "{{ k3s_worker_taint.value }}"
+          effect: "{{ k3s_worker_taint.effect }}"
       priorityClassName: "system-node-critical"
       hostname: smarter-device-management
       hostNetwork: true