Skip to content

Commit c61e442

Browse files
authored
Merge pull request #37 from fgci-org/cuda-run-installation
Cuda runfile installation
2 parents 68d0f31 + f1fefb7 commit c61e442

11 files changed

+295
-3
lines changed

.yamllint

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
---
# yamllint settings for this role, based on the ansible-lint configuration.
extends: default

rules:
  # Flow collections: allow at most one space inside {} and [].
  braces: {max-spaces-inside: 1, level: error}
  brackets: {max-spaces-inside: 1, level: error}
  # max-spaces-after is set to -1 for ':' and ','.
  colons: {max-spaces-after: -1, level: error}
  commas: {max-spaces-after: -1, level: error}
  comments: disable
  comments-indentation: disable
  document-start: disable
  empty-lines: {max: 3, level: error}
  hyphens: {level: error}
  indentation: disable
  key-duplicates: enable
  line-length: disable
  new-line-at-end-of-file: disable
  new-lines: {type: unix}
  trailing-spaces: disable
  truthy: disable

defaults/main.yml

+7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@ cuda_repo_url: "http://developer.download.nvidia.com/compute/cuda/repos/"
77
cuda_rpm_key_path: /etc/rpm/nvidia_packaging_key.asc
88
cuda_packages:
99
- cuda
10+
cuda_use_runfile: False
11+
cuda_runfile_url: "https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run"
12+
cuda_runfile_driver: True
13+
cuda_runfile_toolkit: True
14+
cuda_runfile_download: True
15+
cuda_runfile_remove: True
16+
cuda_runfile_disable_nvidia_drm: False
1017
cuda_restart_node_on_install: True
1118
cuda_init: True
1219
cuda_init_restart_service: True

files/blacklist-nouveau.conf

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# modprobe.d configuration: blacklists the nouveau module and sets its
# modeset option to 0.
blacklist nouveau
2+
options nouveau modeset=0

files/nvidia-persistenced.service

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# systemd unit for the NVIDIA persistence daemon (nvidia-persistenced).
[Unit]
Description=NVIDIA Persistence Daemon
After=syslog.target

[Service]
# Forking service; the main process is identified through the PID file.
Type=forking
PIDFile=/var/run/nvidia-persistenced/nvidia-persistenced.pid
Restart=always
ExecStart=/usr/bin/nvidia-persistenced --verbose
# systemd does not expand shell globs in Exec* command lines, so the cleanup
# is run through a shell; without it rm receives a literal "*" and, because
# of -f, silently removes nothing.
ExecStopPost=/bin/sh -c '/bin/rm -rf /var/run/nvidia-persistenced/*'
TimeoutSec=300

[Install]
WantedBy=multi-user.target

molecule/default/INSTALL.rst

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
********************************
2+
Docker driver installation guide
3+
********************************
4+
5+
Requirements
6+
============
7+
8+
* Docker Engine
9+
10+
Install
11+
=======
12+
13+
Please refer to the `Virtual environment`_ documentation for installation best
14+
practices. If not using a virtual environment, please consider passing the
15+
widely recommended `'--user' flag`_ when invoking ``pip``.
16+
17+
.. _Virtual environment: https://virtualenv.pypa.io/en/latest/
18+
.. _'--user' flag: https://packaging.python.org/tutorials/installing-packages/#installing-to-the-user-site
19+
20+
.. code-block:: bash
21+
22+
$ python3 -m pip install 'molecule[docker]'

molecule/default/converge.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
# Molecule converge playbook: applies the role under test to every instance.
- name: Converge ansible-role-cuda
  hosts: all
  tasks:
    - name: "Include ansible-role-cuda"
      include_role: {name: "ansible-role-cuda"}

molecule/default/molecule.yml

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
---
# Molecule scenario with two CentOS 7 containers: one installs CUDA from the
# package repository, the other from the runfile.
dependency:
  name: galaxy

driver:
  name: docker

platforms:
  - name: centos7_cuda_repo
    image: docker.io/pycontribs/centos:7
    pre_build_image: true
    tmpfs: [/run]
    volumes:
      - /tmp/centos7_cuda_repo:/tmp:rw

  - name: centos7_cuda_run
    image: docker.io/pycontribs/centos:7
    pre_build_image: true
    tmpfs: [/run]
    volumes:
      - /tmp/centos7_cuda_run:/tmp:rw

provisioner:
  name: ansible
  inventory:
    group_vars:
      all:
        gpu: true
        # The kernel to check kernel modules against
        cuda_driver_kernel_version: 3.10.0-1160.21.1.el7.x86_64
    host_vars:
      centos7_cuda_repo:
        cuda_packages: [cuda-libraries-11-2]
        cuda_restart_node_on_install: false
        cuda_init: false
        cuda_init_restart_service: false
      centos7_cuda_run:
        cuda_use_runfile: true
        # Docker has a different kernel than the image's kernel-headers
        cuda_runfile_driver: true
        # Keep the installer in /tmp/centos7_cuda_run for multiple runs
        cuda_runfile_remove: false
        cuda_restart_node_on_install: false
        cuda_init: false
        cuda_init_restart_service: false

verifier:
  name: ansible

lint: |
  set -e
  yamllint .
  ansible-lint
  flake8

molecule/default/verify.yml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
---
2+
# This is an example playbook to execute Ansible tests.
3+
4+
# Every instance must end up with the versioned CUDA directory.
- name: Verify CUDA toolkit installation
  hosts: all
  tasks:
    - name: Check that CUDA has been installed
      stat: {path: /usr/local/cuda-11.2}
      register: cuda_path_check

    - name: Verify that CUDA folder exists
      assert:
        that:
          - cuda_path_check.stat.exists
14+
# Only the runfile instance installs the driver, so only it is checked for
# the kernel module.
- name: Verify NVIDIA driver kernel modules
  hosts: centos7_cuda_run
  tasks:
    - name: Check that NVIDIA kernel module has been installed
      find:
        path: "/lib/modules/{{ cuda_driver_kernel_version }}"
        patterns: nvidia.ko
        # Search subdirectories as well, the same way the role's own check
        # does; a non-recursive find only inspects the top level of the
        # modules directory.
        recurse: true
      register: nvidia_module_find

    - name: Verify that kernel module exists
      assert:
        that: nvidia_module_find.matched > 0

tasks/install_runfile.yml

+124
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
---
2+
3+
# yum/dnf hosts: install the packages listed in cuda_runfile_packages.
- name: "Ensure kernel headers are installed (yum)"
  when: ansible_pkg_mgr in ["yum", "dnf"]
  yum:
    state: present
    name: "{{ cuda_runfile_packages }}"
8+
9+
# apt hosts: install kernel headers and the compiler toolchain. This task
# only runs when the package manager is apt, so it has to use the apt module.
- name: "Ensure kernel headers are installed (apt)"
  apt:
    name:
      - linux-headers-generic
      - build-essential
    state: present
  when: ansible_pkg_mgr == "apt"
16+
17+
# Install the nouveau blacklist file into modprobe.d.
- name: "Disable nouveau"
  copy:
    dest: "/etc/modprobe.d/blacklist-nouveau.conf"
    src: "blacklist-nouveau.conf"
21+
22+
# The installer file name is the last path component of cuda_runfile_url.
- name: "Register installer name"
  set_fact: {cuda_runfile_sh: "{{ cuda_runfile_url | basename }}"}
25+
26+
# Query the running kernel release; the result is the fallback for
# cuda_driver_kernel_version.
- name: "Determine running kernel"
  command: uname -r
  register: cuda_driver_kernel_running
  # Read-only query: never report this task as a change.
  changed_when: false
29+
30+
# Keep a supplied cuda_driver_kernel_version; fall back to the running kernel
# when the variable is undefined or empty (second argument of default()).
- name: "Determine kernel version"
  set_fact:
    cuda_driver_kernel_version: >-
      {{ cuda_driver_kernel_version | default(cuda_driver_kernel_running.stdout, true) }}
33+
34+
# Look for nvidia.ko anywhere below the module tree of the target kernel.
- name: "Check NVIDIA kernel module"
  register: cuda_driver_kernel_module_find
  find:
    recurse: true
    patterns: nvidia.ko
    path: "/lib/modules/{{ cuda_driver_kernel_version }}"
40+
41+
# The presence of /usr/local/cuda is used as the toolkit-installed marker.
- name: "Check CUDA toolkit path"
  register: cuda_toolkit_path
  stat: {path: /usr/local/cuda}
45+
46+
# Turn the two probe results into facts used by the conditions below.
- name: "Determine if driver and toolkit are installed"
  set_fact:
    cuda_toolkit_installed: "{{ cuda_toolkit_path.stat.exists }}"
    cuda_driver_installed: "{{ cuda_driver_kernel_module_find.matched > 0 }}"
50+
51+
# Report what was detected before anything is installed.
- name: "Print information about installed features"
  debug:
    msg: [
      "Driver installed: {{ cuda_driver_installed }}",
      "Toolkit installed: {{ cuda_toolkit_installed }}",
    ]
56+
57+
# Working directory that receives the installer and its extracted contents.
- name: "Create temporary directory for runfile"
  file: {path: /tmp/cuda_runfile, state: directory}
61+
62+
# Fetch the installer only if the toolkit and/or the driver still has to be
# installed.
- name: "Obtain runfile"
  when: >-
    (cuda_runfile_toolkit and not cuda_toolkit_installed) or
    (cuda_runfile_driver and not cuda_driver_installed)
  block:

    # cuda_runfile_download is false: copy an installer that is already
    # available to the copy module.
    - name: "Copy pre-downloaded runfile"
      when: not cuda_runfile_download
      copy:
        dest: /tmp/cuda_runfile
        src: "{{ cuda_runfile_sh }}"

    # cuda_runfile_download is true: download from cuda_runfile_url.
    - name: "Download runfile"
      when: cuda_runfile_download
      get_url:
        dest: "/tmp/cuda_runfile/{{ cuda_runfile_sh }}"
        url: "{{ cuda_runfile_url }}"
79+
80+
# Toolkit-only installation; skipped when /usr/local/cuda already exists.
- name: "Run installer for toolkit"
  when: cuda_runfile_toolkit and not cuda_toolkit_installed
  register: cuda_toolkit_install_out
  command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --silent --toolkit
84+
85+
# Extract the CUDA runfile, locate the NVIDIA*.run installer inside it and
# run that installer for the selected kernel. Skipped when the kernel module
# was already found.
- name: "Install driver"
  when: cuda_runfile_driver and not cuda_driver_installed
  block:

    - name: "Extract installer for driver installation"
      command: bash /tmp/cuda_runfile/{{ cuda_runfile_sh }} --extract=/tmp/cuda_runfile

    - name: "Find NVIDIA runtime"
      register: cuda_driver_runfile_find
      find:
        patterns: "NVIDIA*.run"
        paths: /tmp/cuda_runfile

    # The first match is used as the driver installer.
    - name: "Set NVIDIA runfile path"
      set_fact: {cuda_driver_runfile: "{{ cuda_driver_runfile_find.files[0].path }}"}

    - name: "Print information about driver"
      debug:
        msg: "Building driver {{ cuda_driver_runfile }} for kernel {{ cuda_driver_kernel_version }}"

    # --no-drm is appended only when cuda_runfile_disable_nvidia_drm is set.
    - name: "Install driver"
      command: >
        bash {{ cuda_driver_runfile }} --silent
        --kernel-name={{ cuda_driver_kernel_version }}
        --kernel-source-path=/usr/src/kernels/{{ cuda_driver_kernel_version }}
        {{ "--no-drm" if cuda_runfile_disable_nvidia_drm else "" }}

    - name: "Install nvidia-persistenced systemd-file"
      when: cuda_init_persistence_mode | bool
      copy:
        dest: /etc/systemd/system/nvidia-persistenced.service
        src: files/nvidia-persistenced.service
119+
120+
# Clean up the working directory unless the caller asked to keep it.
- name: "Remove installer"
  when: cuda_runfile_remove
  file: {path: /tmp/cuda_runfile, state: absent}

tasks/main.yml

+6-2
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,21 @@
1010

1111
- block:
1212
- include_tasks: configure_yum.yml
13-
when: ansible_pkg_mgr in ['yum', 'dnf']
13+
when: ansible_pkg_mgr in ['yum', 'dnf'] and not cuda_use_runfile
1414

1515
- include_tasks: configure_apt.yml
16-
when: ansible_pkg_mgr == 'apt'
16+
when: ansible_pkg_mgr == 'apt' and not cuda_use_runfile
17+
18+
- include_tasks: install_runfile.yml
19+
when: cuda_use_runfile
1720

1821
- name: Install CUDA packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True)
1922
package:
2023
name: "{{ item }}"
2124
state: present
2225
with_items: "{{ cuda_packages }}"
2326
register: cuda_packages_installation
27+
when: not cuda_use_runfile
2428
notify:
2529
- ZZ CUDA Restart server
2630
- ZZ CUDA Wait for server to restart

vars/centos-7.yml

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
---
22
cuda_repo_subfolder: rhel7
33

4-
# vim:ft=ansible:
4+
cuda_runfile_packages:
5+
- kernel-devel
6+
- "@Development tools"
7+
- which
8+
9+
10+
# vim:ft=ansible:

0 commit comments

Comments
 (0)