From cc549d97b772185d70bbeabc63cb88e102851ee0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joonhyung=20Lee/=EC=9D=B4=EC=A4=80=ED=98=95?=
 <33523965+veritas9872@users.noreply.github.com>
Date: Mon, 10 Apr 2023 18:12:48 +0900
Subject: [PATCH] Tests and versions. (#116)

* Create a conftest.py file to allow the number of iterations in the inference
test to be configurable. This allows the tests to be used as inference speed
benchmarks, as originally intended. The allow_tf32 flags are now logged as
well.

* Add `tmux` to the `simple` service's default environment.

* Make the GPU used by the tests configurable as well.

* Add an explanation of how to run the configurable test for inference speed
comparison.

* Fix typo.

* Update the installed Docker Compose version.

* Update pytest minimum version to 7.3.0, which fixes the walrus operator bug.

* Remove unnecessary import from conftest.py.

* Reformat project.

* Update documentation to mention that runtime speeds will probably be similar
for `conda` installs and source builds. I have confirmed that the `conda`
installs use `cuDNN` and probably `magma` properly. The speeds were identical
on the hardware I tested.

* Update all Docker BuildKit frontend versions to simply `1`, which will use
the latest BuildKit syntax until the next major release.
---
 Dockerfile                    |  2 +-
 Makefile                      |  2 +-
 README.md                     | 19 ++++++++++---------
 dockerfiles/hub.Dockerfile    |  2 +-
 dockerfiles/ngc.Dockerfile    |  2 +-
 dockerfiles/simple.Dockerfile |  2 +-
 pyproject.toml                |  4 ++--
 reqs/simple-environment.yaml  |  3 ++-
 tests/README.md               |  5 +++++
 tests/conftest.py             |  3 +++
 tests/test_run.py             | 20 +++++++++++++++++---
 11 files changed, 44 insertions(+), 20 deletions(-)
 create mode 100644 tests/conftest.py

diff --git a/Dockerfile b/Dockerfile
index 705c498..301b821 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-# syntax = docker/dockerfile:1.4
+# syntax = docker/dockerfile:1
 # The top line is used by BuildKit. _**DO NOT ERASE IT**_.
 
 # Use `export BUILDKIT_PROGRESS=plain` in the host terminal to see full build logs.
diff --git a/Makefile b/Makefile
index 6026073..c15efe6 100644
--- a/Makefile
+++ b/Makefile
@@ -120,7 +120,7 @@ ls: # List all services.
 
 # Utility for installing Docker Compose on Linux (but not WSL) systems.
 # Visit https://docs.docker.com/compose/install for the full documentation.
-COMPOSE_VERSION = v2.15.1
+COMPOSE_VERSION = v2.17.2
 COMPOSE_OS_ARCH = linux-x86_64
 COMPOSE_URL = https://github.com/docker/compose/releases/download/${COMPOSE_VERSION}/docker-compose-${COMPOSE_OS_ARCH}
 COMPOSE_PATH = ${HOME}/.docker/cli-plugins
diff --git a/README.md b/README.md
index 027d915..d7a76f4 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ If this is your first time using this project, follow these steps:
    for the latest installation information.
    Note that Docker Compose V2 is available for WSL users with Docker Desktop by default.
 
-4. Run `make env SERVICE=(train|devel|ngc|hub|simple)` on the terminal 
+4. Run `make env SERVICE=(train|devel|ngc|hub|simple)` on the terminal
    at project root to create a basic `.env` file.
    The `.env` file provides environment variables for `docker-compose.yaml`,
    allowing different users and machines to set their own variables as required.
@@ -67,17 +67,17 @@ If this is your first time using this project, follow these steps:
    Add configurations that should not be shared via source control there.
    For example, volume-mount pairs specific to each host machine.
 
-### Explanation of services 
+### Explanation of services
 
 Different Docker Compose services are organized to serve different needs.
 
 - `train`, the default service, should be used when compiled dependencies are
-  necessary or when PyTorch needs to be compiled from source due to 
+  necessary or when PyTorch needs to be compiled from source due to
   Compute Capability issues, etc.
-- `devel` is designed for PyTorch CUDA/C++ developers who need to recompile 
+- `devel` is designed for PyTorch CUDA/C++ developers who need to recompile
   frequently and have many complex dependencies.
 - `ngc` is derived from the official NVIDIA PyTorch HPC images with the option
-  to install additional packages. It is recommended for users who wish to base 
+  to install additional packages. It is recommended for users who wish to base
   their projects on the NGC images provided by NVIDIA.
   Note that the NGC images change greatly between different releases
   and that configurations for one release may not work for another one.
@@ -91,7 +91,8 @@ Different Docker Compose services are organized to serve different needs.
   `pip` packages can also be installed via `conda`. Also, the base image can
   be configured to use images other than the Official Linux Docker images by
   specifying the `BASE_IMAGE` argument directly in the `.env` file.
-  PyTorch runtime performance may be superior in official NVIDIA CUDA images.
+  PyTorch runtime performance may be superior in official NVIDIA CUDA images
+  under certain circumstances. Use the tests to benchmark runtime speeds.
 
 **The `simple` service is recommended for users without compiled dependencies.**
 The `Makefile` has been configured to take values specified in the `.env` file
@@ -250,7 +251,7 @@ Please read the Makefile to see the exact commands.
   To fix this issue, create a new directory on the host to mount the containers' `.vscode-server` directories.
   For example, one can set a volume pair as `${HOME}/.vscode-project1:/home/${USR}/.vscode-server` for project1.
   Do not forget to create `${HOME}/.vscode-project1` on the host first. Otherwise, the directory will be owned by `root`,
-  which will cause VSCode to stall indefinately.
+  which will cause VSCode to stall indefinitely.
 
 - If any networking issues arise, check `docker network ls` and check for conflicts.
   Most networking and SSH problems can be solved by running `docker network prune`.
@@ -261,7 +262,7 @@ The main components of the project are as follows. The other files are utilities
 1. Dockerfile
 2. docker-compose.yaml
 3. docker-compose.override.yaml
-4. reqs/\*requirements.txt
+4. reqs/(`*requirements.txt`|`*environment.yaml`)
 5. .env
 
 When the user inputs `make up` or another `make` command,
@@ -497,7 +498,7 @@ For other VSCode problems, try deleting `~/.vscode-server` on the host.
    [not fail-safe](https://stackoverflow.com/a/8573310/9289275).
 
 6. `torch.cuda.is_available()` will return a `... UserWarning:
-   CUDA initialization:...` error or the image will simply not start if
+CUDA initialization:...` error or the image will simply not start if
    the CUDA driver on the host is incompatible with the CUDA version on
    the Docker image. Either upgrade the host CUDA driver or downgrade
    the CUDA version of the image. Check the
diff --git a/dockerfiles/hub.Dockerfile b/dockerfiles/hub.Dockerfile
index a473ee1..d0b7b29 100644
--- a/dockerfiles/hub.Dockerfile
+++ b/dockerfiles/hub.Dockerfile
@@ -1,4 +1,4 @@
-# syntax = docker/dockerfile:1.4
+# syntax = docker/dockerfile:1
 # The top line is used by BuildKit. _**DO NOT ERASE IT**_.
 ARG PYTORCH_VERSION
 ARG CUDA_SHORT_VERSION
diff --git a/dockerfiles/ngc.Dockerfile b/dockerfiles/ngc.Dockerfile
index 8a52de8..ead4abb 100644
--- a/dockerfiles/ngc.Dockerfile
+++ b/dockerfiles/ngc.Dockerfile
@@ -1,4 +1,4 @@
-# syntax = docker/dockerfile:1.4
+# syntax = docker/dockerfile:1
 # The top line is used by BuildKit. _**DO NOT ERASE IT**_.
 
 ARG INTERACTIVE_MODE
diff --git a/dockerfiles/simple.Dockerfile b/dockerfiles/simple.Dockerfile
index 6da531b..4f82ad6 100644
--- a/dockerfiles/simple.Dockerfile
+++ b/dockerfiles/simple.Dockerfile
@@ -1,4 +1,4 @@
-# syntax = docker/dockerfile:1.4
+# syntax = docker/dockerfile:1
 # The top line is used by BuildKit. _**DO NOT ERASE IT**_.
 
 # This Dockerfile exists to provide a method of installing all packages from
diff --git a/pyproject.toml b/pyproject.toml
index 796cbf6..592fc8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ target-version = ['py38', 'py39', 'py310']
 include = '\.pyi?$'
 
 [tool.pytest.ini_options]
-minversion = "7.0" # Update to 7.2.3 as soon as it becomes available.
+minversion = "7.3.0"
 addopts = """\
     --capture=tee-sys \
     --doctest-modules \
@@ -85,7 +85,7 @@ max-doc-length = 80
 [tool.ruff.per-file-ignores]
 # Ignore `E402` (import violations) in all `__init__.py` files.
 "__init__.py" = ["E402"]
-"*test*.py" = ["D"] # ignore all docstring lints in tests
+"*test*.py" = ["D"] # Ignore all docstring lints in tests.
 
 [tool.ruff.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
diff --git a/reqs/simple-environment.yaml b/reqs/simple-environment.yaml
index 060c53c..19689bf 100644
--- a/reqs/simple-environment.yaml
+++ b/reqs/simple-environment.yaml
@@ -10,6 +10,7 @@ dependencies: # Use conda packages if possible.
   - pytorch::pytorch-cuda==11.8
   - jemalloc
   - intel::mkl
-  - intel::numpy # Use Numpy built with the Intel compiler for best performance with MKL.
+  - intel::numpy  # Use Numpy built with the Intel compiler for best performance with MKL.
   - pytest
+  - tmux==3.2a
   - tqdm
diff --git a/tests/README.md b/tests/README.md
index 327e2f4..2edb533 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -5,3 +5,8 @@
 PyTest is the recommended testing platform.
 Simple unit tests should preferably be written as doctests,
 with more advanced tests being placed in this directory.
+
+To use the `test_run.py` file as an inference speed benchmark, which was its
+original purpose, use the following command to run 1024 iterations on GPU 0:
+
+`python -m pytest tests/test_run.py::test_inference_run --gpu 0 --num_steps 1024`
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..32cab5c
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,3 @@
+def pytest_addoption(parser):
+    parser.addoption("--num_steps", type=int, action="store", default=64)
+    parser.addoption("--gpu", type=int, action="store", default=0)
diff --git a/tests/test_run.py b/tests/test_run.py
index 5848f94..c601af6 100644
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -44,10 +44,16 @@ def enable_cudnn_benchmarking():
     torch.backends.cudnn.benchmark = True
 
 
+@pytest.fixture(scope="session", autouse=True)
+def allow_tf32():
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+
+
 @pytest.fixture(scope="session")
-def device(gpu: int = 0) -> torch.device:
+def device(pytestconfig) -> torch.device:
     if torch.cuda.is_available():
-        device = torch.device(f"cuda:{int(gpu)}")
+        device = torch.device(f"cuda:{pytestconfig.getoption('gpu')}")
     else:
         device = torch.device("cpu")
         msg = "No GPUs found for this container. Please check run configurations."
@@ -77,13 +83,18 @@ class Config(NamedTuple):
 ]
 
 
+@pytest.fixture(scope="session")
+def num_steps(pytestconfig):
+    return pytestconfig.getoption("num_steps")
+
+
 @pytest.mark.parametrize(["name", "network_func", "input_shapes"], _configs)
 def test_inference_run(
     name: str,
     network_func: Callable[[], nn.Module],
     input_shapes: Sequence[Sequence[int]],
     device: torch.device,
-    num_steps: int = 64,
+    num_steps,
     enable_amp: bool = False,
     enable_scripting: bool = False,
 ):
@@ -153,6 +164,9 @@ def get_cuda_info(device): # Using as a fixture to get device info.
     logger.info(f"PyTorch Architecture List: {al}")
     logger.info(f"GPU Device Name: {dp.name}")
     logger.info(f"GPU Compute Capability: {dp.major}.{dp.minor}")
+    # No way to check if the GPU has TF32 hardware, only whether it is allowed.
+    logger.info(f"MatMul TF32 Allowed: {torch.backends.cuda.matmul.allow_tf32}")
+    logger.info(f"cuDNN TF32 Allowed: {torch.backends.cudnn.allow_tf32}")
 
     # Python3.7+ required for `subprocess` to work as intended.
     if int(platform.python_version_tuple()[1]) > 6:
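
As a reference for reviewers, the option-plumbing introduced above can be tried
outside this repository with a minimal, self-contained sketch. The file names
and the toy test below are hypothetical illustrations of `pytest_addoption`
combined with the built-in `pytestconfig` fixture; the CPU fallback mirrors the
`device` fixture in tests/test_run.py. They are not part of the applied patch.

    # conftest.py (hypothetical standalone example)
    def pytest_addoption(parser):
        # Register custom options. Pytest exposes them via `config.getoption`.
        parser.addoption("--num_steps", type=int, action="store", default=64)
        parser.addoption("--gpu", type=int, action="store", default=0)

    # test_example.py (hypothetical standalone example)
    import pytest
    import torch

    @pytest.fixture(scope="session")
    def device(pytestconfig) -> torch.device:
        # Fall back to the CPU when no GPU is visible, as in tests/test_run.py.
        if torch.cuda.is_available():
            return torch.device(f"cuda:{pytestconfig.getoption('gpu')}")
        return torch.device("cpu")

    def test_matmul_steps(pytestconfig, device):
        # Run `--num_steps` small matrix multiplications on the chosen device.
        num_steps = pytestconfig.getoption("num_steps")
        x = torch.ones(8, 8, device=device)
        for _ in range(num_steps):
            x = x @ x / 8  # The division keeps values bounded across steps.
        assert torch.isfinite(x).all()

For example, `python -m pytest test_example.py --gpu 0 --num_steps 1024` would
run the toy test for 1024 iterations on GPU 0, matching the usage of the real
benchmark documented in tests/README.md.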