Skip to content

Commit

Permalink
Merge branch 'branch-24.10' into fea-nvcomp-4
Browse files Browse the repository at this point in the history
  • Loading branch information
bdice authored Sep 3, 2024
2 parents d2f9117 + 0e9ebcb commit 8a7da7a
Show file tree
Hide file tree
Showing 15 changed files with 213 additions and 69 deletions.
10 changes: 7 additions & 3 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ VALIDARGS="clean libkvikio kvikio -v -g -n --pydevelop -h"
HELP="$0 [clean] [libkvikio] [kvikio] [-v] [-g] [-n] [--cmake-args=\"<args>\"] [-h]
clean - remove all existing build artifacts and configuration (start over)
libkvikio - build and install the libkvikio C++ code
kvikio - build and install the kvikio Python package
kvikio - build and install the kvikio Python package (requires libkvikio)
-v - verbose build mode
-g - build for debug
-n - no install step
Expand All @@ -32,7 +32,7 @@ HELP="$0 [clean] [libkvikio] [kvikio] [-v] [-g] [-n] [--cmake-args=\"<args>\"] [
default action (no args) is to build and install 'libkvikio' and 'kvikio' targets
"
LIBKVIKIO_BUILD_DIR=${LIBKVIKIO_BUILD_DIR:=${REPODIR}/cpp/build}
KVIKIO_BUILD_DIR="${REPODIR}/python/build ${REPODIR}/python/_skbuild"
KVIKIO_BUILD_DIR="${REPODIR}/python/kvikio/build/"
BUILD_DIRS="${LIBKVIKIO_BUILD_DIR} ${KVIKIO_BUILD_DIR}"

# Set defaults for vars modified by flags to this script
Expand Down Expand Up @@ -111,10 +111,14 @@ fi
# Process flags
if hasArg -v; then
VERBOSE_FLAG=-v
export SKBUILD_BUILD_VERBOSE=true
export SKBUILD_LOGGING_LEVEL=INFO
set -x
fi
if hasArg -g; then
BUILD_TYPE=Debug
export SKBUILD_INSTALL_STRIP=false
export SKBUILD_CMAKE_BUILD_TYPE=Debug
fi
if hasArg -n; then
INSTALL_TARGET=""
Expand Down Expand Up @@ -150,7 +154,7 @@ if (( NUMARGS == 0 )) || hasArg libkvikio; then
cmake --build "${LIBKVIKIO_BUILD_DIR}" -j${PARALLEL_LEVEL} ${VERBOSE_FLAG}
if [[ ${INSTALL_TARGET} != "" ]]; then
echo "installing libkvikio..."
cmake --build "${LIBKVIKIO_BUILD_DIR}" --target install -v ${VERBOSE_FLAG}
cmake --build "${LIBKVIKIO_BUILD_DIR}" --target install ${VERBOSE_FLAG}
fi
fi

Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/kvikio/build.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
# Copyright (c) 2018-2024, NVIDIA CORPORATION.

./build.sh kvikio
./build.sh -v kvikio
4 changes: 2 additions & 2 deletions conda/recipes/libkvikio/build.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

./build.sh -n libkvikio
./build.sh -v -n libkvikio
16 changes: 15 additions & 1 deletion cpp/doxygen/main_page.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,28 @@ Set the environment variable `KVIKIO_COMPAT_MODE` to enable/disable compatibilit
- when running in Windows Subsystem for Linux (WSL).
- when `/run/udev` isn't readable, which typically happens when running inside a docker image not launched with `--volume /run/udev:/run/udev:ro`.

This setting can also be controlled by `defaults::compat_mode()` and `defaults::compat_mode_reset()`.


#### Thread Pool (KVIKIO_NTHREADS)
KvikIO can use multiple threads for IO automatically. Set the environment variable `KVIKIO_NTHREADS` to the number of threads in the thread pool. If not set, the default value is 1.

This setting can also be controlled by `defaults::thread_pool_nthreads()` and `defaults::thread_pool_nthreads_reset()`.

#### Task Size (KVIKIO_TASK_SIZE)
KvikIO splits parallel IO operations into multiple tasks. Set the environment variable `KVIKIO_TASK_SIZE` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB).

This setting can also be controlled by `defaults::task_size()` and `defaults::task_size_reset()`.

#### GDS Threshold (KVIKIO_GDS_THRESHOLD)
In order to improve performance of small IO, `.pread()` and `.pwrite()` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. Set the environment variable `KVIKIO_GDS_THRESHOLD` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB).
To improve performance of small IO requests, `.pread()` and `.pwrite()` implement a shortcut that circumvents the threadpool and uses the POSIX backend directly. Set the environment variable `KVIKIO_GDS_THRESHOLD` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB).

This setting can also be controlled by `defaults::gds_threshold()` and `defaults::gds_threshold_reset()`.

#### Size of the Bounce Buffer (KVIKIO_GDS_THRESHOLD)
KvikIO might have to use intermediate host buffers (one per thread) when copying between files and device memory. Set the environment variable ``KVIKIO_BOUNCE_BUFFER_SIZE`` to the size (in bytes) of these "bounce" buffers. If not set, the default value is 16777216 (16 MiB).

This setting can also be controlled by `defaults::bounce_buffer_size()` and `defaults::bounce_buffer_size_reset()`.


## Example
Expand Down
70 changes: 49 additions & 21 deletions cpp/include/kvikio/bounce_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,21 @@ class AllocRetain {
// The size of each allocation in `_free_allocs`
std::size_t _size{defaults::bounce_buffer_size()};

public:
/**
* @brief An host memory allocation
*/
class Alloc {
private:
AllocRetain* _manager;
void* _alloc;
const std::size_t _size;
std::size_t const _size;

public:
Alloc(AllocRetain* manager, void* alloc, std::size_t size)
: _manager(manager), _alloc{alloc}, _size{size}
{
}
Alloc(const Alloc&) = delete;
Alloc(Alloc const&) = delete;
Alloc& operator=(Alloc const&) = delete;
Alloc(Alloc&& o) = delete;
Alloc& operator=(Alloc&& o) = delete;
Expand All @@ -61,29 +60,49 @@ class AllocRetain {
};

AllocRetain() = default;
~AllocRetain() noexcept
{
try {
clear();
} catch (const CUfileException& e) {
std::cerr << "~AllocRetain(): " << e.what() << std::endl;
}
}

void clear()
// Notice, we do not clear the allocations at destruction thus the allocations leaks
// at exit. We do this because `AllocRetain::instance()` stores the allocations in a
// static stack that are destructed below main, which is not allowed in CUDA:
// <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization>
~AllocRetain() noexcept = default;

/**
* @brief Free all retained allocations
*
* NB: The `_mutex` must be taken prior to calling this function.
*
* @return The number of bytes cleared
*/
std::size_t _clear()
{
std::size_t ret = _free_allocs.size() * _size;
while (!_free_allocs.empty()) {
CUDA_DRIVER_TRY(cudaAPI::instance().MemFreeHost(_free_allocs.top()));
_free_allocs.pop();
}
return ret;
}

[[nodiscard]] Alloc get()
/**
* @brief Ensure the sizes of the retained allocations match `defaults::bounce_buffer_size()`
*
* NB: `_mutex` must be taken prior to calling this function.
*/
void _ensure_alloc_size()
{
const std::lock_guard lock(_mutex);
if (_size != defaults::bounce_buffer_size()) {
clear(); // the desired allocation size has changed.
auto const bounce_buffer_size = defaults::bounce_buffer_size();
if (_size != bounce_buffer_size) {
_clear();
_size = bounce_buffer_size;
}
}

public:
[[nodiscard]] Alloc get()
{
std::lock_guard const lock(_mutex);
_ensure_alloc_size();

// Check if we have an allocation available
if (!_free_allocs.empty()) {
Expand All @@ -101,10 +120,8 @@ class AllocRetain {

void put(void* alloc, std::size_t size)
{
const std::lock_guard lock(_mutex);
if (_size != defaults::bounce_buffer_size()) {
clear(); // the desired allocation size has changed.
}
std::lock_guard const lock(_mutex);
_ensure_alloc_size();

// If the size of `alloc` matches the sizes of the retained allocations,
// it is added to the set of free allocation otherwise it is freed.
Expand All @@ -115,13 +132,24 @@ class AllocRetain {
}
}

/**
* @brief Free all retained allocations
*
* @return The number of bytes cleared
*/
std::size_t clear()
{
std::lock_guard const lock(_mutex);
return _clear();
}

static AllocRetain& instance()
{
static AllocRetain _instance;
return _instance;
}

AllocRetain(const AllocRetain&) = delete;
AllocRetain(AllocRetain const&) = delete;
AllocRetain& operator=(AllocRetain const&) = delete;
AllocRetain(AllocRetain&& o) = delete;
AllocRetain& operator=(AllocRetain&& o) = delete;
Expand Down
6 changes: 3 additions & 3 deletions docs/source/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like:
Build from source
-----------------

In order to setup a development environment run:
In order to setup a development environment, we recommend Conda:

.. code-block::
Expand All @@ -48,11 +48,11 @@ In order to setup a development environment run:
# CUDA 12.5
mamba env create --name kvikio-dev --file conda/environments/all_cuda-125_arch-x86_64.yaml
To build and install the extension run:
The Python library depends on the C++ library, thus we build and install both:

.. code-block::
./build.sh kvikio
./build.sh libkvikio kvikio
One might have to define ``CUDA_HOME`` to the path to the CUDA installation.
Expand Down
12 changes: 12 additions & 0 deletions docs/source/runtime_settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,29 @@ Set the environment variable ``KVIKIO_COMPAT_MODE`` to enable/disable compatibil
* when running in Windows Subsystem for Linux (WSL).
* when ``/run/udev`` isn't readable, which typically happens when running inside a docker image not launched with ``--volume /run/udev:/run/udev:ro``.

This setting can also be controlled by :py:func:`kvikio.defaults.compat_mode`, :py:func:`kvikio.defaults.compat_mode_reset`, and :py:func:`kvikio.defaults.set_compat_mode`.


Thread Pool ``KVIKIO_NTHREADS``
-------------------------------
KvikIO can use multiple threads for IO automatically. Set the environment variable ``KVIKIO_NTHREADS`` to the number of threads in the thread pool. If not set, the default value is 1.

This setting can also be controlled by :py:func:`kvikio.defaults.get_num_threads`, :py:func:`kvikio.defaults.num_threads_reset`, and :py:func:`kvikio.defaults.set_num_threads`.

Task Size ``KVIKIO_TASK_SIZE``
------------------------------
KvikIO splits parallel IO operations into multiple tasks. Set the environment variable ``KVIKIO_TASK_SIZE`` to the maximum task size (in bytes). If not set, the default value is 4194304 (4 MiB).

This setting can also be controlled by :py:func:`kvikio.defaults.task_size`, :py:func:`kvikio.defaults.task_size_reset`, and :py:func:`kvikio.defaults.set_task_size`.

GDS Threshold ``KVIKIO_GDS_THRESHOLD``
--------------------------------------
In order to improve performance of small IO, ``.pread()`` and ``.pwrite()`` implement a shortcut that circumvent the threadpool and use the POSIX backend directly. Set the environment variable ``KVIKIO_GDS_THRESHOLD`` to the minimum size (in bytes) to use GDS. If not set, the default value is 1048576 (1 MiB).

This setting can also be controlled by :py:func:`kvikio.defaults.gds_threshold`, :py:func:`kvikio.defaults.gds_threshold_reset`, and :py:func:`kvikio.defaults.set_gds_threshold`.

Size of the Bounce Buffer ``KVIKIO_BOUNCE_BUFFER_SIZE``
-------------------------------------------------------
KvikIO might have to use intermediate host buffers (one per thread) when copying between files and device memory. Set the environment variable ``KVIKIO_BOUNCE_BUFFER_SIZE`` to the size (in bytes) of these "bounce" buffers. If not set, the default value is 16777216 (16 MiB).

This setting can also be controlled by :py:func:`kvikio.defaults.bounce_buffer_size`, :py:func:`kvikio.defaults.bounce_buffer_size_reset`, and :py:func:`kvikio.defaults.set_bounce_buffer_size`.
11 changes: 1 addition & 10 deletions python/kvikio/kvikio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,10 @@
# Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.

from kvikio._lib import buffer, driver_properties # type: ignore
from kvikio._lib import driver_properties # type: ignore
from kvikio._version import __git_commit__, __version__
from kvikio.cufile import CuFile


def memory_register(buf) -> None:
return buffer.memory_register(buf)


def memory_deregister(buf) -> None:
buffer.memory_deregister(buf)


# TODO: Wrap nicely, maybe as a dataclass?
DriverProperties = driver_properties.DriverProperties

Expand Down
10 changes: 9 additions & 1 deletion python/kvikio/kvikio/_lib/buffer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from kvikio._lib.arr cimport Array


cdef extern from "<kvikio/buffer.hpp>" namespace "kvikio" nogil:
cdef extern from "<kvikio/buffer.hpp>" nogil:
void cpp_memory_register "kvikio::memory_register"(const void* devPtr) except +
void cpp_memory_deregister "kvikio::memory_deregister"(const void* devPtr) except +

Expand All @@ -25,3 +25,11 @@ def memory_deregister(buf) -> None:
buf = Array(buf)
cdef Array arr = buf
cpp_memory_deregister(<void*>arr.ptr)


cdef extern from "<kvikio/bounce_buffer.hpp>" nogil:
size_t cpp_alloc_retain_clear "kvikio::AllocRetain::instance().clear"() except +


def bounce_buffer_free() -> int:
return cpp_alloc_retain_clear()
Loading

0 comments on commit 8a7da7a

Please sign in to comment.