SNMG ANN (#231)
This PR adds a distributed, single-node multi-GPU (SNMG) implementation of ANN indexes. It allows an index to be built, extended, and searched across multiple GPUs.

Before building the index, the user has to choose between two modes:

**Sharding mode**: The index dataset is split across GPUs, and each GPU trains its own index on its share of the dataset. This is intended to increase both search throughput and the maximum size of the index.
**Index duplication mode**: The index is built once on one GPU and then copied to the others. Alternatively, the index dataset can be sent to each GPU and the index built there. This is intended to increase search throughput.
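The two modes can be pictured with a toy, CPU-only sketch (plain Python standing in for the GPUs; this is an illustration of the idea, not the cuVS API — `top_k`, `sharded_search`, and `duplicated_search` are hypothetical names):

```python
# Toy illustration of the two SNMG modes. Each "GPU" is simulated by an
# exact top-k search over the portion of data it holds.
import heapq

def top_k(indexed_vectors, query, k):
    # Exact search: squared L2 distance to every vector, keep the k closest
    # as (distance, original_index) pairs.
    scored = [(sum((a - b) ** 2 for a, b in zip(vec, query)), idx)
              for idx, vec in indexed_vectors]
    return heapq.nsmallest(k, scored)

def sharded_search(dataset, query, k, n_gpus):
    # Sharding mode: split the dataset across "GPUs", search each shard
    # independently, then merge the local top-k lists into a global top-k.
    indexed = list(enumerate(dataset))
    shards = [indexed[g::n_gpus] for g in range(n_gpus)]
    candidates = [hit for shard in shards for hit in top_k(shard, query, k)]
    return [idx for _, idx in heapq.nsmallest(k, candidates)]

def duplicated_search(dataset, queries, k, n_gpus):
    # Index duplication mode: every "GPU" holds the full index; the query
    # batch is split across replicas to raise throughput.
    indexed = list(enumerate(dataset))
    results = [None] * len(queries)
    for g in range(n_gpus):  # replica g handles every n_gpus-th query
        for qi in range(g, len(queries), n_gpus):
            results[qi] = [idx for _, idx in top_k(indexed, queries[qi], k)]
    return results
```

Because each shard contributes its own k best candidates, the merged result in sharding mode is identical to a single-GPU search over the full dataset; duplication mode trades memory (one full copy per GPU) for query throughput.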

SNMG indexes can be serialized and deserialized. Local (single-GPU) indexes can also be deserialized and deployed in index duplication mode.
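The deployment path for local models can be sketched as a serialize-once, deserialize-per-GPU round trip (a toy stand-in using `pickle` on a plain dict; the real cuVS serialization format and index types are not shown):

```python
import pickle

# Stand-in for a serialized single-GPU ANN index: raw vectors plus a tag.
local_index = {"kind": "ivf_flat", "vectors": [[0.0, 0.0], [1.0, 1.0]]}

# Serialize the local model once...
blob = pickle.dumps(local_index)

# ...then deserialize one independent copy per GPU. This mirrors the
# "deploy a local model in index duplication mode" path, where every
# GPU ends up holding the same index.
n_gpus = 2
replicas = [pickle.loads(blob) for _ in range(n_gpus)]
assert all(r == local_index for r in replicas)
```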

![bench](https://github.com/user-attachments/assets/e313d0ef-02eb-482a-9104-9e1bb400456d)

Migrated from rapidsai/raft#1993

Authors:
  - Victor Lafargue (https://github.com/viclafargue)
  - James Lamb (https://github.com/jameslamb)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - James Lamb (https://github.com/jameslamb)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #231
viclafargue authored Oct 3, 2024
1 parent 5629977 commit 3383f28
Showing 73 changed files with 6,800 additions and 46 deletions.
9 changes: 8 additions & 1 deletion build.sh
@@ -18,7 +18,7 @@ ARGS=$*
# scripts, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-mg --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
where <target> is:
clean - remove all existing build artifacts and configuration (start over)
@@ -40,6 +40,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
--limit-tests - semicolon-separated list of test executables to compile (e.g. NEIGHBORS_TEST;CLUSTER_TEST)
--limit-bench-ann - semicolon-separated list of ann benchmark executables to compute (e.g. HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH)
--allgpuarch - build for all supported GPU architectures
--no-mg - disable multi-GPU support
--no-nvtx - disable nvtx (profiling markers), but allow enabling it in downstream projects
--show_depr_warn - show cmake deprecation warnings
--build-metrics - filename for generating build metrics report for libcuvs
@@ -65,6 +66,7 @@ CMAKE_LOG_LEVEL=""
VERBOSE_FLAG=""
BUILD_ALL_GPU_ARCH=0
BUILD_TESTS=ON
BUILD_MG_ALGOS=ON
BUILD_TYPE=Release
COMPILE_LIBRARY=OFF
INSTALL_TARGET=install
@@ -261,6 +263,10 @@ if hasArg --allgpuarch; then
BUILD_ALL_GPU_ARCH=1
fi

if hasArg --no-mg; then
BUILD_MG_ALGOS=OFF
fi

if hasArg tests || (( ${NUMARGS} == 0 )); then
BUILD_TESTS=ON
CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
@@ -353,6 +359,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
-DBUILD_C_TESTS=${BUILD_TESTS} \
-DBUILD_CUVS_BENCH=${BUILD_CUVS_BENCH} \
-DBUILD_CPU_ONLY=${BUILD_CPU_ONLY} \
-DBUILD_MG_ALGOS=${BUILD_MG_ALGOS} \
-DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \
${CACHE_ARGS} \
${EXTRA_CMAKE_ARGS}
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -37,6 +37,7 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 1 addition & 0 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -37,6 +37,7 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 1 addition & 0 deletions conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -34,6 +34,7 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 1 addition & 0 deletions conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -34,6 +34,7 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- make
- nccl>=2.19
- ninja
- numpy>=1.23,<3.0a0
- numpydoc
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -35,6 +35,7 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-aarch64=11.8
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -35,6 +35,7 @@ dependencies:
- libcusparse=11.7.5.86
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- nvcc_linux-64=11.8
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -32,6 +32,7 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- openblas
1 change: 1 addition & 0 deletions conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -32,6 +32,7 @@ dependencies:
- libcusparse-dev
- librmm==24.10.*,>=0.0.0a0
- matplotlib
- nccl>=2.19
- ninja
- nlohmann_json>=3.11.2
- openblas
3 changes: 3 additions & 0 deletions conda/recipes/libcuvs/conda_build_config.yaml
@@ -22,6 +22,9 @@ cmake_version:
h5py_version:
- ">=3.8.0"

nccl_version:
- ">=2.19"

# The CTK libraries below are missing from the conda-forge::cudatoolkit package
# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages
# and the "*_run_*" version specifiers correspond to `11.x` packages.
4 changes: 4 additions & 0 deletions conda/recipes/libcuvs/meta.yaml
@@ -65,6 +65,7 @@ outputs:
host:
- librmm ={{ minor_version }}
- libraft-headers ={{ minor_version }}
- nccl {{ nccl_version }}
- cuda-version ={{ cuda_version }}
{% if cuda_major == "11" %}
- cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }}
@@ -131,6 +132,7 @@ outputs:
host:
- librmm ={{ minor_version }}
- libraft-headers ={{ minor_version }}
- nccl {{ nccl_version }}
- cuda-version ={{ cuda_version }}
{% if cuda_major == "11" %}
- cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }}
@@ -197,6 +199,7 @@ outputs:
host:
- librmm ={{ minor_version }}
- libraft-headers ={{ minor_version }}
- nccl {{ nccl_version }}
- {{ pin_subpackage('libcuvs', exact=True) }}
- cuda-version ={{ cuda_version }}
- openblas # required by some CPU algos in benchmarks
@@ -268,6 +271,7 @@ outputs:
host:
- librmm ={{ minor_version }}
- libraft-headers ={{ minor_version }}
- nccl {{ nccl_version }}
- {{ pin_subpackage('libcuvs', exact=True) }}
- cuda-version ={{ cuda_version }}
{% if cuda_major == "11" %}
41 changes: 41 additions & 0 deletions cpp/CMakeLists.txt
@@ -57,6 +57,7 @@ option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF)
option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
option(BUILD_MG_ALGOS "Build with multi-GPU support" ON)
option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
option(CUDA_ENABLE_LINEINFO
"Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
@@ -287,6 +288,24 @@ target_compile_options(
"$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
)

if(BUILD_MG_ALGOS)
set(CUVS_MG_ALGOS
src/neighbors/mg/mg_flat_float_int64_t.cu
src/neighbors/mg/mg_flat_int8_t_int64_t.cu
src/neighbors/mg/mg_flat_uint8_t_int64_t.cu
src/neighbors/mg/mg_pq_float_int64_t.cu
src/neighbors/mg/mg_pq_half_int64_t.cu
src/neighbors/mg/mg_pq_int8_t_int64_t.cu
src/neighbors/mg/mg_pq_uint8_t_int64_t.cu
src/neighbors/mg/mg_cagra_float_uint32_t.cu
src/neighbors/mg/mg_cagra_half_uint32_t.cu
src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu
src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu
src/neighbors/mg/omp_checks.cpp
src/neighbors/mg/nccl_comm.cpp
)
endif()

add_library(
cuvs_objs OBJECT
src/cluster/kmeans_balanced_fit_float.cu
@@ -367,6 +386,17 @@ add_library(
src/neighbors/cagra_serialize_half.cu
src/neighbors/cagra_serialize_int8.cu
src/neighbors/cagra_serialize_uint8.cu
src/neighbors/iface/iface_cagra_float_uint32_t.cu
src/neighbors/iface/iface_cagra_half_uint32_t.cu
src/neighbors/iface/iface_cagra_int8_t_uint32_t.cu
src/neighbors/iface/iface_cagra_uint8_t_uint32_t.cu
src/neighbors/iface/iface_flat_float_int64_t.cu
src/neighbors/iface/iface_flat_int8_t_int64_t.cu
src/neighbors/iface/iface_flat_uint8_t_int64_t.cu
src/neighbors/iface/iface_pq_float_int64_t.cu
src/neighbors/iface/iface_pq_half_int64_t.cu
src/neighbors/iface/iface_pq_int8_t_int64_t.cu
src/neighbors/iface/iface_pq_uint8_t_int64_t.cu
src/neighbors/detail/cagra/cagra_build.cpp
src/neighbors/detail/cagra/topk_for_cagra/topk.cu
$<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
@@ -434,6 +464,7 @@ add_library(
src/selection/select_k_half_uint32_t.cu
src/stats/silhouette_score.cu
src/stats/trustworthiness_score.cu
${CUVS_MG_ALGOS}
)

set_target_properties(
@@ -520,11 +551,16 @@ if(NOT BUILD_CPU_ONLY)
${CUVS_CUSPARSE_DEPENDENCY} ${CUVS_CURAND_DEPENDENCY}
)

if(BUILD_MG_ALGOS)
set(CUVS_COMMS_DEPENDENCY nccl)
endif()

# Keep cuVS as lightweight as possible. Only CUDA libs and rmm should be used in global target.
target_link_libraries(
cuvs
PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> cuvs-cagra-search
${CUVS_COMMS_DEPENDENCY}
)

target_link_libraries(
Expand All @@ -534,6 +570,11 @@ if(NOT BUILD_CPU_ONLY)
)
endif()

if(BUILD_MG_ALGOS)
target_compile_definitions(cuvs PUBLIC CUVS_BUILD_MG_ALGOS)
target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_MG_ALGOS)
endif()

if(BUILD_CAGRA_HNSWLIB)
target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib)
target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB)
18 changes: 18 additions & 0 deletions cpp/bench/ann/CMakeLists.txt
@@ -32,6 +32,7 @@ option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benc
option(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB "Include cuVS CAGRA with HNSW search in benchmark" ON)
option(CUVS_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
option(CUVS_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF)
option(CUVS_ANN_BENCH_USE_CUVS_MG "Include cuVS ann mg algorithm in benchmark" ${BUILD_MG_ALGOS})
option(CUVS_ANN_BENCH_SINGLE_EXE
"Make a single executable with benchmark as shared library modules" OFF
)
@@ -55,6 +56,7 @@ if(BUILD_CPU_ONLY)
set(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OFF)
set(CUVS_ANN_BENCH_USE_GGNN OFF)
set(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE OFF)
set(CUVS_ANN_BENCH_USE_CUVS_MG OFF)
else()
set(CUVS_FAISS_ENABLE_GPU ON)
endif()
@@ -66,6 +68,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
OR CUVS_ANN_BENCH_USE_CUVS_CAGRA
OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB
OR CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE
OR CUVS_ANN_BENCH_USE_CUVS_MG
)
set(CUVS_ANN_BENCH_USE_CUVS ON)
endif()
@@ -245,6 +248,21 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
)
endif()

if(CUVS_ANN_BENCH_USE_CUVS_MG)
ConfigureAnnBench(
NAME
CUVS_MG
PATH
src/cuvs/cuvs_benchmark.cu
$<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_MG}>:src/cuvs/cuvs_mg_ivf_flat.cu>
$<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_MG}>:src/cuvs/cuvs_mg_ivf_pq.cu>
$<$<BOOL:${CUVS_ANN_BENCH_USE_CUVS_MG}>:src/cuvs/cuvs_mg_cagra.cu>
LINKS
cuvs
nccl
)
endif()

message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}")
message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT)
18 changes: 15 additions & 3 deletions cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -45,7 +45,18 @@ extern template class cuvs::bench::cuvs_cagra<uint8_t, uint32_t>;
extern template class cuvs::bench::cuvs_cagra<int8_t, uint32_t>;
#endif

#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT
#ifdef CUVS_ANN_BENCH_USE_CUVS_MG
#include "cuvs_ivf_flat_wrapper.h"
#include "cuvs_mg_ivf_flat_wrapper.h"

#include "cuvs_ivf_pq_wrapper.h"
#include "cuvs_mg_ivf_pq_wrapper.h"

#include "cuvs_cagra_wrapper.h"
#include "cuvs_mg_cagra_wrapper.h"
#endif

#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
template <typename T, typename IdxT>
void parse_build_param(const nlohmann::json& conf,
typename cuvs::bench::cuvs_ivf_flat<T, IdxT>::build_param& param)
@@ -64,7 +75,7 @@ void parse_search_param(const nlohmann::json& conf,
#endif

#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \
defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
template <typename T, typename IdxT>
void parse_build_param(const nlohmann::json& conf,
typename cuvs::bench::cuvs_ivf_pq<T, IdxT>::build_param& param)
@@ -130,7 +141,8 @@ void parse_search_param(const nlohmann::json& conf,
}
#endif

#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || \
defined(CUVS_ANN_BENCH_USE_CUVS_MG)
template <typename T, typename IdxT>
void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::nn_descent::index_params& param)
{