rapidsai · rapids-bot · Oct 3, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 18, 2024
@@ -18,7 +18,7 @@ ARGS=$*
 # scripts, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall  -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
+VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall  -v -g -n --compile-static-lib --allgpuarch --no-mg --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
 HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
@@ -40,6 +40,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
    --limit-tests               - semicolon-separated list of test executables to compile (e.g. NEIGHBORS_TEST;CLUSTER_TEST)
    --limit-bench-ann           - semicolon-separated list of ann benchmark executables to compute (e.g. HNSWLIB_ANN_BENCH;RAFT_IVF_PQ_ANN_BENCH)
    --allgpuarch                - build for all supported GPU architectures
+   --no-mg                     - disable multi-GPU support
    --no-nvtx                   - disable nvtx (profiling markers), but allow enabling it in downstream projects
    --show_depr_warn            - show cmake deprecation warnings
    --build-metrics             - filename for generating build metrics report for libcuvs
@@ -65,6 +66,7 @@ CMAKE_LOG_LEVEL=""
 VERBOSE_FLAG=""
 BUILD_ALL_GPU_ARCH=0
 BUILD_TESTS=ON
+BUILD_MG_ALGOS=ON
 BUILD_TYPE=Release
 COMPILE_LIBRARY=OFF
 INSTALL_TARGET=install
@@ -261,6 +263,10 @@ if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
 
+if hasArg --no-mg; then  # --no-mg requested: turn multi-GPU support off for this build
+    BUILD_MG_ALGOS=OFF  # replaces the ON default; passed to cmake as -DBUILD_MG_ALGOS
+fi
+
 if hasArg tests || (( ${NUMARGS} == 0 )); then
     BUILD_TESTS=ON
     CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
@@ -353,6 +359,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libcuvs || hasArg docs || hasArg tests || has
           -DBUILD_C_TESTS=${BUILD_TESTS} \
           -DBUILD_CUVS_BENCH=${BUILD_CUVS_BENCH} \
           -DBUILD_CPU_ONLY=${BUILD_CPU_ONLY} \
+          -DBUILD_MG_ALGOS=${BUILD_MG_ALGOS} \
           -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \
           ${CACHE_ARGS} \
           ${EXTRA_CMAKE_ARGS}

@@ -37,6 +37,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - make
+- nccl>=2.19
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc

@@ -37,6 +37,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - make
+- nccl>=2.19
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc

@@ -34,6 +34,7 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - make
+- nccl>=2.19
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc

@@ -34,6 +34,7 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - make
+- nccl>=2.19
 - ninja
 - numpy>=1.23,<3.0a0
 - numpydoc

@@ -35,6 +35,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
+- nccl>=2.19
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-aarch64=11.8

@@ -35,6 +35,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
+- nccl>=2.19
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-64=11.8

@@ -32,6 +32,7 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
+- nccl>=2.19
 - ninja
 - nlohmann_json>=3.11.2
 - openblas

@@ -32,6 +32,7 @@ dependencies:
 - libcusparse-dev
 - librmm==24.10.*,>=0.0.0a0
 - matplotlib
+- nccl>=2.19
 - ninja
 - nlohmann_json>=3.11.2
 - openblas

@@ -22,6 +22,9 @@ cmake_version:
 h5py_version:
   - ">=3.8.0"
 
+nccl_version:
+  - ">=2.19"
+
 # The CTK libraries below are missing from the conda-forge::cudatoolkit package
 # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages
 # and the "*_run_*" version specifiers correspond to `11.x` packages.

@@ -65,6 +65,7 @@ outputs:
       host:
         - librmm ={{ minor_version }}
         - libraft-headers ={{ minor_version }}
+        - nccl {{ nccl_version }}
         - cuda-version ={{ cuda_version }}
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }}
@@ -131,6 +132,7 @@ outputs:
       host:
         - librmm ={{ minor_version }}
         - libraft-headers ={{ minor_version }}
+        - nccl {{ nccl_version }}
         - cuda-version ={{ cuda_version }}
         {% if cuda_major == "11" %}
         - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }}
@@ -197,6 +199,7 @@ outputs:
       host:
         - librmm ={{ minor_version }}
         - libraft-headers ={{ minor_version }}
+        - nccl {{ nccl_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
         - cuda-version ={{ cuda_version }}
         - openblas # required by some CPU algos in benchmarks
@@ -268,6 +271,7 @@ outputs:
       host:
         - librmm ={{ minor_version }}
         - libraft-headers ={{ minor_version }}
+        - nccl {{ nccl_version }}
         - {{ pin_subpackage('libcuvs', exact=True) }}
         - cuda-version ={{ cuda_version }}
         {% if cuda_major == "11" %}

@@ -57,6 +57,7 @@ option(BUILD_C_LIBRARY "Build cuVS C API library" OFF)
 option(BUILD_C_TESTS "Build cuVS C API tests" OFF)
 option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF)
 option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON)
+option(BUILD_MG_ALGOS "Build with multi-GPU support" ON)
 option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINEINFO
        "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF
@@ -287,6 +288,23 @@ target_compile_options(
                             "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
 )
 
+if(BUILD_MG_ALGOS) # collect the multi-GPU sources only when the option is enabled
+  set(CUVS_MG_ALGOS # expanded inside add_library(cuvs ...); contributes nothing when left unset
+      src/neighbors/mg/mg_flat_float_int64_t.cu
+      src/neighbors/mg/mg_flat_int8_t_int64_t.cu
+      src/neighbors/mg/mg_flat_uint8_t_int64_t.cu
+      src/neighbors/mg/mg_pq_float_int64_t.cu
+      src/neighbors/mg/mg_pq_half_int64_t.cu
+      src/neighbors/mg/mg_pq_int8_t_int64_t.cu
+      src/neighbors/mg/mg_pq_uint8_t_int64_t.cu
+      src/neighbors/mg/mg_cagra_float_uint32_t.cu
+      src/neighbors/mg/mg_cagra_half_uint32_t.cu
+      src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu
+      src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu
+      src/neighbors/mg/omp_checks.cu
+  )
+endif()
+
 add_library(
   cuvs SHARED
   src/cluster/kmeans_balanced_fit_float.cu
@@ -367,6 +385,17 @@ add_library(
   src/neighbors/cagra_serialize_half.cu
   src/neighbors/cagra_serialize_int8.cu
   src/neighbors/cagra_serialize_uint8.cu
+  src/neighbors/iface/iface_cagra_float_uint32_t.cu
+  src/neighbors/iface/iface_cagra_half_uint32_t.cu
+  src/neighbors/iface/iface_cagra_int8_t_uint32_t.cu
+  src/neighbors/iface/iface_cagra_uint8_t_uint32_t.cu
+  src/neighbors/iface/iface_flat_float_int64_t.cu
+  src/neighbors/iface/iface_flat_int8_t_int64_t.cu
+  src/neighbors/iface/iface_flat_uint8_t_int64_t.cu
+  src/neighbors/iface/iface_pq_float_int64_t.cu
+  src/neighbors/iface/iface_pq_half_int64_t.cu
+  src/neighbors/iface/iface_pq_int8_t_int64_t.cu
+  src/neighbors/iface/iface_pq_uint8_t_int64_t.cu
   src/neighbors/detail/cagra/cagra_build.cpp
   src/neighbors/detail/cagra/topk_for_cagra/topk.cu
   $<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
@@ -428,8 +457,13 @@ add_library(
   src/selection/select_k_half_uint32_t.cu
   src/stats/silhouette_score.cu
   src/stats/trustworthiness_score.cu
+  ${CUVS_MG_ALGOS}
 )
 
+if(BUILD_MG_ALGOS) # expose the multi-GPU build choice to the preprocessor
+  target_compile_definitions(cuvs PUBLIC CUVS_BUILD_MG_ALGOS) # PUBLIC: also defined for targets that link cuvs
+endif()
+
 target_compile_options(
   cuvs INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
                  --expt-relaxed-constexpr>
@@ -459,11 +493,16 @@ if(NOT BUILD_CPU_ONLY)
                                  ${CUVS_CUSPARSE_DEPENDENCY} ${CUVS_CURAND_DEPENDENCY}
   )
 
+  if(BUILD_MG_ALGOS) # the NCCL dependency is added only for multi-GPU builds
+    set(CUVS_COMMS_DEPENDENCY nccl) # linked PRIVATE into cuvs below; NOTE(review): bare name, confirm no imported target is intended
+  endif()
+
   # Keep cuVS as lightweight as possible. Only CUDA libs and rmm should be used in global target.
   target_link_libraries(
     cuvs
     PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
     PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX> cuvs-cagra-search
+            ${CUVS_COMMS_DEPENDENCY}
   )
 endif()
 

@@ -32,6 +32,7 @@ option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benc
 option(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB "Include cuVS CAGRA with HNSW search in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON)
 option(CUVS_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF)
+option(CUVS_ANN_BENCH_USE_CUVS_MG "Include cuVS ann mg algorithm in benchmark" ${BUILD_MG_ALGOS})
 option(CUVS_ANN_BENCH_SINGLE_EXE
        "Make a single executable with benchmark as shared library modules" OFF
 )
@@ -55,6 +56,7 @@ if(BUILD_CPU_ONLY)
   set(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OFF)
   set(CUVS_ANN_BENCH_USE_GGNN OFF)
   set(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE OFF)
+  set(CUVS_ANN_BENCH_USE_CUVS_MG OFF)
 else()
   set(CUVS_FAISS_ENABLE_GPU ON)
 endif()
@@ -66,6 +68,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ
    OR CUVS_ANN_BENCH_USE_CUVS_CAGRA
    OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB
    OR CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE
+   OR CUVS_ANN_BENCH_USE_CUVS_MG
 )
   set(CUVS_ANN_BENCH_USE_CUVS ON)
 endif()
@@ -245,6 +248,21 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
   )
 endif()
 
+if(CUVS_ANN_BENCH_USE_CUVS_MG) # multi-GPU ANN benchmark; configured only when the option is ON
+  ConfigureAnnBench(
+    NAME
+    CUVS_MG
+    PATH
+    src/cuvs/cuvs_benchmark.cu
+    src/cuvs/cuvs_mg_ivf_flat.cu # plain paths: the enclosing if() already guards these sources
+    src/cuvs/cuvs_mg_ivf_pq.cu
+    src/cuvs/cuvs_mg_cagra.cu
+    LINKS
+    cuvs
+    nccl
+  )
+endif()
+
 message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}")
 message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
 if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT)

@@ -45,7 +45,18 @@ extern template class cuvs::bench::cuvs_cagra<uint8_t, uint32_t>;
 extern template class cuvs::bench::cuvs_cagra<int8_t, uint32_t>;
 #endif
 
-#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT
+#ifdef CUVS_ANN_BENCH_USE_CUVS_MG
+#include "cuvs_ivf_flat_wrapper.h"
+#include "cuvs_mg_ivf_flat_wrapper.h"
+
+#include "cuvs_ivf_pq_wrapper.h"
+#include "cuvs_mg_ivf_pq_wrapper.h"
+
+#include "cuvs_cagra_wrapper.h"
+#include "cuvs_mg_cagra_wrapper.h"
+#endif
+
+#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
 template <typename T, typename IdxT>
 void parse_build_param(const nlohmann::json& conf,
                        typename cuvs::bench::cuvs_ivf_flat<T, IdxT>::build_param& param)
@@ -64,7 +75,7 @@ void parse_search_param(const nlohmann::json& conf,
 #endif
 
 #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \
-  defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
+  defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
 template <typename T, typename IdxT>
 void parse_build_param(const nlohmann::json& conf,
                        typename cuvs::bench::cuvs_ivf_pq<T, IdxT>::build_param& param)
@@ -130,7 +141,8 @@ void parse_search_param(const nlohmann::json& conf,
 }
 #endif
 
-#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB)
+#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || \
+  defined(CUVS_ANN_BENCH_USE_CUVS_MG)
 template <typename T, typename IdxT>
 void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::nn_descent::index_params& param)
 {