diff --git a/README.md b/README.md index 7a0861238..5303e738f 100755 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ ## Useful Resources -- [Documentation](https://docs.rapids.ai/api/cuvs/): Library documentation. +- [Documentation](https://docs.rapids.ai/api/cuvs/nightly/): Library documentation. - [Build and Install Guide](https://docs.rapids.ai/api/cuvs/nightly/build): Instructions for installing and building cuVS. - [Getting Started Guide](https://docs.rapids.ai/api/cuvs/nightly/getting_started): Guide to getting started with cuVS. - [Code Examples](https://github.com/rapidsai/cuvs/tree/HEAD/examples): Self-contained Code Examples. diff --git a/build.sh b/build.sh index b463f0f0d..b787d3a41 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # scripts, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h" +VALIDARGS="clean libcuvs python rust docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-mg --no-cpu --cpu-only --no-shared-libs --no-nvtx --show_depr_warn --incl-cache-stats --time -h" HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-ann=] [--build-metrics=] where is: clean - remove all existing build artifacts and configuration (start over) @@ -37,10 +37,13 @@ HELP="$0 [ ...] [ ...] 
[--cmake-args=\"\"] [--cache-tool==0.0.0a0 - make +- nccl>=2.19 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index ce9a7f058..a25393050 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -37,6 +37,7 @@ dependencies: - libcusparse=11.7.5.86 - librmm==24.10.*,>=0.0.0a0 - make +- nccl>=2.19 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 116e80ac2..bb4a96d48 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -34,6 +34,7 @@ dependencies: - libcusparse-dev - librmm==24.10.*,>=0.0.0a0 - make +- nccl>=2.19 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 7f7ad045d..bd1b95ae8 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -34,6 +34,7 @@ dependencies: - libcusparse-dev - librmm==24.10.*,>=0.0.0a0 - make +- nccl>=2.19 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 73c42ca71..554ad41ab 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusparse=11.7.5.86 - librmm==24.10.*,>=0.0.0a0 - matplotlib +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-aarch64=11.8 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 473e50bc6..dc38f3565 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ 
b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -35,6 +35,7 @@ dependencies: - libcusparse=11.7.5.86 - librmm==24.10.*,>=0.0.0a0 - matplotlib +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 8a877c4c0..aeb23a9ef 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -32,6 +32,7 @@ dependencies: - libcusparse-dev - librmm==24.10.*,>=0.0.0a0 - matplotlib +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index 54859a77f..3a408cd64 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -32,6 +32,7 @@ dependencies: - libcusparse-dev - librmm==24.10.*,>=0.0.0a0 - matplotlib +- nccl>=2.19 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/recipes/cuvs_bench/build.sh b/conda/recipes/cuvs_bench/build.sh new file mode 100644 index 000000000..05fb7bada --- /dev/null +++ b/conda/recipes/cuvs_bench/build.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +./build.sh bench-ann --allgpuarch --no-nvtx --build-metrics=bench_ann --incl-cache-stats +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/cuvs_bench/conda_build_config.yaml b/conda/recipes/cuvs_bench/conda_build_config.yaml new file mode 100644 index 000000000..47bd730da --- /dev/null +++ b/conda/recipes/cuvs_bench/conda_build_config.yaml @@ -0,0 +1,70 @@ +c_compiler_version: + - 11 + +cxx_compiler_version: + - 11 + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc + +c_stdlib: + - sysroot + +c_stdlib_version: + - "2.17" + +cmake_version: + - ">=3.26.4,!=3.30.0" + +nccl_version: + - ">=2.19" + +glog_version: + - ">=0.6.0" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" + +# The CTK libraries below are missing from the conda-forge::cudatoolkit package +# for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages +# and the "*_run_*" version specifiers correspond to `11.x` packages. + +cuda11_libcublas_host_version: + - "=11.11.3.6" + +cuda11_libcublas_run_version: + - ">=11.5.2.43,<12.0.0" + +cuda11_libcurand_host_version: + - "=10.3.0.86" + +cuda11_libcurand_run_version: + - ">=10.2.5.43,<10.3.1" + +cuda11_libcusolver_host_version: + - "=11.4.1.48" + +cuda11_libcusolver_run_version: + - ">=11.2.0.43,<11.4.2" + +cuda11_libcusparse_host_version: + - "=11.7.5.86" + +cuda11_libcusparse_run_version: + - ">=11.6.0.43,<12.0.0" + +# `cuda-profiler-api` only has `11.8.0` and `12.0.0` packages for all +# architectures. The "*_host_*" version specifiers correspond to `11.8` packages and the +# "*_run_*" version specifiers correspond to `11.x` packages. 
+ +cuda11_cuda_profiler_api_host_version: + - "=11.8.86" + +cuda11_cuda_profiler_api_run_version: + - ">=11.4.240,<12" diff --git a/conda/recipes/cuvs_bench/meta.yaml b/conda/recipes/cuvs_bench/meta.yaml new file mode 100644 index 000000000..9ecbf82bb --- /dev/null +++ b/conda/recipes/cuvs_bench/meta.yaml @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Usage: +# conda build . -c rapidsai -c conda-forge -c nvidia +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cuvs_bench + version: {{ version }} + script: build.sh + +source: + path: ../../.. + +build: + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - RAPIDS_ARTIFACTS_DIR + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=cuvs-bench-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=cuvs-bench-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% else %} + - {{ compiler('cuda') }} + - cuda-cudart-dev + - libcublas-dev + {% endif %} + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} + {% endif %} + - cuda-version ={{ cuda_version }} + - cmake {{ 
cmake_version }} + - ninja + - {{ stdlib("c") }} + + host: + - benchmark + - cuda-version ={{ cuda_version }} + {% if cuda_major == "11" %} + - cuda-profiler-api {{ cuda11_cuda_profiler_api_run_version }} + - libcublas {{ cuda11_libcublas_host_version }} + - libcublas-dev {{ cuda11_libcublas_host_version }} + {% else %} + - cuda-cudart-dev + - cuda-profiler-api + - libcublas-dev + {% endif %} + - glog {{ glog_version }} + - libcuvs {{ version }} + - nlohmann_json {{ nlohmann_json_version }} + - openblas + # rmm is needed to determine if package is gpu-enabled + - python + - rapids-build-backend>=0.3.0,<0.4.0.dev0 + - rmm ={{ minor_version }} + + run: + - benchmark + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + {% if cuda_major == "11" %} + - cudatoolkit + {% else %} + - cuda-cudart + - libcublas + {% endif %} + - glog {{ glog_version }} + - libcuvs {{ version }} + - h5py {{ h5py_version }} + - matplotlib + - pandas + - pyyaml + # rmm is needed to determine if package is gpu-enabled + - pylibraft ={{ minor_version }} + - python + - rmm ={{ minor_version }} +about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: cuVS GPU and CPU benchmarks diff --git a/conda/recipes/cuvs_bench_cpu/build.sh b/conda/recipes/cuvs_bench_cpu/build.sh new file mode 100644 index 000000000..163872053 --- /dev/null +++ b/conda/recipes/cuvs_bench_cpu/build.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +./build.sh bench-ann --cpu-only --no-nvtx --build-metrics=bench_ann_cpu --incl-cache-stats +cmake --install cpp/build --component ann_bench diff --git a/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml b/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml new file mode 100644 index 000000000..ed6f708e1 --- /dev/null +++ b/conda/recipes/cuvs_bench_cpu/conda_build_config.yaml @@ -0,0 +1,29 @@ +c_compiler_version: + - 11 + +cxx_compiler_version: + - 11 + +c_stdlib: + - sysroot + +c_stdlib_version: + - "2.17" + +cmake_version: + - ">=3.26.4,!=3.30.0" + +glog_version: + - ">=0.6.0" + +h5py_version: + - ">=3.8.0" + +nlohmann_json_version: + - ">=3.11.2" + +spdlog_version: + - ">=1.14.1,<1.15" + +fmt_version: + - ">=11.0.2,<12" diff --git a/conda/recipes/cuvs_bench_cpu/meta.yaml b/conda/recipes/cuvs_bench_cpu/meta.yaml new file mode 100644 index 000000000..0ce5db744 --- /dev/null +++ b/conda/recipes/cuvs_bench_cpu/meta.yaml @@ -0,0 +1,67 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Usage: +# conda build . -c rapidsai -c conda-forge -c nvidia +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cuvs_bench_cpu + version: {{ version }} + script: build.sh + +source: + path: ../../.. 
+ +build: + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - RAPIDS_ARTIFACTS_DIR + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=cuvs-bench-cpu-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=cuvs-bench-cpu-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + number: {{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + +requirements: + build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + - cmake {{ cmake_version }} + - ninja + - {{ stdlib("c") }} + + host: + - benchmark + - fmt {{ fmt_version }} + - glog {{ glog_version }} + - nlohmann_json {{ nlohmann_json_version }} + - openblas + - python + - rapids-build-backend>=0.3.0,<0.4.0.dev0 + - spdlog {{ spdlog_version }} + + run: + - benchmark + - glog {{ glog_version }} + - h5py {{ h5py_version }} + - matplotlib + - pandas + - pyyaml + - python +about: + home: https://rapids.ai/ + license: Apache-2.0 + summary: cuVS CPU benchmarks diff --git a/conda/recipes/libcuvs/conda_build_config.yaml b/conda/recipes/libcuvs/conda_build_config.yaml index e165f7ed9..b8c49943e 100644 --- a/conda/recipes/libcuvs/conda_build_config.yaml +++ b/conda/recipes/libcuvs/conda_build_config.yaml @@ -22,6 +22,9 @@ cmake_version: h5py_version: - ">=3.8.0" +nccl_version: + - ">=2.19" + # The CTK libraries below are missing from the conda-forge::cudatoolkit package # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages # and the "*_run_*" version specifiers correspond to `11.x` packages. 
diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml index e154ccf41..46552c397 100644 --- a/conda/recipes/libcuvs/meta.yaml +++ b/conda/recipes/libcuvs/meta.yaml @@ -65,6 +65,7 @@ outputs: host: - librmm ={{ minor_version }} - libraft-headers ={{ minor_version }} + - nccl {{ nccl_version }} - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }} @@ -131,6 +132,7 @@ outputs: host: - librmm ={{ minor_version }} - libraft-headers ={{ minor_version }} + - nccl {{ nccl_version }} - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} - cuda-profiler-api {{ cuda11_cuda_profiler_api_host_version }} @@ -159,6 +161,7 @@ outputs: - libcusolver - libcusparse {% endif %} + - libraft-headers ={{ minor_version }} about: home: https://rapids.ai/ license: Apache-2.0 @@ -197,6 +200,7 @@ outputs: host: - librmm ={{ minor_version }} - libraft-headers ={{ minor_version }} + - nccl {{ nccl_version }} - {{ pin_subpackage('libcuvs', exact=True) }} - cuda-version ={{ cuda_version }} - openblas # required by some CPU algos in benchmarks @@ -268,6 +272,7 @@ outputs: host: - librmm ={{ minor_version }} - libraft-headers ={{ minor_version }} + - nccl {{ nccl_version }} - {{ pin_subpackage('libcuvs', exact=True) }} - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6f5178251..3e98a247e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -57,6 +57,7 @@ option(BUILD_C_LIBRARY "Build cuVS C API library" OFF) option(BUILD_C_TESTS "Build cuVS C API tests" OFF) option(BUILD_CUVS_BENCH "Build cuVS ann benchmarks" OFF) option(BUILD_CAGRA_HNSWLIB "Build CAGRA+hnswlib interface" ON) +option(BUILD_MG_ALGOS "Build with multi-GPU support" ON) option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF @@ 
-85,6 +86,12 @@ if(NOT BUILD_C_LIBRARY) set(BUILD_C_TESTS OFF) endif() +if(NOT BUILD_SHARED_LIBS) + set(BUILD_TESTS OFF) + set(BUILD_C_LIBRARY OFF) + set(BUILD_CAGRA_HNSWLIB OFF) +endif() + # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to # have different values for the `Threads::Threads` target. Setting this flag ensures # `Threads::Threads` is the same value across all builds so that cache hits occur @@ -175,6 +182,7 @@ rapids_cpm_init() if(NOT BUILD_CPU_ONLY) include(cmake/thirdparty/get_raft.cmake) + include(cmake/thirdparty/get_cutlass.cmake) endif() if(BUILD_C_LIBRARY) @@ -186,8 +194,6 @@ if(BUILD_TESTS OR BUILD_C_TESTS) rapids_cpm_gtest(BUILD_STATIC) endif() -include(cmake/thirdparty/get_cutlass.cmake) - if(BUILD_CUVS_BENCH) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench(BUILD_STATIC) @@ -199,526 +205,568 @@ endif() # ################################################################################################## # * cuvs --------------------------------------------------------------------- -add_library( - cuvs-cagra-search STATIC - src/neighbors/cagra_search_float.cu - src/neighbors/cagra_search_half.cu - src/neighbors/cagra_search_int8.cu - src/neighbors/cagra_search_uint8.cu - src/neighbors/detail/cagra/compute_distance.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu - 
src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu - src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu - src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu - 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu - src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu - src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu - src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu - src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu - src/neighbors/detail/cagra/search_single_cta_float_uint32.cu - src/neighbors/detail/cagra/search_single_cta_half_uint32.cu - src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu - src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu -) - -file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") -set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) - -set_target_properties( - cuvs-cagra-search - PROPERTIES BUILD_RPATH "\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - CUDA_SEPARABLE_COMPILATION ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - POSITION_INDEPENDENT_CODE ON -) -target_link_libraries(cuvs-cagra-search PRIVATE raft::raft) -target_include_directories( - cuvs-cagra-search PRIVATE "$" -) -target_compile_options( - cuvs-cagra-search PRIVATE "$<$:${CUVS_CXX_FLAGS}>" - "$<$:${CUVS_CUDA_FLAGS}>" -) - -add_library( - cuvs_objs OBJECT - src/cluster/kmeans_balanced_fit_float.cu - src/cluster/kmeans_fit_mg_float.cu - src/cluster/kmeans_fit_mg_double.cu - src/cluster/kmeans_fit_double.cu - src/cluster/kmeans_fit_float.cu - src/cluster/kmeans_auto_find_k_float.cu - src/cluster/kmeans_fit_predict_double.cu - src/cluster/kmeans_fit_predict_float.cu - src/cluster/kmeans_predict_double.cu - src/cluster/kmeans_predict_float.cu - src/cluster/kmeans_balanced_fit_float.cu - src/cluster/kmeans_balanced_fit_predict_float.cu - src/cluster/kmeans_balanced_predict_float.cu - src/cluster/kmeans_balanced_fit_int8.cu - src/cluster/kmeans_balanced_fit_predict_int8.cu - 
src/cluster/kmeans_balanced_predict_int8.cu - src/cluster/kmeans_transform_double.cu - src/cluster/kmeans_transform_float.cu - src/cluster/single_linkage_float.cu - src/core/bitset.cu - src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_correlation_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_cosine_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_kl_divergence_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu - 
src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l1_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l_inf_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_russel_rao_half_float_float_int.cu - src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu - src/distance/detail/pairwise_matrix/dispatch_rbf.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu - src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu - src/distance/detail/fused_distance_nn.cu - src/distance/distance.cu - src/distance/pairwise_distance.cu - src/neighbors/brute_force.cu - src/neighbors/cagra_build_float.cu - src/neighbors/cagra_build_half.cu - src/neighbors/cagra_build_int8.cu - 
src/neighbors/cagra_build_uint8.cu - src/neighbors/cagra_extend_float.cu - src/neighbors/cagra_extend_int8.cu - src/neighbors/cagra_extend_uint8.cu - src/neighbors/cagra_optimize.cu - src/neighbors/cagra_serialize_float.cu - src/neighbors/cagra_serialize_half.cu - src/neighbors/cagra_serialize_int8.cu - src/neighbors/cagra_serialize_uint8.cu - src/neighbors/detail/cagra/cagra_build.cpp - src/neighbors/detail/cagra/topk_for_cagra/topk.cu - $<$:src/neighbors/hnsw.cpp> - src/neighbors/ivf_flat_index.cpp - src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_build_extend_int8_t_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_build_extend_uint8_t_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_helpers.cu - src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cu - src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cu - src/neighbors/ivf_pq_index.cpp - src/neighbors/ivf_pq/ivf_pq_build_common.cu - src/neighbors/ivf_pq/ivf_pq_serialize.cu - src/neighbors/ivf_pq/ivf_pq_deserialize.cu - src/neighbors/ivf_pq/detail/ivf_pq_build_extend_float_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_build_extend_half_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_build_extend_int8_t_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_build_extend_uint8_t_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu - 
src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu - src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_search_half_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_search_int8_t_int64_t.cu - src/neighbors/ivf_pq/detail/ivf_pq_search_uint8_t_int64_t.cu - src/neighbors/nn_descent.cu - src/neighbors/nn_descent_float.cu - src/neighbors/nn_descent_half.cu - src/neighbors/nn_descent_int8.cu - src/neighbors/nn_descent_uint8.cu - src/neighbors/reachability.cu - src/neighbors/refine/detail/refine_device_float_float.cu - src/neighbors/refine/detail/refine_device_half_float.cu - src/neighbors/refine/detail/refine_device_int8_t_float.cu - src/neighbors/refine/detail/refine_device_uint8_t_float.cu - src/neighbors/refine/detail/refine_host_float_float.cpp - src/neighbors/refine/detail/refine_host_half_float.cpp - src/neighbors/refine/detail/refine_host_int8_t_float.cpp - src/neighbors/refine/detail/refine_host_uint8_t_float.cpp - src/neighbors/sample_filter.cu - src/neighbors/vamana_build_float.cu - src/neighbors/vamana_build_uint8.cu - src/neighbors/vamana_build_int8.cu - src/neighbors/vamana_serialize_float.cu - src/neighbors/vamana_serialize_uint8.cu - src/neighbors/vamana_serialize_int8.cu - src/selection/select_k_float_int64_t.cu - src/selection/select_k_float_int32_t.cu - src/selection/select_k_float_uint32_t.cu - 
src/selection/select_k_half_uint32_t.cu - src/stats/silhouette_score.cu - src/stats/trustworthiness_score.cu -) +if(BUILD_SHARED_LIBS) + add_library( + cuvs-cagra-search STATIC + src/neighbors/cagra_search_float.cu + src/neighbors/cagra_search_half.cu + src/neighbors/cagra_search_int8.cu + src/neighbors/cagra_search_uint8.cu + src/neighbors/detail/cagra/compute_distance.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_InnerProduct_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_float_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim256_t16.cu + 
src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_half_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_int8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim128_t8.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim256_t16.cu + src/neighbors/detail/cagra/compute_distance_standard_L2Expanded_uint8_uint32_dim512_t32.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_float_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_half_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_2subd_half.cu + 
src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_int8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim128_t8_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim256_t16_8pq_4subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_2subd_half.cu + src/neighbors/detail/cagra/compute_distance_vpq_L2Expanded_uint8_uint32_dim512_t32_8pq_4subd_half.cu + src/neighbors/detail/cagra/search_multi_cta_float_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_half_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_int8_uint32.cu + src/neighbors/detail/cagra/search_multi_cta_uint8_uint32.cu + src/neighbors/detail/cagra/search_single_cta_float_uint32.cu + src/neighbors/detail/cagra/search_single_cta_half_uint32.cu + src/neighbors/detail/cagra/search_single_cta_int8_uint32.cu + src/neighbors/detail/cagra/search_single_cta_uint8_uint32.cu + ) -set_target_properties( - cuvs_objs - PROPERTIES CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON -) -target_compile_options( - cuvs_objs PRIVATE "$<$:${CUVS_CXX_FLAGS}>" - "$<$:${CUVS_CUDA_FLAGS}>" -) -target_link_libraries( - cuvs_objs PUBLIC raft::raft rmm::rmm ${CUVS_CTK_MATH_DEPENDENCIES} - $ -) + 
file(GLOB_RECURSE compute_distance_sources "src/neighbors/detail/cagra/compute_distance_*.cu") + set_source_files_properties(${compute_distance_sources} PROPERTIES COMPILE_FLAGS -maxrregcount=64) -add_library(cuvs SHARED $) -add_library(cuvs_static STATIC $) + set_target_properties( + cuvs-cagra-search + PROPERTIES BUILD_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + CUDA_SEPARABLE_COMPILATION ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + POSITION_INDEPENDENT_CODE ON + ) + target_link_libraries(cuvs-cagra-search PRIVATE raft::raft) + target_include_directories( + cuvs-cagra-search PRIVATE "$" + ) + target_compile_options( + cuvs-cagra-search PRIVATE "$<$:${CUVS_CXX_FLAGS}>" + "$<$:${CUVS_CUDA_FLAGS}>" + ) -target_compile_options( - cuvs INTERFACE $<$:--expt-extended-lambda - --expt-relaxed-constexpr> -) + if(BUILD_MG_ALGOS) + set(CUVS_MG_ALGOS + src/neighbors/mg/mg_flat_float_int64_t.cu + src/neighbors/mg/mg_flat_int8_t_int64_t.cu + src/neighbors/mg/mg_flat_uint8_t_int64_t.cu + src/neighbors/mg/mg_pq_float_int64_t.cu + src/neighbors/mg/mg_pq_half_int64_t.cu + src/neighbors/mg/mg_pq_int8_t_int64_t.cu + src/neighbors/mg/mg_pq_uint8_t_int64_t.cu + src/neighbors/mg/mg_cagra_float_uint32_t.cu + src/neighbors/mg/mg_cagra_half_uint32_t.cu + src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu + src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu + src/neighbors/mg/omp_checks.cpp + src/neighbors/mg/nccl_comm.cpp + ) + endif() -add_library(cuvs::cuvs ALIAS cuvs) -add_library(cuvs::cuvs_static ALIAS cuvs_static) - -set_target_properties( - cuvs_static - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - EXPORT_NAME cuvs_static -) + add_library( + cuvs_objs OBJECT + src/cluster/kmeans_balanced_fit_float.cu + src/cluster/kmeans_fit_mg_float.cu + src/cluster/kmeans_fit_mg_double.cu + 
src/cluster/kmeans_fit_double.cu + src/cluster/kmeans_fit_float.cu + src/cluster/kmeans_auto_find_k_float.cu + src/cluster/kmeans_fit_predict_double.cu + src/cluster/kmeans_fit_predict_float.cu + src/cluster/kmeans_predict_double.cu + src/cluster/kmeans_predict_float.cu + src/cluster/kmeans_balanced_fit_float.cu + src/cluster/kmeans_balanced_fit_predict_float.cu + src/cluster/kmeans_balanced_predict_float.cu + src/cluster/kmeans_balanced_fit_int8.cu + src/cluster/kmeans_balanced_fit_predict_int8.cu + src/cluster/kmeans_balanced_predict_int8.cu + src/cluster/kmeans_transform_double.cu + src/cluster/kmeans_transform_float.cu + src/cluster/single_linkage_float.cu + src/core/bitset.cu + src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu + 
src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_half_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu + src/distance/detail/pairwise_matrix/dispatch_russel_rao_half_float_float_int.cu + 
src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu + src/distance/detail/pairwise_matrix/dispatch_rbf.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int64_t.cu + src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int64_t.cu + src/distance/detail/fused_distance_nn.cu + src/distance/distance.cu + src/distance/pairwise_distance.cu + src/neighbors/brute_force.cu + src/neighbors/cagra_build_float.cu + src/neighbors/cagra_build_half.cu + src/neighbors/cagra_build_int8.cu + src/neighbors/cagra_build_uint8.cu + src/neighbors/cagra_extend_float.cu + src/neighbors/cagra_extend_int8.cu + src/neighbors/cagra_extend_uint8.cu + src/neighbors/cagra_optimize.cu + src/neighbors/cagra_serialize_float.cu + src/neighbors/cagra_serialize_half.cu + src/neighbors/cagra_serialize_int8.cu + src/neighbors/cagra_serialize_uint8.cu + src/neighbors/iface/iface_cagra_float_uint32_t.cu + src/neighbors/iface/iface_cagra_half_uint32_t.cu + src/neighbors/iface/iface_cagra_int8_t_uint32_t.cu + src/neighbors/iface/iface_cagra_uint8_t_uint32_t.cu + src/neighbors/iface/iface_flat_float_int64_t.cu + src/neighbors/iface/iface_flat_int8_t_int64_t.cu + src/neighbors/iface/iface_flat_uint8_t_int64_t.cu + src/neighbors/iface/iface_pq_float_int64_t.cu + src/neighbors/iface/iface_pq_half_int64_t.cu + src/neighbors/iface/iface_pq_int8_t_int64_t.cu + src/neighbors/iface/iface_pq_uint8_t_int64_t.cu + src/neighbors/detail/cagra/cagra_build.cpp + src/neighbors/detail/cagra/topk_for_cagra/topk.cu + $<$:src/neighbors/hnsw.cpp> + src/neighbors/ivf_flat_index.cpp + src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_build_extend_int8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_build_extend_uint8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_helpers.cu + src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu + 
src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_serialize_float_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_serialize_int8_t_int64_t.cu + src/neighbors/ivf_flat/ivf_flat_serialize_uint8_t_int64_t.cu + src/neighbors/ivf_pq_index.cpp + src/neighbors/ivf_pq/ivf_pq_build_common.cu + src/neighbors/ivf_pq/ivf_pq_serialize.cu + src/neighbors/ivf_pq/ivf_pq_deserialize.cu + src/neighbors/ivf_pq/detail/ivf_pq_build_extend_float_int64_t.cu + src/neighbors/ivf_pq/detail/ivf_pq_build_extend_half_int64_t.cu + src/neighbors/ivf_pq/detail/ivf_pq_build_extend_int8_t_int64_t.cu + src/neighbors/ivf_pq/detail/ivf_pq_build_extend_uint8_t_int64_t.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_half_half_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_half_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_float_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu + src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu + src/neighbors/ivf_pq/detail/ivf_pq_search_half_int64_t.cu + src/neighbors/ivf_pq/detail/ivf_pq_search_int8_t_int64_t.cu + 
src/neighbors/ivf_pq/detail/ivf_pq_search_uint8_t_int64_t.cu + src/neighbors/nn_descent.cu + src/neighbors/nn_descent_float.cu + src/neighbors/nn_descent_half.cu + src/neighbors/nn_descent_int8.cu + src/neighbors/nn_descent_uint8.cu + src/neighbors/reachability.cu + src/neighbors/refine/detail/refine_device_float_float.cu + src/neighbors/refine/detail/refine_device_half_float.cu + src/neighbors/refine/detail/refine_device_int8_t_float.cu + src/neighbors/refine/detail/refine_device_uint8_t_float.cu + src/neighbors/refine/detail/refine_host_float_float.cpp + src/neighbors/refine/detail/refine_host_half_float.cpp + src/neighbors/refine/detail/refine_host_int8_t_float.cpp + src/neighbors/refine/detail/refine_host_uint8_t_float.cpp + src/neighbors/sample_filter.cu + src/neighbors/vamana_build_float.cu + src/neighbors/vamana_build_uint8.cu + src/neighbors/vamana_build_int8.cu + src/neighbors/vamana_serialize_float.cu + src/neighbors/vamana_serialize_uint8.cu + src/neighbors/vamana_serialize_int8.cu + src/selection/select_k_float_int64_t.cu + src/selection/select_k_float_int32_t.cu + src/selection/select_k_float_uint32_t.cu + src/selection/select_k_half_uint32_t.cu + src/stats/silhouette_score.cu + src/stats/trustworthiness_score.cu + ${CUVS_MG_ALGOS} + ) -target_compile_options(cuvs_static PRIVATE "$<$:${CUVS_CXX_FLAGS}>") + set_target_properties( + cuvs_objs + PROPERTIES CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + ) + target_compile_options( + cuvs_objs PRIVATE "$<$:${CUVS_CXX_FLAGS}>" + "$<$:${CUVS_CUDA_FLAGS}>" + ) + target_link_libraries( + cuvs_objs PUBLIC raft::raft rmm::rmm ${CUVS_CTK_MATH_DEPENDENCIES} + $ + ) -target_include_directories( - cuvs_objs - PUBLIC "$" - "$" - INTERFACE "$" -) + add_library(cuvs SHARED $) + add_library(cuvs_static STATIC $) -target_include_directories( - cuvs_static - PUBLIC "$" - INTERFACE "$" -) + target_compile_options( + cuvs INTERFACE 
$<$:--expt-extended-lambda + --expt-relaxed-constexpr> + ) -# ensure CUDA symbols aren't relocated to the middle of the debug build binaries -target_link_options(cuvs_static PRIVATE $) + add_library(cuvs::cuvs ALIAS cuvs) + add_library(cuvs::cuvs_static ALIAS cuvs_static) -target_include_directories( - cuvs_static PUBLIC "$" - "$" -) + set_target_properties( + cuvs_static + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + EXPORT_NAME cuvs_static + ) -target_include_directories( - cuvs PUBLIC "$" - "$" -) + target_compile_options(cuvs_static PRIVATE "$<$:${CUVS_CXX_FLAGS}>") -rapids_find_package( - OpenMP REQUIRED - BUILD_EXPORT_SET cuvs-exports - INSTALL_EXPORT_SET cuvs-exports -) + target_include_directories( + cuvs_objs + PUBLIC "$" + "$" + INTERFACE "$" + ) -if(NOT BUILD_CPU_ONLY) + target_include_directories( + cuvs_static + PUBLIC "$" + INTERFACE "$" + ) - set(CUVS_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) - set(CUVS_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) - set(CUVS_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) - set(CUVS_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) + # ensure CUDA symbols aren't relocated to the middle of the debug build binaries + target_link_options(cuvs_static PRIVATE $) - set(CUVS_CTK_MATH_DEPENDENCIES ${CUVS_CUBLAS_DEPENDENCY} ${CUVS_CUSOLVER_DEPENDENCY} - ${CUVS_CUSPARSE_DEPENDENCY} ${CUVS_CURAND_DEPENDENCY} + target_include_directories( + cuvs_static PUBLIC "$" + "$" ) - # Keep cuVS as lightweight as possible. Only CUDA libs and rmm should be used in global target. 
- target_link_libraries( - cuvs - PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES} - PRIVATE nvidia::cutlass::cutlass $ cuvs-cagra-search + target_include_directories( + cuvs PUBLIC "$" + "$" ) - target_link_libraries( - cuvs_static - PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES} - PRIVATE nvidia::cutlass::cutlass $ cuvs-cagra-search + rapids_find_package( + OpenMP REQUIRED + BUILD_EXPORT_SET cuvs-exports + INSTALL_EXPORT_SET cuvs-exports ) -endif() -if(BUILD_CAGRA_HNSWLIB) - target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib) - target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) -endif() + if(NOT BUILD_CPU_ONLY) -# Endian detection -include(TestBigEndian) -test_big_endian(BIG_ENDIAN) -if(BIG_ENDIAN) - target_compile_definitions(cuvs PRIVATE CUVS_SYSTEM_LITTLE_ENDIAN=0) -else() - target_compile_definitions(cuvs PRIVATE CUVS_SYSTEM_LITTLE_ENDIAN=1) -endif() + set(CUVS_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix}) + set(CUVS_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix}) + set(CUVS_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix}) + set(CUVS_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix}) + + set(CUVS_CTK_MATH_DEPENDENCIES ${CUVS_CUBLAS_DEPENDENCY} ${CUVS_CUSOLVER_DEPENDENCY} + ${CUVS_CUSPARSE_DEPENDENCY} ${CUVS_CURAND_DEPENDENCY} + ) + + if(BUILD_MG_ALGOS) + set(CUVS_COMMS_DEPENDENCY nccl) + endif() + + # Keep cuVS as lightweight as possible. Only CUDA libs and rmm should be used in global target. 
+ target_link_libraries( + cuvs + PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES} + PRIVATE nvidia::cutlass::cutlass $ + cuvs-cagra-search ${CUVS_COMMS_DEPENDENCY} + ) + + target_link_libraries( + cuvs_static + PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES} + PRIVATE nvidia::cutlass::cutlass $ + ) + endif() + + if(BUILD_MG_ALGOS) + target_compile_definitions(cuvs PUBLIC CUVS_BUILD_MG_ALGOS) + target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_MG_ALGOS) + endif() + + if(BUILD_CAGRA_HNSWLIB) + target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib) + target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) + endif() + + # Endian detection + include(TestBigEndian) + test_big_endian(BIG_ENDIAN) + if(BIG_ENDIAN) + target_compile_definitions(cuvs PRIVATE CUVS_SYSTEM_LITTLE_ENDIAN=0) + else() + target_compile_definitions(cuvs PRIVATE CUVS_SYSTEM_LITTLE_ENDIAN=1) + endif() -file( - WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" - [=[ + file( + WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld" + [=[ SECTIONS { .nvFatBinSegment : { *(.nvFatBinSegment) } .nv_fatbin : { *(.nv_fatbin) } } ]=] -) - -# ################################################################################################## -# * NVTX support in cuvs ----------------------------------------------------- - -if(CUVS_NVTX) - # This enables NVTX within the project with no option to disable it downstream. - target_link_libraries(cuvs PUBLIC CUDA::nvtx3) - target_compile_definitions(cuvs PUBLIC NVTX_ENABLED) -else() - # Allow enable NVTX downstream if not set here. This creates a new option at build/install time, - # which is set by default to OFF, but can be enabled in the dependent project. 
- get_property( - nvtx_option_help_string - CACHE CUVS_NVTX - PROPERTY HELPSTRING ) - string( - CONCAT - nvtx_export_string - "option(CUVS_NVTX \"" - ${nvtx_option_help_string} - "\" OFF)" - [=[ + + # ################################################################################################ + # * NVTX support in cuvs ----------------------------------------------------- + + if(CUVS_NVTX) + # This enables NVTX within the project with no option to disable it downstream. + target_link_libraries(cuvs PUBLIC CUDA::nvtx3) + target_compile_definitions(cuvs PUBLIC NVTX_ENABLED) + else() + # Allow enable NVTX downstream if not set here. This creates a new option at build/install time, + # which is set by default to OFF, but can be enabled in the dependent project. + get_property( + nvtx_option_help_string + CACHE CUVS_NVTX + PROPERTY HELPSTRING + ) + string( + CONCAT + nvtx_export_string + "option(CUVS_NVTX \"" + ${nvtx_option_help_string} + "\" OFF)" + [=[ target_link_libraries(cuvs::cuvs INTERFACE $<$:CUDA::nvtx3>) target_compile_definitions(cuvs::cuvs INTERFACE $<$:NVTX_ENABLED>) ]=] - ) -endif() - -set_target_properties( - cuvs - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - INTERFACE_POSITION_INDEPENDENT_CODE ON - POSITION_INDEPENDENT_CODE ON -) - -target_compile_options( - cuvs PRIVATE "$<$:${CUVS_CXX_FLAGS}>" - "$<$:${CUVS_CUDA_FLAGS}>" -) -# ensure CUDA symbols aren't relocated to the middle of the debug build binaries -target_link_options(cuvs PRIVATE $) - -# ################################################################################################## -# * cuvs_c ------------------------------------------------------------------------------- -if(BUILD_C_LIBRARY) - add_library( - cuvs_c SHARED - src/core/c_api.cpp - src/neighbors/brute_force_c.cpp - src/neighbors/ivf_flat_c.cpp - src/neighbors/ivf_pq_c.cpp - src/neighbors/cagra_c.cpp - 
$<$:src/neighbors/hnsw_c.cpp> - src/neighbors/refine/refine_c.cpp - src/distance/pairwise_distance_c.cpp - ) - - if(BUILD_CAGRA_HNSWLIB) - target_link_libraries(cuvs_c PRIVATE hnswlib::hnswlib) - target_compile_definitions(cuvs_c PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) + ) endif() - add_library(cuvs::c_api ALIAS cuvs_c) - set_target_properties( - cuvs_c + cuvs PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON INTERFACE_POSITION_INDEPENDENT_CODE ON - EXPORT_NAME c_api + POSITION_INDEPENDENT_CODE ON ) - target_compile_options(cuvs_c PRIVATE "$<$:${CUVS_CXX_FLAGS}>") - - target_include_directories( - cuvs_c - PUBLIC "$" - INTERFACE "$" + target_compile_options( + cuvs PRIVATE "$<$:${CUVS_CXX_FLAGS}>" + "$<$:${CUVS_CUDA_FLAGS}>" ) + # ensure CUDA symbols aren't relocated to the middle of the debug build binaries + target_link_options(cuvs PRIVATE $) + + # ################################################################################################ + # * cuvs_c ------------------------------------------------------------------------------- + if(BUILD_C_LIBRARY) + add_library( + cuvs_c SHARED + src/core/c_api.cpp + src/neighbors/brute_force_c.cpp + src/neighbors/ivf_flat_c.cpp + src/neighbors/ivf_pq_c.cpp + src/neighbors/cagra_c.cpp + $<$:src/neighbors/hnsw_c.cpp> + src/neighbors/refine/refine_c.cpp + src/distance/pairwise_distance_c.cpp + ) - target_link_libraries( - cuvs_c - PUBLIC cuvs::cuvs ${CUVS_CTK_MATH_DEPENDENCIES} - PRIVATE raft::raft - ) + if(BUILD_CAGRA_HNSWLIB) + target_link_libraries(cuvs_c PRIVATE hnswlib::hnswlib) + target_compile_definitions(cuvs_c PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) + endif() + + add_library(cuvs::c_api ALIAS cuvs_c) + + set_target_properties( + cuvs_c + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + 
INTERFACE_POSITION_INDEPENDENT_CODE ON + EXPORT_NAME c_api + ) - # ensure CUDA symbols aren't relocated to the middle of the debug build binaries - target_link_options(cuvs_c PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") -endif() + target_compile_options(cuvs_c PRIVATE "$<$:${CUVS_CXX_FLAGS}>") -# ################################################################################################## -# * install targets----------------------------------------------------------- -rapids_cmake_install_lib_dir(lib_dir) -include(GNUInstallDirs) -include(CPack) - -install( - TARGETS cuvs cuvs_static cuvs-cagra-search - DESTINATION ${lib_dir} - COMPONENT cuvs - EXPORT cuvs-exports -) + target_include_directories( + cuvs_c + PUBLIC "$" + INTERFACE "$" + ) -install( - DIRECTORY include/cuvs - COMPONENT cuvs - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} -) + target_link_libraries( + cuvs_c + PUBLIC cuvs::cuvs ${CUVS_CTK_MATH_DEPENDENCIES} + PRIVATE raft::raft + ) + + # ensure CUDA symbols aren't relocated to the middle of the debug build binaries + target_link_options(cuvs_c PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") + endif() + + # ################################################################################################ + # * install targets----------------------------------------------------------- + rapids_cmake_install_lib_dir(lib_dir) + include(GNUInstallDirs) + include(CPack) -if(BUILD_C_LIBRARY) install( - TARGETS cuvs_c + TARGETS cuvs cuvs_static DESTINATION ${lib_dir} - COMPONENT c_api - EXPORT cuvs-c-exports + COMPONENT cuvs + EXPORT cuvs-exports ) -endif() -install( - FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cuvs/version_config.hpp - COMPONENT cuvs - DESTINATION include/cuvs -) + install( + DIRECTORY include/cuvs + COMPONENT cuvs + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) -if(TARGET cuvs_c) - list(APPEND cuvs_components c_api) - list(APPEND cuvs_export_sets cuvs-c-exports) - set(CUVS_C_TARGET cuvs_c) -endif() + if(BUILD_C_LIBRARY) + install( + TARGETS 
cuvs_c + DESTINATION ${lib_dir} + COMPONENT c_api + EXPORT cuvs-c-exports + ) + endif() -# Use `rapids_export` for 22.04 as it will have COMPONENT support -rapids_export( - INSTALL cuvs - EXPORT_SET cuvs-exports - COMPONENTS ${cuvs_components} - COMPONENTS_EXPORT_SET ${cuvs_export_sets} - GLOBAL_TARGETS cuvs ${CUVS_C_TARGET} - NAMESPACE cuvs:: -) + install( + FILES ${CMAKE_CURRENT_BINARY_DIR}/include/cuvs/version_config.hpp + COMPONENT cuvs + DESTINATION include/cuvs + ) -# ################################################################################################## -# * build export ------------------------------------------------------------- -rapids_export( - BUILD cuvs - EXPORT_SET cuvs-exports - COMPONENTS ${cuvs_components} - COMPONENTS_EXPORT_SET ${cuvs_export_sets} - GLOBAL_TARGETS cuvs ${CUVS_C_TARGET} - NAMESPACE cuvs:: -) + if(TARGET cuvs_c) + list(APPEND cuvs_components c_api) + list(APPEND cuvs_export_sets cuvs-c-exports) + set(CUVS_C_TARGET cuvs_c) + endif() + + # Use `rapids_export` for 22.04 as it will have COMPONENT support + rapids_export( + INSTALL cuvs + EXPORT_SET cuvs-exports + COMPONENTS ${cuvs_components} + COMPONENTS_EXPORT_SET ${cuvs_export_sets} + GLOBAL_TARGETS cuvs ${CUVS_C_TARGET} + NAMESPACE cuvs:: + ) + + # ################################################################################################ + # * build export ------------------------------------------------------------- + rapids_export( + BUILD cuvs + EXPORT_SET cuvs-exports + COMPONENTS ${cuvs_components} + COMPONENTS_EXPORT_SET ${cuvs_export_sets} + GLOBAL_TARGETS cuvs ${CUVS_C_TARGET} + NAMESPACE cuvs:: + ) +endif() # ################################################################################################## # * build test executable ---------------------------------------------------- diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 8cbf8c8b3..c36e70ace 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ 
b/cpp/bench/ann/CMakeLists.txt @@ -32,6 +32,7 @@ option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benc option(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB "Include cuVS CAGRA with HNSW search in benchmark" ON) option(CUVS_ANN_BENCH_USE_HNSWLIB "Include hnsw algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_GGNN "Include ggnn algorithm in benchmark" OFF) +option(CUVS_ANN_BENCH_USE_CUVS_MG "Include cuVS ann mg algorithm in benchmark" ${BUILD_MG_ALGOS}) option(CUVS_ANN_BENCH_SINGLE_EXE "Make a single executable with benchmark as shared library modules" OFF ) @@ -55,6 +56,7 @@ if(BUILD_CPU_ONLY) set(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OFF) set(CUVS_ANN_BENCH_USE_GGNN OFF) set(CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE OFF) + set(CUVS_ANN_BENCH_USE_CUVS_MG OFF) else() set(CUVS_FAISS_ENABLE_GPU ON) endif() @@ -66,6 +68,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OR CUVS_ANN_BENCH_USE_CUVS_CAGRA OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OR CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE + OR CUVS_ANN_BENCH_USE_CUVS_MG ) set(CUVS_ANN_BENCH_USE_CUVS ON) endif() @@ -245,6 +248,21 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) ) endif() +if(CUVS_ANN_BENCH_USE_CUVS_MG) + ConfigureAnnBench( + NAME + CUVS_MG + PATH + src/cuvs/cuvs_benchmark.cu + $<$:src/cuvs/cuvs_mg_ivf_flat.cu> + $<$:src/cuvs/cuvs_mg_ivf_pq.cu> + $<$:src/cuvs/cuvs_mg_cagra.cu> + LINKS + cuvs + nccl + ) +endif() + message("CUVS_FAISS_TARGETS: ${CUVS_FAISS_TARGETS}") message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}") if(CUVS_ANN_BENCH_USE_FAISS_CPU_FLAT) diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 22f0cab6f..57d5b1910 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -45,7 +45,18 @@ extern template class cuvs::bench::cuvs_cagra; extern template class cuvs::bench::cuvs_cagra; #endif -#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT +#ifdef 
CUVS_ANN_BENCH_USE_CUVS_MG +#include "cuvs_ivf_flat_wrapper.h" +#include "cuvs_mg_ivf_flat_wrapper.h" + +#include "cuvs_ivf_pq_wrapper.h" +#include "cuvs_mg_ivf_pq_wrapper.h" + +#include "cuvs_cagra_wrapper.h" +#include "cuvs_mg_cagra_wrapper.h" +#endif + +#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) template void parse_build_param(const nlohmann::json& conf, typename cuvs::bench::cuvs_ivf_flat::build_param& param) @@ -64,7 +75,7 @@ void parse_search_param(const nlohmann::json& conf, #endif #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \ - defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) + defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) template void parse_build_param(const nlohmann::json& conf, typename cuvs::bench::cuvs_ivf_pq::build_param& param) @@ -130,7 +141,8 @@ void parse_search_param(const nlohmann::json& conf, } #endif -#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) +#if defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || \ + defined(CUVS_ANN_BENCH_USE_CUVS_MG) template void parse_build_param(const nlohmann::json& conf, cuvs::neighbors::nn_descent::index_params& param) { diff --git a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu index a956ab139..893097236 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu @@ -29,6 +29,43 @@ namespace cuvs::bench { +#ifdef CUVS_ANN_BENCH_USE_CUVS_MG +void add_distribution_mode(cuvs::neighbors::mg::distribution_mode* dist_mode, + const nlohmann::json& conf) +{ + if (conf.contains("distribution_mode")) { + std::string distribution_mode = conf.at("distribution_mode"); + if (distribution_mode == "replicated") { + *dist_mode = cuvs::neighbors::mg::distribution_mode::REPLICATED; + } else if (distribution_mode == "sharded") { + *dist_mode = 
cuvs::neighbors::mg::distribution_mode::SHARDED; + } else { + throw std::runtime_error("invalid value for distribution_mode"); + } + } else { + // default + *dist_mode = cuvs::neighbors::mg::distribution_mode::SHARDED; + } +}; + +void add_merge_mode(cuvs::neighbors::mg::sharded_merge_mode* merge_mode, const nlohmann::json& conf) +{ + if (conf.contains("merge_mode")) { + std::string sharded_merge_mode = conf.at("merge_mode"); + if (sharded_merge_mode == "tree_merge") { + *merge_mode = cuvs::neighbors::mg::sharded_merge_mode::TREE_MERGE; + } else if (sharded_merge_mode == "merge_on_root_rank") { + *merge_mode = cuvs::neighbors::mg::sharded_merge_mode::MERGE_ON_ROOT_RANK; + } else { + throw std::runtime_error("invalid value for merge_mode"); + } + } else { + // default + *merge_mode = cuvs::neighbors::mg::sharded_merge_mode::TREE_MERGE; + } +}; +#endif + template auto create_algo(const std::string& algo_name, const std::string& distance, @@ -71,6 +108,32 @@ auto create_algo(const std::string& algo_name, parse_build_param(conf, param); a = std::make_unique>(metric, dim, param); } +#endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_MG + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "raft_mg_ivf_flat" || algo_name == "cuvs_mg_ivf_flat") { + typename cuvs::bench::cuvs_mg_ivf_flat::build_param param; + parse_build_param(conf, param); + add_distribution_mode(¶m.mode, conf); + a = std::make_unique>(metric, dim, param); + } + } + + if (algo_name == "raft_mg_ivf_pq" || algo_name == "cuvs_mg_ivf_pq") { + typename cuvs::bench::cuvs_mg_ivf_pq::build_param param; + parse_build_param(conf, param); + add_distribution_mode(¶m.mode, conf); + a = std::make_unique>(metric, dim, param); + } + + if (algo_name == "raft_mg_cagra" || algo_name == "cuvs_mg_cagra") { + typename cuvs::bench::cuvs_mg_cagra::build_param param; + parse_build_param(conf, param); + add_distribution_mode(¶m.mode, conf); + a = std::make_unique>(metric, dim, param); + } + #endif if (!a) { 
throw std::runtime_error("invalid algo: '" + algo_name + "'"); } @@ -113,6 +176,32 @@ auto create_search_param(const std::string& algo_name, const nlohmann::json& con return param; } #endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_MG + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + if (algo_name == "raft_mg_ivf_flat" || algo_name == "cuvs_mg_ivf_flat") { + auto param = + std::make_unique::search_param>(); + parse_search_param(conf, *param); + add_merge_mode(¶m->merge_mode, conf); + return param; + } + } + + if (algo_name == "raft_mg_ivf_pq" || algo_name == "cuvs_mg_ivf_pq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + add_merge_mode(¶m->merge_mode, conf); + return param; + } + + if (algo_name == "raft_mg_cagra" || algo_name == "cuvs_mg_cagra") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + add_merge_mode(¶m->merge_mode, conf); + return param; + } +#endif // else throw std::runtime_error("invalid algo: '" + algo_name + "'"); diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index ff854f890..b2ba35eee 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -72,6 +72,23 @@ class cuvs_cagra : public algo, public algo_gpu { std::optional ivf_pq_refine_rate = std::nullopt; std::optional ivf_pq_build_params = std::nullopt; std::optional ivf_pq_search_params = std::nullopt; + + void prepare_build_params(const raft::extent_2d& dataset_extents) + { + if (algo == CagraBuildAlgo::kIvfPq) { + auto pq_params = cuvs::neighbors::cagra::graph_build_params::ivf_pq_params( + dataset_extents, cagra_params.metric); + if (ivf_pq_build_params) { pq_params.build_params = *ivf_pq_build_params; } + if (ivf_pq_search_params) { pq_params.search_params = *ivf_pq_search_params; } + if (ivf_pq_refine_rate) { pq_params.refinement_rate = *ivf_pq_refine_rate; } + 
cagra_params.graph_build_params = pq_params; + } else if (algo == CagraBuildAlgo::kNnDescent) { + auto nn_params = cuvs::neighbors::cagra::graph_build_params::nn_descent_params( + cagra_params.intermediate_graph_degree); + if (nn_descent_params) { nn_params = *nn_descent_params; } + cagra_params.graph_build_params = nn_params; + } + } }; cuvs_cagra(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) @@ -168,28 +185,9 @@ template void cuvs_cagra::build(const T* dataset, size_t nrow) { auto dataset_extents = raft::make_extents(nrow, dimension_); + index_params_.prepare_build_params(dataset_extents); auto& params = index_params_.cagra_params; - - if (index_params_.algo == CagraBuildAlgo::kIvfPq) { - auto pq_params = - cuvs::neighbors::cagra::graph_build_params::ivf_pq_params(dataset_extents, params.metric); - if (index_params_.ivf_pq_build_params) { - pq_params.build_params = *index_params_.ivf_pq_build_params; - } - if (index_params_.ivf_pq_search_params) { - pq_params.search_params = *index_params_.ivf_pq_search_params; - } - if (index_params_.ivf_pq_refine_rate) { - pq_params.refinement_rate = *index_params_.ivf_pq_refine_rate; - } - params.graph_build_params = pq_params; - } else if (index_params_.algo == CagraBuildAlgo::kNnDescent) { - auto nn_params = cuvs::neighbors::cagra::graph_build_params::nn_descent_params( - params.intermediate_graph_degree); - if (index_params_.nn_descent_params) { nn_params = *index_params_.nn_descent_params; } - params.graph_build_params = nn_params; - } auto dataset_view_host = raft::make_mdspan(dataset, dataset_extents); auto dataset_view_device = diff --git a/cpp/bench/ann/src/cuvs/cuvs_mg_cagra.cu b/cpp/bench/ann/src/cuvs/cuvs_mg_cagra.cu new file mode 100644 index 000000000..801caa85f --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_mg_cagra.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_mg_cagra_wrapper.h" + +namespace cuvs::bench { +template class cuvs_mg_cagra; +template class cuvs_mg_cagra; +template class cuvs_mg_cagra; +template class cuvs_mg_cagra; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_mg_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_mg_cagra_wrapper.h new file mode 100644 index 000000000..50c1ff4db --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_mg_cagra_wrapper.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "cuvs_ann_bench_utils.h" +#include "cuvs_cagra_wrapper.h" +#include +#include + +namespace cuvs::bench { +using namespace cuvs::neighbors; + +enum class AllocatorType; +enum class CagraBuildAlgo; + +template +class cuvs_mg_cagra : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using algo::dim_; + + struct build_param : public cuvs::bench::cuvs_cagra::build_param { + cuvs::neighbors::mg::distribution_mode mode; + }; + + struct search_param : public cuvs::bench::cuvs_cagra::search_param { + cuvs::neighbors::mg::sharded_merge_mode merge_mode; + }; + + cuvs_mg_cagra(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) + : algo(metric, dim), index_params_(param) + { + index_params_.cagra_params.metric = parse_metric_type(metric); + index_params_.ivf_pq_build_params->metric = parse_metric_type(metric); + + // init nccl clique outside as to not affect benchmark + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle_); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param) override; + + void set_search_dataset(const T* dataset, size_t nrow) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + void search_base(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + auto stream = raft::resource::get_cuda_stream(handle_); + return stream; + } + + // to enable dataset access from GPU memory + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + void save(const std::string& file) const 
override; + void load(const std::string&) override; + void save_to_hnswlib(const std::string& file) const; + std::unique_ptr> copy() override; + + private: + raft::device_resources handle_; + float refine_ratio_; + build_param index_params_; + cuvs::neighbors::mg::search_params search_params_; + std::shared_ptr, T, IdxT>> + index_; +}; + +template +void cuvs_mg_cagra::build(const T* dataset, size_t nrow) +{ + auto dataset_extents = raft::make_extents(nrow, dim_); + index_params_.prepare_build_params(dataset_extents); + cuvs::neighbors::mg::index_params build_params = index_params_.cagra_params; + build_params.mode = index_params_.mode; + + auto dataset_view = + raft::make_host_matrix_view(dataset, nrow, dim_); + auto idx = cuvs::neighbors::mg::build(handle_, build_params, dataset_view); + index_ = + std::make_shared, T, IdxT>>( + std::move(idx)); +} + +inline auto allocator_to_string(AllocatorType mem_type) -> std::string; + +template +void cuvs_mg_cagra::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + // search_params_ = static_cast>(sp.p); + cagra::search_params* search_params_ptr_ = static_cast(&search_params_); + *search_params_ptr_ = sp.p; + search_params_.merge_mode = sp.merge_mode; + refine_ratio_ = sp.refine_ratio; +} + +template +void cuvs_mg_cagra::set_search_dataset(const T* dataset, size_t nrow) +{ +} + +template +void cuvs_mg_cagra::save(const std::string& file) const +{ + cuvs::neighbors::mg::serialize(handle_, *index_, file); +} + +template +void cuvs_mg_cagra::load(const std::string& file) +{ + index_ = + std::make_shared, T, IdxT>>( + std::move(cuvs::neighbors::mg::deserialize_cagra(handle_, file))); +} + +template +std::unique_ptr> cuvs_mg_cagra::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_mg_cagra::search_base( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(std::is_integral_v); + 
static_assert(std::is_integral_v); + + auto queries_view = + raft::make_host_matrix_view(queries, batch_size, dim_); + auto neighbors_view = + raft::make_host_matrix_view((IdxT*)neighbors, batch_size, k); + auto distances_view = + raft::make_host_matrix_view(distances, batch_size, k); + + cuvs::neighbors::mg::search( + handle_, *index_, search_params_, queries_view, neighbors_view, distances_view); +} + +template +void cuvs_mg_cagra::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto k0 = static_cast(refine_ratio_ * k); + const bool disable_refinement = k0 <= static_cast(k); + + if (disable_refinement) { + search_base(queries, batch_size, k, neighbors, distances); + } else { + throw std::runtime_error("refinement not supported"); + } +} +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_flat.cu b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_flat.cu new file mode 100644 index 000000000..20cdc41e3 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_flat.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "cuvs_mg_ivf_flat_wrapper.h" + +namespace cuvs::bench { +template class cuvs_mg_ivf_flat; +// template class cuvs_mg_ivf_flat; +template class cuvs_mg_ivf_flat; +template class cuvs_mg_ivf_flat; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_flat_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_flat_wrapper.h new file mode 100644 index 000000000..54a0d2fac --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_flat_wrapper.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cuvs_ann_bench_utils.h" +#include "cuvs_ivf_flat_wrapper.h" +#include +#include + +namespace cuvs::bench { +using namespace cuvs::neighbors; + +template +class cuvs_mg_ivf_flat : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using algo::dim_; + + using build_param = cuvs::neighbors::mg::index_params; + + struct search_param : public cuvs::bench::cuvs_ivf_flat::search_param { + cuvs::neighbors::mg::sharded_merge_mode merge_mode; + }; + + cuvs_mg_ivf_flat(Metric metric, int dim, const build_param& param) + : algo(metric, dim), index_params_(param) + { + index_params_.metric = parse_metric_type(metric); + // init nccl clique outside as to not affect benchmark + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle_); + } + + void build(const T* dataset, size_t nrow) final; + void set_search_param(const search_param_base& param) override; + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + auto stream = raft::resource::get_cuda_stream(handle_); + return stream; + } + + [[nodiscard]] auto uses_stream() const noexcept -> bool override { return false; } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override; + + private: + raft::device_resources handle_; + build_param index_params_; + cuvs::neighbors::mg::search_params search_params_; + std::shared_ptr, T, IdxT>> + index_; +}; + +template +void cuvs_mg_ivf_flat::build(const T* dataset, size_t nrow) +{ + auto dataset_view = + 
raft::make_host_matrix_view(dataset, IdxT(nrow), IdxT(dim_)); + auto idx = cuvs::neighbors::mg::build(handle_, index_params_, dataset_view); + index_ = std::make_shared< + cuvs::neighbors::mg::index, T, IdxT>>(std::move(idx)); +} + +template +void cuvs_mg_ivf_flat::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + // search_params_ = sp.ivf_flat_params; + ivf_flat::search_params* search_params_ptr_ = + static_cast(&search_params_); + *search_params_ptr_ = sp.ivf_flat_params; + search_params_.merge_mode = sp.merge_mode; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_mg_ivf_flat::save(const std::string& file) const +{ + cuvs::neighbors::mg::serialize(handle_, *index_, file); +} + +template +void cuvs_mg_ivf_flat::load(const std::string& file) +{ + index_ = std::make_shared< + cuvs::neighbors::mg::index, T, IdxT>>( + std::move(cuvs::neighbors::mg::deserialize_flat(handle_, file))); +} + +template +std::unique_ptr> cuvs_mg_ivf_flat::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_mg_ivf_flat::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto queries_view = raft::make_host_matrix_view( + queries, IdxT(batch_size), IdxT(dim_)); + auto neighbors_view = raft::make_host_matrix_view( + (IdxT*)neighbors, IdxT(batch_size), IdxT(k)); + auto distances_view = raft::make_host_matrix_view( + distances, IdxT(batch_size), IdxT(k)); + + cuvs::neighbors::mg::search( + handle_, *index_, search_params_, queries_view, neighbors_view, distances_view); +} + +} // namespace cuvs::bench \ No newline at end of file diff --git a/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_pq.cu b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_pq.cu new file mode 100644 index 000000000..a74bab6f5 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_pq.cu @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "cuvs_mg_ivf_pq_wrapper.h" + +namespace cuvs::bench { +template class cuvs_mg_ivf_pq; +template class cuvs_mg_ivf_pq; +template class cuvs_mg_ivf_pq; +template class cuvs_mg_ivf_pq; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_pq_wrapper.h new file mode 100644 index 000000000..84aea7d4a --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_mg_ivf_pq_wrapper.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cuvs_ann_bench_utils.h" +#include "cuvs_ivf_pq_wrapper.h" +#include +#include + +namespace cuvs::bench { +using namespace cuvs::neighbors; + +template +class cuvs_mg_ivf_pq : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + using algo::dim_; + + using build_param = cuvs::neighbors::mg::index_params; + + struct search_param : public cuvs::bench::cuvs_ivf_pq::search_param { + cuvs::neighbors::mg::sharded_merge_mode merge_mode; + }; + + cuvs_mg_ivf_pq(Metric metric, int dim, const build_param& param) + : algo(metric, dim), index_params_(param) + { + index_params_.metric = parse_metric_type(metric); + // init nccl clique outside as to not affect benchmark + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle_); + } + + void build(const T* dataset, size_t nrow) final; + void set_search_param(const search_param_base& param) override; + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHost; + property.query_memory_type = MemoryType::kHost; + return property; + } + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + auto stream = raft::resource::get_cuda_stream(handle_); + return stream; + } + + [[nodiscard]] auto uses_stream() const noexcept -> bool override { return false; } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override; + + private: + raft::device_resources handle_; + build_param index_params_; + cuvs::neighbors::mg::search_params search_params_; + std::shared_ptr, T, IdxT>> index_; +}; + +template +void cuvs_mg_ivf_pq::build(const T* dataset, size_t nrow) +{ + auto dataset_view = + raft::make_host_matrix_view(dataset, 
IdxT(nrow), IdxT(dim_)); + auto idx = cuvs::neighbors::mg::build(handle_, index_params_, dataset_view); + index_ = + std::make_shared, T, IdxT>>( + std::move(idx)); +} + +template +void cuvs_mg_ivf_pq::set_search_param(const search_param_base& param) +{ + auto sp = dynamic_cast(param); + // search_params_ = static_cast>(sp.pq_param); + ivf_pq::search_params* search_params_ptr_ = static_cast(&search_params_); + *search_params_ptr_ = sp.pq_param; + search_params_.merge_mode = sp.merge_mode; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_mg_ivf_pq::save(const std::string& file) const +{ + cuvs::neighbors::mg::serialize(handle_, *index_, file); +} + +template +void cuvs_mg_ivf_pq::load(const std::string& file) +{ + index_ = + std::make_shared, T, IdxT>>( + std::move(cuvs::neighbors::mg::deserialize_pq(handle_, file))); +} + +template +std::unique_ptr> cuvs_mg_ivf_pq::copy() +{ + return std::make_unique>(*this); // use copy constructor +} + +template +void cuvs_mg_ivf_pq::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + auto queries_view = raft::make_host_matrix_view( + queries, IdxT(batch_size), IdxT(dim_)); + auto neighbors_view = raft::make_host_matrix_view( + (IdxT*)neighbors, IdxT(batch_size), IdxT(k)); + auto distances_view = raft::make_host_matrix_view( + distances, IdxT(batch_size), IdxT(k)); + + cuvs::neighbors::mg::search( + handle_, *index_, search_params_, queries_view, neighbors_view, distances_view); +} + +} // namespace cuvs::bench \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_cuvs.cmake b/cpp/cmake/thirdparty/get_cuvs.cmake new file mode 100644 index 000000000..c21cccbcc --- /dev/null +++ b/cpp/cmake/thirdparty/get_cuvs.cmake @@ -0,0 +1,64 @@ +# ============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# Use RAPIDS_VERSION_MAJOR_MINOR from rapids_config.cmake +set(CUVS_VERSION "${RAPIDS_VERSION_MAJOR_MINOR}") +set(CUVS_FORK "rapidsai") +set(CUVS_PINNED_TAG "branch-${RAPIDS_VERSION_MAJOR_MINOR}") + +function(find_and_configure_cuvs) + set(oneValueArgs VERSION FORK PINNED_TAG ENABLE_NVTX CLONE_ON_PIN BUILD_CPU_ONLY BUILD_SHARED_LIBS) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + if(PKG_CLONE_ON_PIN AND NOT PKG_PINNED_TAG STREQUAL "branch-${CUVS_VERSION}") + message(STATUS "cuVS: pinned tag found: ${PKG_PINNED_TAG}. 
Cloning cuVS locally.") + set(CPM_DOWNLOAD_cuvs ON) + endif() + + #----------------------------------------------------- + # Invoke CPM find_package() + #----------------------------------------------------- + rapids_cpm_find(cuvs ${PKG_VERSION} + GLOBAL_TARGETS cuvs::cuvs + BUILD_EXPORT_SET cuvs-bench-exports + INSTALL_EXPORT_SET cuvs-bench-exports + COMPONENTS cuvs + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/cuvs.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS + "BUILD_SHARED_LIBS ${PKG_BUILD_SHARED_LIBS}" + "BUILD_CPU_ONLY ${PKG_BUILD_CPU_ONLY}" + "BUILD_TESTS OFF" + "BUILD_CAGRA_HNSWLIB OFF" + "CUVS_CLONE_ON_PIN ${PKG_CLONE_ON_PIN}" + ) +endfunction() + + +# Change pinned tag here to test a commit in CI +# To use a different cuVS locally, set the CMake variable +# CPM_cuvs_SOURCE=/path/to/local/cuvs +find_and_configure_cuvs(VERSION ${CUVS_VERSION}.00 + FORK ${CUVS_FORK} + PINNED_TAG ${CUVS_PINNED_TAG} + ENABLE_NVTX OFF + # When PINNED_TAG above doesn't match the default rapids branch, + # force local cuvs clone in build directory + # even if it's already installed. + CLONE_ON_PIN ${CUVS_CLONE_ON_PIN} + BUILD_CPU_ONLY ${BUILD_CPU_ONLY} + BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS} +) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 2459d521d..e28572457 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -2149,7 +2149,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = "CUVS_BUILD_MG_ALGOS=1" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/cpp/include/cuvs/neighbors/cagra.h b/cpp/include/cuvs/neighbors/cagra.h index 241f5d8b0..14331ebbc 100644 --- a/cpp/include/cuvs/neighbors/cagra.h +++ b/cpp/include/cuvs/neighbors/cagra.h @@ -267,6 +267,15 @@ cuvsError_t cuvsCagraIndexCreate(cuvsCagraIndex_t* index); */ cuvsError_t cuvsCagraIndexDestroy(cuvsCagraIndex_t index); +/** + * @brief Get dimension of the CAGRA index + * + * @param[in] index CAGRA index + * @param[out] dim return dimension of the index + * @return cuvsError_t + */ +cuvsError_t cuvsCagraIndexGetDims(cuvsCagraIndex_t index, int* dim); + /** * @} */ @@ -338,7 +347,7 @@ cuvsError_t cuvsCagraBuild(cuvsResources_t res, * with the same type of `queries`, such that `index.dtype.code == * queries.dl_tensor.dtype.code` Types for input are: * 1. `queries`: - *` a. kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` + * a. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` * b. `kDLDataType.code == kDLInt` and `kDLDataType.bits = 8` * c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` * 2. 
`neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 32` diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 73ce80b41..60b8cc122 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include #include #include #include // get_device_for_address @@ -636,5 +637,56 @@ enable_if_valid_list_t deserialize_list(const raft::resources& handle, const typename ListT::spec_type& store_spec, const typename ListT::spec_type& device_spec); } // namespace ivf +} // namespace cuvs::neighbors + +namespace cuvs::neighbors { +using namespace raft; + +template +struct iface { + iface() : mutex_(std::make_shared()) {} + + const IdxT size() const { return index_.value().size(); } + + std::optional index_; + std::shared_ptr mutex_; +}; + +template +void build(const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + const cuvs::neighbors::index_params* index_params, + raft::mdspan, row_major, Accessor> index_dataset); + +template +void extend( + const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + raft::mdspan, row_major, Accessor1> new_vectors, + std::optional, layout_c_contiguous, Accessor2>> + new_indices); + +template +void search(const raft::device_resources& handle, + const cuvs::neighbors::iface& interface, + const cuvs::neighbors::search_params* search_params, + raft::device_matrix_view h_queries, + raft::device_matrix_view d_neighbors, + raft::device_matrix_view d_distances); + +template +void serialize(const raft::device_resources& handle, + const cuvs::neighbors::iface& interface, + std::ostream& os); + +template +void deserialize(const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + std::istream& is); + +template +void deserialize(const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + const std::string& filename); }; // 
namespace cuvs::neighbors diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h index 5e94de60a..0495c574a 100644 --- a/cpp/include/cuvs/neighbors/hnsw.h +++ b/cpp/include/cuvs/neighbors/hnsw.h @@ -105,8 +105,10 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * with the same type of `queries`, such that `index.dtype.code == * queries.dl_tensor.dtype.code` * Supported types for input are: - * 1. `queries`: `kDLDataType.code == kDLFloat` or `kDLDataType.code == kDLInt` and - * `kDLDataType.bits = 32` + * 1. `queries`: + * a. `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` + * b. `kDLDataType.code == kDLInt` and `kDLDataType.bits = 8` + * c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` * 2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64` * 3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp index 007adef0d..d5abd6d55 100644 --- a/cpp/include/cuvs/neighbors/hnsw.hpp +++ b/cpp/include/cuvs/neighbors/hnsw.hpp @@ -173,6 +173,8 @@ std::unique_ptr> from_cagra( /**@}*/ +// TODO: Filtered Search APIs: https://github.com/rapidsai/cuvs/issues/363 + /** * @defgroup hnsw_cpp_index_search Search hnswlib index * @{ @@ -260,7 +262,7 @@ void search(raft::resources const& res, void search(raft::resources const& res, const search_params& params, const index& idx, - raft::host_matrix_view queries, + raft::host_matrix_view queries, raft::host_matrix_view neighbors, raft::host_matrix_view distances); @@ -303,7 +305,7 @@ void search(raft::resources const& res, void search(raft::resources const& res, const search_params& params, const index& idx, - raft::host_matrix_view queries, + raft::host_matrix_view queries, raft::host_matrix_view neighbors, raft::host_matrix_view distances); diff --git 
a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp index 67d1b46c0..7f852d635 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp @@ -1168,7 +1168,7 @@ void extend(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_flat::search_params& params, - cuvs::neighbors::ivf_flat::index& index, + const cuvs::neighbors::ivf_flat::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, @@ -1209,7 +1209,7 @@ void search(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_flat::search_params& params, - cuvs::neighbors::ivf_flat::index& index, + const cuvs::neighbors::ivf_flat::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, @@ -1250,7 +1250,7 @@ void search(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_flat::search_params& params, - cuvs::neighbors::ivf_flat::index& index, + const cuvs::neighbors::ivf_flat::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index 3ce5f382f..ae543c9e9 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -1221,6 +1221,75 @@ void extend(raft::resources const& handle, std::optional> new_indices, cuvs::neighbors::ivf_pq::index* idx); +/** + * @brief Extend the index with the new data. + * + * Note, the user can set a stream pool in the input raft::resource with + * at least one stream to enable kernel and copy overlapping. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_pq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_pq::build(handle, index_params, dataset); + * // optional: create a stream pool with at least one stream to enable kernel and copy + * // overlapping + * raft::resource::set_cuda_stream_pool(handle, std::make_shared(1)); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_pq::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a host vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx + */ +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_pq::index& idx) + -> cuvs::neighbors::ivf_pq::index; + +/** + * @brief Extend the index with the new data. + * + * Note, the user can set a stream pool in the input raft::resource with + * at least one stream to enable kernel and copy overlapping. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_pq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_pq::build(handle, index_params, dataset); + * // optional: create a stream pool with at least one stream to enable kernel and copy + * // overlapping + * raft::resource::set_cuda_stream_pool(handle, std::make_shared(1)); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_pq::extend(handle, new_vectors, no_op, &index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a host vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx + */ +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_pq::index* idx); + /** * @brief Extend the index with the new data. 
* @@ -1405,7 +1474,7 @@ void extend(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_pq::search_params& search_params, - cuvs::neighbors::ivf_pq::index& index, + const cuvs::neighbors::ivf_pq::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, @@ -1450,7 +1519,7 @@ void search(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_pq::search_params& search_params, - cuvs::neighbors::ivf_pq::index& index, + const cuvs::neighbors::ivf_pq::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, @@ -1495,7 +1564,7 @@ void search(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_pq::search_params& search_params, - cuvs::neighbors::ivf_pq::index& index, + const cuvs::neighbors::ivf_pq::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, @@ -1540,7 +1609,7 @@ void search(raft::resources const& handle, */ void search(raft::resources const& handle, const cuvs::neighbors::ivf_pq::search_params& search_params, - cuvs::neighbors::ivf_pq::index& index, + const cuvs::neighbors::ivf_pq::index& index, raft::device_matrix_view queries, raft::device_matrix_view neighbors, raft::device_matrix_view distances, diff --git a/cpp/include/cuvs/neighbors/mg.hpp b/cpp/include/cuvs/neighbors/mg.hpp new file mode 100644 index 000000000..4657fa8fb --- /dev/null +++ b/cpp/include/cuvs/neighbors/mg.hpp @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef CUVS_BUILD_MG_ALGOS + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#define DEFAULT_SEARCH_BATCH_SIZE 1 << 20 + +/// \defgroup mg_cpp_index_params ANN MG index build parameters + +namespace cuvs::neighbors::mg { +/** Distribution mode */ +/// \ingroup mg_cpp_index_params +enum distribution_mode { + /** Index is replicated on each device, favors throughput */ + REPLICATED, + /** Index is split on several devices, favors scaling */ + SHARDED +}; + +/// \defgroup mg_cpp_search_params ANN MG search parameters + +/** Search mode when using a replicated index */ +/// \ingroup mg_cpp_search_params +enum replicated_search_mode { + /** Search queries are splited to maintain equal load on GPUs */ + LOAD_BALANCER, + /** Each search query is processed by a single GPU in a round-robin fashion */ + ROUND_ROBIN +}; + +/** Merge mode when using a sharded index */ +/// \ingroup mg_cpp_search_params +enum sharded_merge_mode { + /** Search batches are merged on the root rank */ + MERGE_ON_ROOT_RANK, + /** Search batches are merged in a tree reduction fashion */ + TREE_MERGE +}; + +/** Build parameters */ +/// \ingroup mg_cpp_index_params +template +struct index_params : public Upstream { + index_params() : mode(SHARDED) {} + + index_params(const Upstream& sp) : Upstream(sp), mode(SHARDED) {} + + /** Distribution mode */ + cuvs::neighbors::mg::distribution_mode mode = SHARDED; +}; + +/** Search parameters */ +/// \ingroup mg_cpp_search_params +template +struct search_params : public Upstream { + 
search_params() : search_mode(LOAD_BALANCER), merge_mode(TREE_MERGE) {} + + search_params(const Upstream& sp) + : Upstream(sp), search_mode(LOAD_BALANCER), merge_mode(TREE_MERGE) + { + } + + /** Replicated search mode */ + cuvs::neighbors::mg::replicated_search_mode search_mode = LOAD_BALANCER; + /** Sharded merge mode */ + cuvs::neighbors::mg::sharded_merge_mode merge_mode = TREE_MERGE; +}; + +} // namespace cuvs::neighbors::mg + +namespace cuvs::neighbors::mg { + +using namespace raft; + +template +struct index { + index(distribution_mode mode, int num_ranks_); + index(const raft::device_resources& handle, const std::string& filename); + + index(const index&) = delete; + index(index&&) = default; + auto operator=(const index&) -> index& = delete; + auto operator=(index&&) -> index& = default; + + distribution_mode mode_; + int num_ranks_; + std::vector> ann_interfaces_; + + // for load balancing mechanism + std::shared_ptr> round_robin_counter_; +}; + +/// \defgroup mg_cpp_index_build ANN MG index build + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-Flat MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, float, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * 
@param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-Flat MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, int8_t, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-Flat MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, uint8_t, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-PQ MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, float, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params 
configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-PQ MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, half, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-PQ MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, int8_t, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed IVF-PQ MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, uint8_t, int64_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * 
@param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed CAGRA MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, float, uint32_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed CAGRA MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, half, uint32_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major matrix on host [n_rows, dim] + * + * @return the constructed CAGRA MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, int8_t, uint32_t>; + +/// \ingroup mg_cpp_index_build +/** + * @brief Builds a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] index_dataset a row-major 
matrix on host [n_rows, dim] + * + * @return the constructed CAGRA MG index + */ +auto build(const raft::device_resources& handle, + const mg::index_params& index_params, + raft::host_matrix_view index_dataset) + -> index, uint8_t, uint32_t>; + +/// \defgroup mg_cpp_index_extend ANN MG index extend + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, float, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, int8_t, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * 
raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, uint8_t, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, float, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional 
vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, half, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, int8_t, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, uint8_t, int64_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * 
cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, float, uint32_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, half, uint32_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * 
`std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, int8_t, uint32_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \ingroup mg_cpp_index_extend +/** + * @brief Extends a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::extend(handle, index, new_vectors, std::nullopt); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] new_vectors a row-major matrix on host [n_rows, dim] + * @param[in] new_indices optional vector on host [n_rows], + * `std::nullopt` means default continuous range `[0...n_rows)` + * + */ +void extend(const raft::device_resources& handle, + index, uint8_t, uint32_t>& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices); + +/// \defgroup mg_cpp_index_search ANN MG index search + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, float, 
int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, int8_t, int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] 
+ * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, uint8_t, int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, float, int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * 
cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, half, int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, int8_t, int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * 
@brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, uint8_t, int64_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& 
handle, + const index, float, uint32_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, half, uint32_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a 
row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, int8_t, uint32_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \ingroup mg_cpp_index_search +/** + * @brief Searches a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * cuvs::neighbors::mg::search_params search_params; + * cuvs::neighbors::mg::search(handle, index, search_params, queries, neighbors, + * distances); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] search_params configure the index search + * @param[in] queries a row-major matrix on host [n_rows, dim] + * @param[out] neighbors a row-major matrix on host [n_rows, n_neighbors] + * @param[out] distances a row-major matrix on host [n_rows, n_neighbors] + * @param[in] n_rows_per_batch (optional) search batch size + * + */ +void search(const raft::device_resources& handle, + const index, uint8_t, uint32_t>& index, + const mg::search_params& search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch = DEFAULT_SEARCH_BATCH_SIZE); + +/// \defgroup mg_cpp_serialize ANN MG index serialization + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = 
cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, float, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, int8_t, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, uint8_t, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = 
cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, float, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, half, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, int8_t, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = 
cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, uint8_t, int64_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, float, uint32_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, half, uint32_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = 
cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, int8_t, uint32_t>& index, + const std::string& filename); + +/// \ingroup mg_cpp_serialize +/** + * @brief Serializes a multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * @endcode + * + * @param[in] handle + * @param[in] index the pre-built index + * @param[in] filename path to the file to be serialized + * + */ +void serialize(const raft::device_resources& handle, + const index, uint8_t, uint32_t>& index, + const std::string& filename); + +/// \defgroup mg_cpp_deserialize ANN MG index deserialization + +/// \ingroup mg_cpp_deserialize +/** + * @brief Deserializes an IVF-Flat multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * auto new_index = cuvs::neighbors::mg::deserialize_flat(handle, filename); + * + * @endcode + * + * @param[in] handle + * @param[in] filename path to the file to be deserialized + * + */ +template +auto deserialize_flat(const raft::device_resources& handle, const std::string& filename) + -> index, T, IdxT>; + +/// \ingroup mg_cpp_deserialize +/** + * @brief Deserializes an IVF-PQ multi-GPU index + * + * Usage 
example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * auto new_index = cuvs::neighbors::mg::deserialize_pq(handle, filename); + * @endcode + * + * @param[in] handle + * @param[in] filename path to the file to be deserialized + * + */ +template +auto deserialize_pq(const raft::device_resources& handle, const std::string& filename) + -> index, T, IdxT>; + +/// \ingroup mg_cpp_deserialize +/** + * @brief Deserializes a CAGRA multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::mg::index_params index_params; + * auto index = cuvs::neighbors::mg::build(handle, index_params, index_dataset); + * const std::string filename = "mg_index.cuvs"; + * cuvs::neighbors::mg::serialize(handle, index, filename); + * auto new_index = cuvs::neighbors::mg::deserialize_cagra(handle, filename); + * + * @endcode + * + * @param[in] handle + * @param[in] filename path to the file to be deserialized + * + */ +template +auto deserialize_cagra(const raft::device_resources& handle, const std::string& filename) + -> index, T, IdxT>; + +/// \defgroup mg_cpp_distribute ANN MG local index distribution + +/// \ingroup mg_cpp_distribute +/** + * @brief Replicates a locally built and serialized IVF-Flat index to all GPUs to form a distributed + * multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::ivf_flat::index_params index_params; + * auto index = cuvs::neighbors::ivf_flat::build(handle, index_params, index_dataset); + * const std::string filename = "local_index.cuvs"; + * cuvs::neighbors::ivf_flat::serialize(handle, filename, index); + * auto new_index = cuvs::neighbors::mg::distribute_flat(handle, filename); + * + * @endcode + * + * @param[in] handle + * 
@param[in] filename path to the file to be deserialized : a local index + * + */ +template +auto distribute_flat(const raft::device_resources& handle, const std::string& filename) + -> index, T, IdxT>; + +/// \ingroup mg_cpp_distribute +/** + * @brief Replicates a locally built and serialized IVF-PQ index to all GPUs to form a distributed + * multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::ivf_pq::index_params index_params; + * auto index = cuvs::neighbors::ivf_pq::build(handle, index_params, index_dataset); + * const std::string filename = "local_index.cuvs"; + * cuvs::neighbors::ivf_pq::serialize(handle, filename, index); + * auto new_index = cuvs::neighbors::mg::distribute_pq(handle, filename); + * @endcode + * + * @param[in] handle + * @param[in] filename path to the file to be deserialized : a local index + * + */ +template +auto distribute_pq(const raft::device_resources& handle, const std::string& filename) + -> index, T, IdxT>; + +/// \ingroup mg_cpp_distribute +/** + * @brief Replicates a locally built and serialized CAGRA index to all GPUs to form a distributed + * multi-GPU index + * + * Usage example: + * @code{.cpp} + * raft::handle_t handle; + * cuvs::neighbors::cagra::index_params index_params; + * auto index = cuvs::neighbors::cagra::build(handle, index_params, index_dataset); + * const std::string filename = "local_index.cuvs"; + * cuvs::neighbors::cagra::serialize(handle, filename, index); + * auto new_index = cuvs::neighbors::mg::distribute_cagra(handle, filename); + * + * @endcode + * + * @param[in] handle + * @param[in] filename path to the file to be deserialized : a local index + * + */ +template +auto distribute_cagra(const raft::device_resources& handle, const std::string& filename) + -> index, T, IdxT>; + +} // namespace cuvs::neighbors::mg + +#else + +static_assert(false, + "FORBIDEN_MG_ALGORITHM_IMPORT\n\n" + "Please recompile the cuVS library with MG algorithms 
BUILD_MG_ALGOS=ON.\n"); + +#endif diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp index 164448f2c..6985ff094 100644 --- a/cpp/src/neighbors/cagra_c.cpp +++ b/cpp/src/neighbors/cagra_c.cpp @@ -176,6 +176,14 @@ extern "C" cuvsError_t cuvsCagraIndexDestroy(cuvsCagraIndex_t index_c_ptr) }); } +extern "C" cuvsError_t cuvsCagraIndexGetDims(cuvsCagraIndex_t index, int* dim) +{ + return cuvs::core::translate_exceptions([=] { + auto index_ptr = reinterpret_cast*>(index->addr); + *dim = index_ptr->dim(); + }); +} + extern "C" cuvsError_t cuvsCagraBuild(cuvsResources_t res, cuvsCagraIndexParams_t params, DLManagedTensor* dataset_tensor, diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh index 9694a3e7a..b03b8214b 100644 --- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh +++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh @@ -29,9 +29,10 @@ #include +namespace cuvs::neighbors::cagra { + static const std::string RAFT_NAME = "raft"; -namespace cuvs::neighbors::cagra { template void add_node_core( raft::resources const& handle, diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh index f86ed9ef6..a077c098f 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh @@ -32,9 +32,10 @@ #include #include -static const std::string RAFT_NAME = "raft"; namespace cuvs::neighbors::cagra::detail { +static const std::string RAFT_NAME = "raft"; + constexpr int serialization_version = 4; /** @@ -119,9 +120,9 @@ void serialize_to_hnswlib(raft::resources const& res, os.write(reinterpret_cast(&curr_element_count), sizeof(std::size_t)); // Example:M: 16, dim = 128, data_t = float, index_t = uint32_t, list_size_type = uint32_t, // labeltype: size_t size_data_per_element_ = M * 2 * sizeof(index_t) + sizeof(list_size_type) + - // dim * 4 + sizeof(labeltype) - auto size_data_per_element = - 
static_cast(index_.graph_degree() * sizeof(IdxT) + 4 + index_.dim() * 4 + 8); + // dim * sizeof(T) + sizeof(labeltype) + auto size_data_per_element = static_cast(index_.graph_degree() * sizeof(IdxT) + 4 + + index_.dim() * sizeof(T) + 8); os.write(reinterpret_cast(&size_data_per_element), sizeof(std::size_t)); // label_offset std::size_t label_offset = size_data_per_element - 8; @@ -184,18 +185,9 @@ void serialize_to_hnswlib(raft::resources const& res, } auto data_row = host_dataset.data_handle() + (index_.dim() * i); - if constexpr (std::is_same_v) { - for (std::size_t j = 0; j < index_.dim(); ++j) { - auto data_elem = static_cast(host_dataset(i, j)); - os.write(reinterpret_cast(&data_elem), sizeof(float)); - } - } else if constexpr (std::is_same_v or std::is_same_v) { - for (std::size_t j = 0; j < index_.dim(); ++j) { - auto data_elem = static_cast(host_dataset(i, j)); - os.write(reinterpret_cast(&data_elem), sizeof(int)); - } - } else { - RAFT_FAIL("Unsupported dataset type while saving CAGRA dataset to HNSWlib format"); + for (std::size_t j = 0; j < index_.dim(); ++j) { + auto data_elem = static_cast(host_dataset(i, j)); + os.write(reinterpret_cast(&data_elem), sizeof(T)); } os.write(reinterpret_cast(&i), sizeof(std::size_t)); diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 0d1ae4ec9..ce1e03264 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -110,9 +110,9 @@ std::unique_ptr> from_cagra(raft::resources const& res, return std::unique_ptr>(hnsw_index); } -template -void get_search_knn_results(hnswlib::HierarchicalNSW const* idx, - const QueriesT* query, +template +void get_search_knn_results(hnswlib::HierarchicalNSW::type> const* idx, + const T* query, int k, uint64_t* indices, float* distances) @@ -127,11 +127,11 @@ void get_search_knn_results(hnswlib::HierarchicalNSW const* idx, } } -template +template void search(raft::resources const& res, const search_params& params, const 
index& idx, - raft::host_matrix_view queries, + raft::host_matrix_view queries, raft::host_matrix_view neighbors, raft::host_matrix_view distances) { @@ -146,7 +146,8 @@ void search(raft::resources const& res, idx.set_ef(params.ef); auto const* hnswlib_index = - reinterpret_cast const*>(idx.get_index()); + reinterpret_cast::type> const*>( + idx.get_index()); // when num_threads == 0, automatically maximize parallelism if (params.num_threads) { diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp index 36cbb16c9..e6f3fbcc7 100644 --- a/cpp/src/neighbors/hnsw.cpp +++ b/cpp/src/neighbors/hnsw.cpp @@ -34,20 +34,20 @@ CUVS_INST_HNSW_FROM_CAGRA(int8_t); #undef CUVS_INST_HNSW_FROM_CAGRA -#define CUVS_INST_HNSW_SEARCH(T, QueriesT) \ - void search(raft::resources const& res, \ - const search_params& params, \ - const index& idx, \ - raft::host_matrix_view queries, \ - raft::host_matrix_view neighbors, \ - raft::host_matrix_view distances) \ - { \ - detail::search(res, params, idx, queries, neighbors, distances); \ +#define CUVS_INST_HNSW_SEARCH(T) \ + void search(raft::resources const& res, \ + const search_params& params, \ + const index& idx, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances) \ + { \ + detail::search(res, params, idx, queries, neighbors, distances); \ } -CUVS_INST_HNSW_SEARCH(float, float); -CUVS_INST_HNSW_SEARCH(uint8_t, int); -CUVS_INST_HNSW_SEARCH(int8_t, int); +CUVS_INST_HNSW_SEARCH(float); +CUVS_INST_HNSW_SEARCH(uint8_t); +CUVS_INST_HNSW_SEARCH(int8_t); #undef CUVS_INST_HNSW_SEARCH diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp index ab5268a6d..a19875641 100644 --- a/cpp/src/neighbors/hnsw_c.cpp +++ b/cpp/src/neighbors/hnsw_c.cpp @@ -31,7 +31,7 @@ #include namespace { -template +template void _search(cuvsResources_t res, cuvsHnswSearchParams params, cuvsHnswIndex index, @@ -46,7 +46,7 @@ void _search(cuvsResources_t res, search_params.ef = 
params.ef; search_params.num_threads = params.numThreads; - using queries_mdspan_type = raft::host_matrix_view; + using queries_mdspan_type = raft::host_matrix_view; using neighbors_mdspan_type = raft::host_matrix_view; using distances_mdspan_type = raft::host_matrix_view; auto queries_mds = cuvs::core::from_dlpack(queries_tensor); @@ -127,16 +127,13 @@ extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, auto index = *index_c_ptr; RAFT_EXPECTS(queries.dtype.code == index.dtype.code, "type mismatch between index and queries"); - RAFT_EXPECTS(queries.dtype.bits == 32, "number of bits in queries dtype should be 32"); if (index.dtype.code == kDLFloat) { - _search( - res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + _search(res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); } else if (index.dtype.code == kDLUInt) { - _search( - res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + _search(res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); } else if (index.dtype.code == kDLInt) { - _search(res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); + _search(res, *params, index, queries_tensor, neighbors_tensor, distances_tensor); } else { RAFT_FAIL("Unsupported index dtype: %d and bits: %d", queries.dtype.code, queries.dtype.bits); } @@ -152,13 +149,10 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, return cuvs::core::translate_exceptions([=] { if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); - index->dtype.code = kDLFloat; } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) { index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); - index->dtype.code = kDLInt; } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) { index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); - 
index->dtype.code = kDLUInt; } else { RAFT_FAIL("Unsupported dtype in file %s", filename); } diff --git a/cpp/src/neighbors/iface/generate_iface.py b/cpp/src/neighbors/iface/generate_iface.py new file mode 100644 index 000000000..794219bbf --- /dev/null +++ b/cpp/src/neighbors/iface/generate_iface.py @@ -0,0 +1,273 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +header = """/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +""" + +include_macro = """ +#include "iface.hpp" +""" + +namespace_macro = """ +namespace cuvs::neighbors { +""" + +footer = """ +} // namespace cuvs::neighbors +""" + +flat_macro = """ +#define CUVS_INST_MG_FLAT(T, IdxT) \\ + using T_ha = raft::host_device_accessor, raft::memory_type::device>; \\ + using T_da= raft::host_device_accessor, raft::memory_type::host>; \\ + using IdxT_ha = raft::host_device_accessor, raft::memory_type::device>; \\ + using IdxT_da = raft::host_device_accessor, raft::memory_type::host>; \\ + \\ + template void build(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::index_params* index_params, \\ + raft::mdspan, row_major, T_ha> index_dataset); \\ + \\ + template void build(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::index_params* index_params, \\ + raft::mdspan, row_major, T_da> index_dataset); \\ + \\ + template void extend(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + raft::mdspan, row_major, T_ha> new_vectors, \\ + std::optional, layout_c_contiguous, IdxT_ha>> new_indices); \\ + \\ + template void extend(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + raft::mdspan, row_major, T_da> new_vectors, \\ + std::optional, layout_c_contiguous, IdxT_da>> new_indices); \\ + \\ + template void search(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::search_params* search_params, \\ + raft::device_matrix_view queries, \\ + raft::device_matrix_view neighbors, \\ + raft::device_matrix_view distances); \\ + \\ + template void search(const raft::device_resources& handle, \\ + const 
cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::search_params* search_params, \\ + raft::host_matrix_view h_queries, \\ + raft::device_matrix_view d_neighbors, \\ + raft::device_matrix_view d_distances); \\ + \\ + template void serialize(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + std::ostream& os); \\ + \\ + template void deserialize(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + std::istream& is); \\ + \\ + template void deserialize(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const std::string& filename); +""" + +pq_macro = """ +#define CUVS_INST_MG_PQ(T, IdxT) \\ + using T_ha = raft::host_device_accessor, raft::memory_type::device>; \\ + using T_da= raft::host_device_accessor, raft::memory_type::host>; \\ + using IdxT_ha = raft::host_device_accessor, raft::memory_type::device>; \\ + using IdxT_da = raft::host_device_accessor, raft::memory_type::host>; \\ + \\ + template void build(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::index_params* index_params, \\ + raft::mdspan, row_major, T_ha> index_dataset); \\ + \\ + template void build(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::index_params* index_params, \\ + raft::mdspan, row_major, T_da> index_dataset); \\ + \\ + template void extend(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + raft::mdspan, row_major, T_ha> new_vectors, \\ + std::optional, layout_c_contiguous, IdxT_ha>> new_indices); \\ + \\ + template void extend(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + raft::mdspan, row_major, T_da> new_vectors, \\ + std::optional, layout_c_contiguous, IdxT_da>> new_indices); \\ + \\ + template void search(const 
raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::search_params* search_params, \\ + raft::device_matrix_view queries, \\ + raft::device_matrix_view neighbors, \\ + raft::device_matrix_view distances); \\ + \\ + template void search(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::search_params* search_params, \\ + raft::host_matrix_view h_queries, \\ + raft::device_matrix_view d_neighbors, \\ + raft::device_matrix_view d_distances); \\ + \\ + template void serialize(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + std::ostream& os); \\ + \\ + template void deserialize(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + std::istream& is); \\ + \\ + template void deserialize(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const std::string& filename); +""" + +cagra_macro = """ +#define CUVS_INST_MG_CAGRA(T, IdxT) \\ + using T_ha = raft::host_device_accessor, raft::memory_type::device>; \\ + using T_da= raft::host_device_accessor, raft::memory_type::host>; \\ + using IdxT_ha = raft::host_device_accessor, raft::memory_type::device>; \\ + using IdxT_da = raft::host_device_accessor, raft::memory_type::host>; \\ + \\ + template void build(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::index_params* index_params, \\ + raft::mdspan, row_major, T_ha> index_dataset); \\ + \\ + template void build(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::index_params* index_params, \\ + raft::mdspan, row_major, T_da> index_dataset); \\ + \\ + template void extend(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + raft::mdspan, row_major, 
T_ha> new_vectors, \\ + std::optional, layout_c_contiguous, IdxT_ha>> new_indices); \\ + \\ + template void extend(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + raft::mdspan, row_major, T_da> new_vectors, \\ + std::optional, layout_c_contiguous, IdxT_da>> new_indices); \\ + \\ + template void search(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::search_params* search_params, \\ + raft::device_matrix_view queries, \\ + raft::device_matrix_view neighbors, \\ + raft::device_matrix_view distances); \\ + \\ + template void search(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + const cuvs::neighbors::search_params* search_params, \\ + raft::host_matrix_view h_queries, \\ + raft::device_matrix_view d_neighbors, \\ + raft::device_matrix_view d_distances); \\ + \\ + template void serialize(const raft::device_resources& handle, \\ + const cuvs::neighbors::iface, T, IdxT>& interface, \\ + std::ostream& os); \\ + \\ + template void deserialize(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + std::istream& is); \\ + \\ + template void deserialize(const raft::device_resources& handle, \\ + cuvs::neighbors::iface, T, IdxT>& interface, \\ + const std::string& filename); +""" + +flat_macros = dict ( + flat = dict( + include=include_macro, + definition=flat_macro, + name="CUVS_INST_MG_FLAT", + ) +) + +pq_macros = dict ( + pq = dict( + include=include_macro, + definition=pq_macro, + name="CUVS_INST_MG_PQ", + ) +) + +cagra_macros = dict ( + cagra = dict( + include=include_macro, + definition=cagra_macro, + name="CUVS_INST_MG_CAGRA", + ) +) + +flat_types = dict( + float_int64_t=("float", "int64_t"), + int8_t_int64_t=("int8_t", "int64_t"), + uint8_t_int64_t=("uint8_t", "int64_t"), +) + +pq_types = dict( + float_int64_t=("float", "int64_t"), + half_int64_t=("half", 
"int64_t"), + int8_t_int64_t=("int8_t", "int64_t"), + uint8_t_int64_t=("uint8_t", "int64_t"), +) + +cagra_types = dict( + float_uint32_t=("float", "uint32_t"), + half_uint32_t=("half", "uint32_t"), + int8_t_uint32_t=("int8_t", "uint32_t"), + uint8_t_uint32_t=("uint8_t", "uint32_t"), +) + +for macros, types in [(flat_macros, flat_types), (pq_macros, pq_types), (cagra_macros, cagra_types)]: + for type_path, (T, IdxT) in types.items(): + for macro_path, macro in macros.items(): + path = f"iface_{macro_path}_{type_path}.cu" + with open(path, "w") as f: + f.write(header) + f.write(macro['include']) + f.write(namespace_macro) + f.write(macro["definition"]) + f.write(f"{macro['name']}({T}, {IdxT});\n\n") + f.write(f"#undef {macro['name']}\n") + f.write(footer) + + print(f"src/neighbors/iface/{path}") diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp new file mode 100644 index 000000000..a329db429 --- /dev/null +++ b/cpp/src/neighbors/iface/iface.hpp @@ -0,0 +1,198 @@ +#include + +#include +#include +#include +#include +#include + +namespace cuvs::neighbors { + +using namespace raft; + +template +void build(const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + const cuvs::neighbors::index_params* index_params, + raft::mdspan, row_major, Accessor> index_dataset) +{ + interface.mutex_->lock(); + + if constexpr (std::is_same>::value) { + auto idx = cuvs::neighbors::ivf_flat::build( + handle, *static_cast(index_params), index_dataset); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + auto idx = cuvs::neighbors::ivf_pq::build( + handle, *static_cast(index_params), index_dataset); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + auto idx = cuvs::neighbors::cagra::build( + handle, *static_cast(index_params), index_dataset); + interface.index_.emplace(std::move(idx)); + } + resource::sync_stream(handle); + + interface.mutex_->unlock(); +} 
+ +template +void extend( + const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + raft::mdspan, row_major, Accessor1> new_vectors, + std::optional, layout_c_contiguous, Accessor2>> + new_indices) +{ + interface.mutex_->lock(); + + if constexpr (std::is_same>::value) { + auto idx = + cuvs::neighbors::ivf_flat::extend(handle, new_vectors, new_indices, interface.index_.value()); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + auto idx = + cuvs::neighbors::ivf_pq::extend(handle, new_vectors, new_indices, interface.index_.value()); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + RAFT_FAIL("CAGRA does not implement the extend method"); + } + resource::sync_stream(handle); + + interface.mutex_->unlock(); +} + +template +void search(const raft::device_resources& handle, + const cuvs::neighbors::iface& interface, + const cuvs::neighbors::search_params* search_params, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) +{ + // interface.mutex_->lock(); + if constexpr (std::is_same>::value) { + cuvs::neighbors::ivf_flat::search( + handle, + *reinterpret_cast(search_params), + interface.index_.value(), + queries, + neighbors, + distances); + } else if constexpr (std::is_same>::value) { + cuvs::neighbors::ivf_pq::search(handle, + *reinterpret_cast(search_params), + interface.index_.value(), + queries, + neighbors, + distances); + } else if constexpr (std::is_same>::value) { + cuvs::neighbors::cagra::search(handle, + *reinterpret_cast(search_params), + interface.index_.value(), + queries, + neighbors, + distances); + } + resource::sync_stream(handle); + + // interface.mutex_->unlock(); +} + +// for MG ANN only +template +void search(const raft::device_resources& handle, + const cuvs::neighbors::iface& interface, + const cuvs::neighbors::search_params* search_params, + raft::host_matrix_view h_queries, + 
raft::device_matrix_view d_neighbors, + raft::device_matrix_view d_distances) +{ + // interface.mutex_->lock(); + + int64_t n_rows = h_queries.extent(0); + int64_t n_dims = h_queries.extent(1); + auto d_queries = raft::make_device_matrix(handle, n_rows, n_dims); + raft::copy(d_queries.data_handle(), + h_queries.data_handle(), + n_rows * n_dims, + resource::get_cuda_stream(handle)); + auto d_query_view = raft::make_const_mdspan(d_queries.view()); + + search(handle, interface, search_params, d_query_view, d_neighbors, d_distances); + + // interface.mutex_->unlock(); +} + +template +void serialize(const raft::device_resources& handle, + const cuvs::neighbors::iface& interface, + std::ostream& os) +{ + interface.mutex_->lock(); + + if constexpr (std::is_same>::value) { + ivf_flat::serialize(handle, os, interface.index_.value()); + } else if constexpr (std::is_same>::value) { + ivf_pq::serialize(handle, os, interface.index_.value()); + } else if constexpr (std::is_same>::value) { + cagra::serialize(handle, os, interface.index_.value(), true); + } + + interface.mutex_->unlock(); +} + +template +void deserialize(const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + std::istream& is) +{ + interface.mutex_->lock(); + + if constexpr (std::is_same>::value) { + ivf_flat::index idx(handle); + ivf_flat::deserialize(handle, is, &idx); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + ivf_pq::index idx(handle); + ivf_pq::deserialize(handle, is, &idx); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + cagra::index idx(handle); + cagra::deserialize(handle, is, &idx); + interface.index_.emplace(std::move(idx)); + } + + interface.mutex_->unlock(); +} + +template +void deserialize(const raft::device_resources& handle, + cuvs::neighbors::iface& interface, + const std::string& filename) +{ + interface.mutex_->lock(); + + std::ifstream is(filename, std::ios::in | std::ios::binary); 
+ if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + + if constexpr (std::is_same>::value) { + ivf_flat::index idx(handle); + ivf_flat::deserialize(handle, is, &idx); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + ivf_pq::index idx(handle); + ivf_pq::deserialize(handle, is, &idx); + interface.index_.emplace(std::move(idx)); + } else if constexpr (std::is_same>::value) { + cagra::index idx(handle); + cagra::deserialize(handle, is, &idx); + interface.index_.emplace(std::move(idx)); + } + + is.close(); + + interface.mutex_->unlock(); +} + +}; // namespace cuvs::neighbors \ No newline at end of file diff --git a/cpp/src/neighbors/iface/iface_cagra_float_uint32_t.cu b/cpp/src/neighbors/iface/iface_cagra_float_uint32_t.cu new file mode 100644 index 000000000..b5e329dd8 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_cagra_float_uint32_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_CAGRA(float, uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_cagra_half_uint32_t.cu b/cpp/src/neighbors/iface/iface_cagra_half_uint32_t.cu new file mode 100644 index 000000000..23fcffc59 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_cagra_half_uint32_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_CAGRA(half, uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_cagra_int8_t_uint32_t.cu b/cpp/src/neighbors/iface/iface_cagra_int8_t_uint32_t.cu new file mode 100644 index 000000000..30377ab66 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_cagra_int8_t_uint32_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_CAGRA(int8_t, uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_cagra_uint8_t_uint32_t.cu b/cpp/src/neighbors/iface/iface_cagra_uint8_t_uint32_t.cu new file mode 100644 index 000000000..59a1640e8 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_cagra_uint8_t_uint32_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_CAGRA(uint8_t, uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_flat_float_int64_t.cu b/cpp/src/neighbors/iface/iface_flat_float_int64_t.cu new file mode 100644 index 000000000..a0a455375 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_flat_float_int64_t.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_FLAT(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize( \ + const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_FLAT(float, int64_t); + +#undef CUVS_INST_MG_FLAT + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_flat_int8_t_int64_t.cu b/cpp/src/neighbors/iface/iface_flat_int8_t_int64_t.cu new file mode 100644 index 000000000..9fdd6464f --- /dev/null +++ b/cpp/src/neighbors/iface/iface_flat_int8_t_int64_t.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_FLAT(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize( \ + const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_FLAT(int8_t, int64_t); + +#undef CUVS_INST_MG_FLAT + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_flat_uint8_t_int64_t.cu b/cpp/src/neighbors/iface/iface_flat_uint8_t_int64_t.cu new file mode 100644 index 000000000..daee59c4a --- /dev/null +++ b/cpp/src/neighbors/iface/iface_flat_uint8_t_int64_t.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_FLAT(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize( \ + const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_FLAT(uint8_t, int64_t); + +#undef CUVS_INST_MG_FLAT + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_pq_float_int64_t.cu b/cpp/src/neighbors/iface/iface_pq_float_int64_t.cu new file mode 100644 index 000000000..7282d6bd0 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_pq_float_int64_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_PQ(float, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_pq_half_int64_t.cu b/cpp/src/neighbors/iface/iface_pq_half_int64_t.cu new file mode 100644 index 000000000..4d67f9aed --- /dev/null +++ b/cpp/src/neighbors/iface/iface_pq_half_int64_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_PQ(half, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_pq_int8_t_int64_t.cu b/cpp/src/neighbors/iface/iface_pq_int8_t_int64_t.cu new file mode 100644 index 000000000..46537b3f9 --- /dev/null +++ b/cpp/src/neighbors/iface/iface_pq_int8_t_int64_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_PQ(int8_t, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/iface/iface_pq_uint8_t_int64_t.cu b/cpp/src/neighbors/iface/iface_pq_uint8_t_int64_t.cu new file mode 100644 index 000000000..591ac881a --- /dev/null +++ b/cpp/src/neighbors/iface/iface_pq_uint8_t_int64_t.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_iface.py + * + * Make changes there and run in this directory: + * + * > python generate_iface.py + * + */ + +#include "iface.hpp" + +namespace cuvs::neighbors { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + using T_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using T_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + using IdxT_ha = raft::host_device_accessor, \ + raft::memory_type::device>; \ + using IdxT_da = raft::host_device_accessor, \ + raft::memory_type::host>; \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_ha> index_dataset); \ + \ + template void build( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::index_params* index_params, \ + raft::mdspan, row_major, T_da> index_dataset); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_ha> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_ha>> \ + new_indices); \ + \ + template void extend( \ + const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + raft::mdspan, row_major, T_da> new_vectors, \ + std::optional, layout_c_contiguous, IdxT_da>> \ + new_indices); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances); \ + \ + template void search(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + const cuvs::neighbors::search_params* search_params, \ + raft::host_matrix_view 
h_queries, \ + raft::device_matrix_view d_neighbors, \ + raft::device_matrix_view d_distances); \ + \ + template void serialize(const raft::device_resources& handle, \ + const cuvs::neighbors::iface, T, IdxT>& interface, \ + std::ostream& os); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + std::istream& is); \ + \ + template void deserialize(const raft::device_resources& handle, \ + cuvs::neighbors::iface, T, IdxT>& interface, \ + const std::string& filename); +CUVS_INST_MG_PQ(uint8_t, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors diff --git a/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py b/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py index 1fabcca8c..c435cc6d9 100644 --- a/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py +++ b/cpp/src/neighbors/ivf_flat/generate_ivf_flat.py @@ -144,7 +144,7 @@ void search( \\ raft::resources const& handle, \\ const cuvs::neighbors::ivf_flat::search_params& params, \\ - cuvs::neighbors::ivf_flat::index& index, \\ + const cuvs::neighbors::ivf_flat::index& index, \\ raft::device_matrix_view queries, \\ raft::device_matrix_view neighbors, \\ raft::device_matrix_view distances, \\ diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh index 9626b2ce5..f5a4267cd 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh @@ -1206,8 +1206,8 @@ void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... arg inner_prod_dist>( {}, raft::compose_op(raft::add_const_op{1.0f}, raft::mul_const_op{-1.0f}), - std::forward(args)...); - // NB: update the description of `knn::ivf_flat::build` when adding here a new metric. + std::forward(args)...); // NB: update the description of `knn::ivf_flat::build` when + // adding here a new metric. 
default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); } } diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu index 3f262d612..87abc0bc0 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search_float_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_flat { #define CUVS_INST_IVF_FLAT_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const cuvs::neighbors::ivf_flat::search_params& params, \ - cuvs::neighbors::ivf_flat::index& index, \ + const cuvs::neighbors::ivf_flat::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu index 4357afb0a..c1e92ae5b 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search_int8_t_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_flat { #define CUVS_INST_IVF_FLAT_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const cuvs::neighbors::ivf_flat::search_params& params, \ - cuvs::neighbors::ivf_flat::index& index, \ + const cuvs::neighbors::ivf_flat::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu index 8265a3e17..4ff8ed770 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search_uint8_t_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_flat { #define CUVS_INST_IVF_FLAT_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const 
cuvs::neighbors::ivf_flat::search_params& params, \ - cuvs::neighbors::ivf_flat::index& index, \ + const cuvs::neighbors::ivf_flat::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_pq/detail/generate_ivf_pq.py b/cpp/src/neighbors/ivf_pq/detail/generate_ivf_pq.py index a5a829967..a2ac048ff 100644 --- a/cpp/src/neighbors/ivf_pq/detail/generate_ivf_pq.py +++ b/cpp/src/neighbors/ivf_pq/detail/generate_ivf_pq.py @@ -68,7 +68,7 @@ #define CUVS_INST_IVF_PQ_SEARCH(T, IdxT) \\ void search(raft::resources const& handle, \\ const cuvs::neighbors::ivf_pq::search_params& params, \\ - cuvs::neighbors::ivf_pq::index& index, \\ + const cuvs::neighbors::ivf_pq::index& index, \\ raft::device_matrix_view queries, \\ raft::device_matrix_view neighbors, \\ raft::device_matrix_view distances, \\ diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu index 07ee110bc..44e9777ba 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_float_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_pq { #define CUVS_INST_IVF_PQ_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const cuvs::neighbors::ivf_pq::search_params& params, \ - cuvs::neighbors::ivf_pq::index& index, \ + const cuvs::neighbors::ivf_pq::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_half_int64_t.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_half_int64_t.cu index cf387cb67..d7446e846 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_half_int64_t.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_half_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_pq { #define 
CUVS_INST_IVF_PQ_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const cuvs::neighbors::ivf_pq::search_params& params, \ - cuvs::neighbors::ivf_pq::index& index, \ + const cuvs::neighbors::ivf_pq::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_int8_t_int64_t.cu index 5ec9093df..c1ffede97 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_int8_t_int64_t.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_int8_t_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_pq { #define CUVS_INST_IVF_PQ_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const cuvs::neighbors::ivf_pq::search_params& params, \ - cuvs::neighbors::ivf_pq::index& index, \ + const cuvs::neighbors::ivf_pq::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_uint8_t_int64_t.cu index d2e2f3b00..08e4f0536 100644 --- a/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_uint8_t_int64_t.cu +++ b/cpp/src/neighbors/ivf_pq/detail/ivf_pq_search_uint8_t_int64_t.cu @@ -32,7 +32,7 @@ namespace cuvs::neighbors::ivf_pq { #define CUVS_INST_IVF_PQ_SEARCH(T, IdxT) \ void search(raft::resources const& handle, \ const cuvs::neighbors::ivf_pq::search_params& params, \ - cuvs::neighbors::ivf_pq::index& index, \ + const cuvs::neighbors::ivf_pq::index& index, \ raft::device_matrix_view queries, \ raft::device_matrix_view neighbors, \ raft::device_matrix_view distances, \ diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index c65ea8108..4c9867126 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ 
b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -41,6 +42,8 @@ #include #include #include +#include +#include #include #include #include @@ -1466,6 +1469,13 @@ void extend(raft::resources const& handle, std::is_same_v, "Unsupported data type"); + if (index->metric() == distance::DistanceType::CosineExpanded) { + if constexpr (std::is_same_v || std::is_same_v) + RAFT_FAIL( + "CosineExpanded distance metric is currently not supported for uint8_t and int8_t data " + "type"); + } + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); rmm::device_async_resource_ref large_memory = raft::resource::get_large_workspace_resource(handle); @@ -1632,6 +1642,14 @@ void extend(raft::resources const& handle, vec_batches.prefetch_next_batch(); for (const auto& vec_batch : vec_batches) { const auto& idx_batch = *idx_batches++; + if (index->metric() == CosineExpanded) { + auto vec_batch_view = raft::make_device_matrix_view( + const_cast(vec_batch.data()), vec_batch.size(), index->dim()); + raft::linalg::row_normalize(handle, + raft::make_const_mdspan(vec_batch_view), + vec_batch_view, + raft::linalg::NormType::L2Norm); + } process_and_fill_codes(handle, *index, vec_batch.data(), @@ -1683,6 +1701,13 @@ auto build(raft::resources const& handle, << (int)params.pq_dim << std::endl; RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); RAFT_EXPECTS(n_rows >= params.n_lists, "number of rows can't be less than n_lists"); + if (params.metric == distance::DistanceType::CosineExpanded) { + // TODO: support int8_t and uint8_t types (https://github.com/rapidsai/cuvs/issues/389) + if constexpr (std::is_same_v || std::is_same_v) + RAFT_FAIL( + "CosineExpanded distance metric is currently not supported for uint8_t and int8_t data " + "type"); + } auto stream = raft::resource::get_cuda_stream(handle); @@ -1755,6 +1780,11 @@ auto build(raft::resources const& handle, 
cuvs::cluster::kmeans::balanced_params kmeans_params; kmeans_params.n_iters = params.kmeans_n_iters; kmeans_params.metric = static_cast((int)index.metric()); + + if (index.metric() == distance::DistanceType::CosineExpanded) { + raft::linalg::row_normalize( + handle, trainset_const_view, trainset.view(), raft::linalg::NormType::L2Norm); + } cuvs::cluster::kmeans_balanced::fit( handle, kmeans_params, trainset_const_view, centers_view, utils::mapping{}); @@ -1762,6 +1792,10 @@ auto build(raft::resources const& handle, rmm::device_uvector labels(n_rows_train, stream, big_memory_resource); auto centers_const_view = raft::make_device_matrix_view( cluster_centers, index.n_lists(), index.dim()); + if (index.metric() == distance::DistanceType::CosineExpanded) { + raft::linalg::row_normalize( + handle, centers_const_view, centers_view, raft::linalg::NormType::L2Norm); + } auto labels_view = raft::make_device_vector_view(labels.data(), n_rows_train); cuvs::cluster::kmeans_balanced::predict(handle, diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh index 8404ca1f9..fbbdd06c2 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_compute_similarity_impl.cuh @@ -369,6 +369,7 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim, reinterpret_cast(lut_end)[i] = query[i] - cluster_center[i]; } } break; + case distance::DistanceType::CosineExpanded: case distance::DistanceType::InnerProduct: { float2 pvals; for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { @@ -408,6 +409,7 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim, diff -= pq_c; score += diff * diff; } break; + case distance::DistanceType::CosineExpanded: case distance::DistanceType::InnerProduct: { // NB: we negate the scores as we hardcoded select-topk to always compute the minimum float q; @@ -485,6 +487,7 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim, 
reinterpret_cast(pq_thread_data), lut_scores, early_stop_limit); + if (metric == distance::DistanceType::CosineExpanded) { score = OutT(1) + score; } } if constexpr (kManageLocalTopK) { block_topk.add(score, sample_offset + i); diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh index e185f18dc..db8f9fbd3 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh @@ -37,6 +37,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -104,12 +107,21 @@ void select_clusters(raft::resources const& handle, This is a negative inner-product distance. We minimize it to find the similar clusters. + NB: qc_distances is NOT used further in ivfpq_search. + + Cosine distance: + `qc_distances[i, j] = - (queries[i], cluster_centers[j])` + + This is a negative inner-product distance. The queries and cluster centers are row normalized. + We minimize it to find the similar clusters. + NB: qc_distances is NOT used further in ivfpq_search. 
*/ float norm_factor; switch (metric) { case cuvs::distance::DistanceType::L2SqrtExpanded: case cuvs::distance::DistanceType::L2Expanded: norm_factor = 1.0 / -2.0; break; + case cuvs::distance::DistanceType::CosineExpanded: case cuvs::distance::DistanceType::InnerProduct: norm_factor = 0.0; break; default: RAFT_FAIL("Unsupported distance type %d.", int(metric)); } @@ -133,6 +145,7 @@ void select_clusters(raft::resources const& handle, gemm_k = dim + 1; RAFT_EXPECTS(gemm_k <= dim_ext, "unexpected gemm_k or dim_ext"); } break; + case cuvs::distance::DistanceType::CosineExpanded: case cuvs::distance::DistanceType::InnerProduct: { alpha = -1.0; beta = 0.0; @@ -363,8 +376,9 @@ void ivfpq_search_worker(raft::resources const& handle, // stores basediff (query[i] - center[i]) precomp_data_count = index.rot_dim(); } break; + case distance::DistanceType::CosineExpanded: case distance::DistanceType::InnerProduct: { - // stores two components (query[i] * center[i], query[i] * center[i]) + // stores two components (query[i], query[i] * center[i]) precomp_data_count = index.rot_dim() * 2; } break; default: { @@ -457,8 +471,14 @@ void ivfpq_search_worker(raft::resources const& handle, num_samples_vector); // Postprocessing - ivf::detail::postprocess_distances( - distances, topk_dists.data(), index.metric(), n_queries, topK, scaling_factor, true, stream); + ivf::detail::postprocess_distances(distances, + topk_dists.data(), + index.metric(), + n_queries, + topK, + scaling_factor, + index.metric() != distance::DistanceType::CosineExpanded, + stream); ivf::detail::postprocess_neighbors(neighbors, neighbors_uint32, index.inds_ptrs().data_handle(), @@ -508,6 +528,7 @@ struct ivfpq_search { { bool signed_metric = false; switch (metric) { + case cuvs::distance::DistanceType::CosineExpanded: signed_metric = true; break; case cuvs::distance::DistanceType::InnerProduct: signed_metric = true; break; default: break; } @@ -606,6 +627,12 @@ inline void search(raft::resources const& handle, 
static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); + if (index.metric() == distance::DistanceType::CosineExpanded) { + if constexpr (std::is_same_v || std::is_same_v) + RAFT_FAIL( + "CosineExpanded distance metric is currently not supported for uint8_t and int8_t data " + "type"); + } raft::common::nvtx::range fun_scope( "ivf_pq::search(n_queries = %u, n_probes = %u, k = %u, dim = %zu)", n_queries, @@ -698,7 +725,14 @@ inline void search(raft::resources const& handle, rot_queries.data(), index.rot_dim(), stream); - + if (index.metric() == distance::DistanceType::CosineExpanded) { + auto rot_queries_view = raft::make_device_matrix_view( + rot_queries.data(), max_queries, index.rot_dim()); + raft::linalg::row_normalize(handle, + raft::make_const_mdspan(rot_queries_view), + rot_queries_view, + raft::linalg::NormType::L2Norm); + } for (uint32_t offset_b = 0; offset_b < queries_batch; offset_b += max_batch_size) { uint32_t batch_size = min(max_batch_size, queries_batch - offset_b); /* The distance calculation is done in the rotated/transformed space; diff --git a/cpp/src/neighbors/mg/generate_mg.py b/cpp/src/neighbors/mg/generate_mg.py new file mode 100644 index 000000000..af5e60545 --- /dev/null +++ b/cpp/src/neighbors/mg/generate_mg.py @@ -0,0 +1,286 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +header = """/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +""" + +include_macro = """ +#include "mg.cuh" +""" + +namespace_macro = """ +namespace cuvs::neighbors::mg { +""" + +footer = """ +} // namespace cuvs::neighbors::mg +""" + +flat_macro = """ +#define CUVS_INST_MG_FLAT(T, IdxT) \\ + index, T, IdxT> build(const raft::device_resources& handle, \\ + const mg::index_params& index_params, \\ + raft::host_matrix_view index_dataset) \\ + { \\ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \\ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \\ + cuvs::neighbors::mg::detail::build(handle, index, \\ + static_cast(&index_params), \\ + index_dataset); \\ + return index; \\ + } \\ + \\ + void extend(const raft::device_resources& handle, \\ + index, T, IdxT>& index, \\ + raft::host_matrix_view new_vectors, \\ + std::optional> new_indices) \\ + { \\ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \\ + } \\ + \\ + void search(const raft::device_resources& handle, \\ + const index, T, IdxT>& index, \\ + const mg::search_params& search_params, \\ + raft::host_matrix_view queries, \\ + raft::host_matrix_view neighbors, \\ + raft::host_matrix_view distances, \\ + int64_t n_rows_per_batch) \\ + { \\ + 
cuvs::neighbors::mg::detail::search(handle, index, \\ + static_cast(&search_params), \\ + queries, neighbors, distances, n_rows_per_batch); \\ + } \\ + \\ + void serialize(const raft::device_resources& handle, \\ + const index, T, IdxT>& index, \\ + const std::string& filename) \\ + { \\ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \\ + } \\ + \\ + template<> \\ + index, T, IdxT> deserialize_flat(const raft::device_resources& handle, \\ + const std::string& filename) \\ + { \\ + auto idx = index, T, IdxT>(handle, filename); \\ + return idx; \\ + } \\ + \\ + template<> \\ + index, T, IdxT> distribute_flat(const raft::device_resources& handle, \\ + const std::string& filename) \\ + { \\ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \\ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \\ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \\ + return idx; \\ + } +""" + +pq_macro = """ +#define CUVS_INST_MG_PQ(T, IdxT) \\ + index, T, IdxT> build(const raft::device_resources& handle, \\ + const mg::index_params& index_params, \\ + raft::host_matrix_view index_dataset) \\ + { \\ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \\ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \\ + cuvs::neighbors::mg::detail::build(handle, index, \\ + static_cast(&index_params), \\ + index_dataset); \\ + return index; \\ + } \\ + \\ + void extend(const raft::device_resources& handle, \\ + index, T, IdxT>& index, \\ + raft::host_matrix_view new_vectors, \\ + std::optional> new_indices) \\ + { \\ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \\ + } \\ + \\ + void search(const raft::device_resources& handle, \\ + const index, T, IdxT>& index, \\ + const mg::search_params& search_params, \\ + raft::host_matrix_view queries, \\ + raft::host_matrix_view neighbors, \\ + raft::host_matrix_view distances, \\ + 
int64_t n_rows_per_batch) \\ + { \\ + cuvs::neighbors::mg::detail::search(handle, index, \\ + static_cast(&search_params), \\ + queries, neighbors, distances, n_rows_per_batch); \\ + } \\ + \\ + void serialize(const raft::device_resources& handle, \\ + const index, T, IdxT>& index, \\ + const std::string& filename) \\ + { \\ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \\ + } \\ + \\ + template<> \\ + index, T, IdxT> deserialize_pq(const raft::device_resources& handle, \\ + const std::string& filename) \\ + { \\ + auto idx = index, T, IdxT>(handle, filename); \\ + return idx; \\ + } \\ + \\ + template<> \\ + index, T, IdxT> distribute_pq(const raft::device_resources& handle, \\ + const std::string& filename) \\ + { \\ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \\ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \\ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \\ + return idx; \\ + } +""" + +cagra_macro = """ +#define CUVS_INST_MG_CAGRA(T, IdxT) \\ + index, T, IdxT> build(const raft::device_resources& handle, \\ + const mg::index_params& index_params, \\ + raft::host_matrix_view index_dataset) \\ + { \\ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \\ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \\ + cuvs::neighbors::mg::detail::build(handle, index, \\ + static_cast(&index_params), \\ + index_dataset); \\ + return index; \\ + } \\ + \\ + void search(const raft::device_resources& handle, \\ + const index, T, IdxT>& index, \\ + const mg::search_params& search_params, \\ + raft::host_matrix_view queries, \\ + raft::host_matrix_view neighbors, \\ + raft::host_matrix_view distances, \\ + int64_t n_rows_per_batch) \\ + { \\ + cuvs::neighbors::mg::detail::search(handle, index, \\ + static_cast(&search_params), \\ + queries, neighbors, distances, n_rows_per_batch); \\ + } \\ + \\ + void serialize(const 
raft::device_resources& handle, \\ + const index, T, IdxT>& index, \\ + const std::string& filename) \\ + { \\ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \\ + } \\ + \\ + template<> \\ + index, T, IdxT> deserialize_cagra(const raft::device_resources& handle, \\ + const std::string& filename) \\ + { \\ + auto idx = index, T, IdxT>(handle, filename); \\ + return idx; \\ + } \\ + \\ + template<> \\ + index, T, IdxT> distribute_cagra(const raft::device_resources& handle, \\ + const std::string& filename) \\ + { \\ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \\ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \\ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \\ + return idx; \\ + } +""" + +flat_macros = dict ( + flat = dict( + include=include_macro, + definition=flat_macro, + name="CUVS_INST_MG_FLAT", + ) +) + +pq_macros = dict ( + pq = dict( + include=include_macro, + definition=pq_macro, + name="CUVS_INST_MG_PQ", + ) +) + +cagra_macros = dict ( + cagra = dict( + include=include_macro, + definition=cagra_macro, + name="CUVS_INST_MG_CAGRA", + ) +) + +flat_types = dict( + float_int64_t=("float", "int64_t"), + int8_t_int64_t=("int8_t", "int64_t"), + uint8_t_int64_t=("uint8_t", "int64_t"), +) + +pq_types = dict( + float_int64_t=("float", "int64_t"), + half_int64_t=("half", "int64_t"), + int8_t_int64_t=("int8_t", "int64_t"), + uint8_t_int64_t=("uint8_t", "int64_t"), +) + +cagra_types = dict( + float_uint32_t=("float", "uint32_t"), + half_uint32_t=("half", "uint32_t"), + int8_t_uint32_t=("int8_t", "uint32_t"), + uint8_t_uint32_t=("uint8_t", "uint32_t"), +) + +for macros, types in [(flat_macros, flat_types), (pq_macros, pq_types), (cagra_macros, cagra_types)]: + for type_path, (T, IdxT) in types.items(): + for macro_path, macro in macros.items(): + path = f"mg_{macro_path}_{type_path}.cu" + with open(path, "w") as f: + f.write(header) + f.write(macro['include']) + 
f.write(namespace_macro) + f.write(macro["definition"]) + f.write(f"{macro['name']}({T}, {IdxT});\n\n") + f.write(f"#undef {macro['name']}\n") + f.write(footer) + + print(f"src/neighbors/mg/{path}") diff --git a/cpp/src/neighbors/mg/mg.cuh b/cpp/src/neighbors/mg/mg.cuh new file mode 100644 index 000000000..d3f635bc4 --- /dev/null +++ b/cpp/src/neighbors/mg/mg.cuh @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../detail/knn_merge_parts.cuh" +#include +#include +#include +#include + +#include +#include + +namespace cuvs::neighbors { +using namespace raft; + +template +void search(const raft::device_resources& handle, + const cuvs::neighbors::iface& interface, + const cuvs::neighbors::search_params* search_params, + raft::host_matrix_view h_queries, + raft::device_matrix_view d_neighbors, + raft::device_matrix_view d_distances); +} // namespace cuvs::neighbors + +namespace cuvs::neighbors::mg { +void check_omp_threads(const int requirements); +} // namespace cuvs::neighbors::mg + +namespace cuvs::neighbors::mg::detail { +using namespace cuvs::neighbors; +using namespace raft; + +// local index deserialization and distribution +template +void deserialize_and_distribute(const raft::device_resources& handle, + index& index, + const std::string& filename) +{ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); + for (int rank = 0; 
rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + auto& ann_if = index.ann_interfaces_.emplace_back(); + cuvs::neighbors::deserialize(dev_res, ann_if, filename); + } +} + +// MG index deserialization +template +void deserialize(const raft::device_resources& handle, + index& index, + const std::string& filename) +{ + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); + + index.mode_ = (cuvs::neighbors::mg::distribution_mode)deserialize_scalar(handle, is); + index.num_ranks_ = deserialize_scalar(handle, is); + + if (index.num_ranks_ != clique.num_ranks_) { + RAFT_FAIL("Serialized index has %d ranks whereas NCCL clique has %d ranks", + index.num_ranks_, + clique.num_ranks_); + } + + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + auto& ann_if = index.ann_interfaces_.emplace_back(); + cuvs::neighbors::deserialize(dev_res, ann_if, is); + } + + is.close(); +} + +template +void build(const raft::device_resources& handle, + index& index, + const cuvs::neighbors::index_params* index_params, + raft::host_matrix_view index_dataset) +{ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); + + if (index.mode_ == REPLICATED) { + int64_t n_rows = index_dataset.extent(0); + RAFT_LOG_INFO("REPLICATED BUILD: %d*%drows", index.num_ranks_, n_rows); + + index.ann_interfaces_.resize(index.num_ranks_); +#pragma omp parallel for + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + 
RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + auto& ann_if = index.ann_interfaces_[rank]; + cuvs::neighbors::build(dev_res, ann_if, index_params, index_dataset); + resource::sync_stream(dev_res); + } + } else if (index.mode_ == SHARDED) { + int64_t n_rows = index_dataset.extent(0); + int64_t n_cols = index_dataset.extent(1); + int64_t n_rows_per_shard = raft::ceildiv(n_rows, (int64_t)index.num_ranks_); + + RAFT_LOG_INFO("SHARDED BUILD: %d*%drows", index.num_ranks_, n_rows_per_shard); + + index.ann_interfaces_.resize(index.num_ranks_); +#pragma omp parallel for + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + int64_t offset = rank * n_rows_per_shard; + int64_t n_rows_of_current_shard = std::min(n_rows_per_shard, n_rows - offset); + const T* partition_ptr = index_dataset.data_handle() + (offset * n_cols); + auto partition = raft::make_host_matrix_view( + partition_ptr, n_rows_of_current_shard, n_cols); + auto& ann_if = index.ann_interfaces_[rank]; + cuvs::neighbors::build(dev_res, ann_if, index_params, partition); + resource::sync_stream(dev_res); + } + } +} + +template +void extend(const raft::device_resources& handle, + index& index, + raft::host_matrix_view new_vectors, + std::optional> new_indices) +{ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); + + int64_t n_rows = new_vectors.extent(0); + if (index.mode_ == REPLICATED) { + RAFT_LOG_INFO("REPLICATED EXTEND: %d*%drows", index.num_ranks_, n_rows); + +#pragma omp parallel for + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + auto& ann_if = index.ann_interfaces_[rank]; + cuvs::neighbors::extend(dev_res, ann_if, new_vectors, new_indices); + 
resource::sync_stream(dev_res); + } + } else if (index.mode_ == SHARDED) { + int64_t n_cols = new_vectors.extent(1); + int64_t n_rows_per_shard = raft::ceildiv(n_rows, (int64_t)index.num_ranks_); + + RAFT_LOG_INFO("SHARDED EXTEND: %d*%drows", index.num_ranks_, n_rows_per_shard); + +#pragma omp parallel for + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + int64_t offset = rank * n_rows_per_shard; + int64_t n_rows_of_current_shard = std::min(n_rows_per_shard, n_rows - offset); + const T* new_vectors_ptr = new_vectors.data_handle() + (offset * n_cols); + auto new_vectors_part = raft::make_host_matrix_view( + new_vectors_ptr, n_rows_of_current_shard, n_cols); + + std::optional> new_indices_part = std::nullopt; + if (new_indices.has_value()) { + const IdxT* new_indices_ptr = new_indices.value().data_handle() + offset; + new_indices_part = raft::make_host_vector_view( + new_indices_ptr, n_rows_of_current_shard); + } + auto& ann_if = index.ann_interfaces_[rank]; + cuvs::neighbors::extend(dev_res, ann_if, new_vectors_part, new_indices_part); + resource::sync_stream(dev_res); + } + } +} + +template +void sharded_search_with_direct_merge(const raft::comms::nccl_clique& clique, + const index& index, + const cuvs::neighbors::search_params* search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch, + int64_t n_rows, + int64_t n_cols, + int64_t n_neighbors, + int64_t n_batches) +{ + const auto& root_handle = clique.set_current_device_to_root_rank(); + auto in_neighbors = raft::make_device_matrix( + root_handle, index.num_ranks_ * n_rows_per_batch, n_neighbors); + auto in_distances = raft::make_device_matrix( + root_handle, index.num_ranks_ * n_rows_per_batch, n_neighbors); + auto out_neighbors = + 
raft::make_device_matrix(root_handle, n_rows_per_batch, n_neighbors); + auto out_distances = + raft::make_device_matrix(root_handle, n_rows_per_batch, n_neighbors); + + for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) { + int64_t offset = batch_idx * n_rows_per_batch; + int64_t query_offset = offset * n_cols; + int64_t output_offset = offset * n_neighbors; + int64_t n_rows_of_current_batch = std::min((int64_t)n_rows_per_batch, n_rows - offset); + int64_t part_size = n_rows_of_current_batch * n_neighbors; + auto query_partition = raft::make_host_matrix_view( + queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols); + + const int& requirements = index.num_ranks_; + check_omp_threads(requirements); // should use at least num_ranks_ threads to avoid NCCL hang +#pragma omp parallel for num_threads(index.num_ranks_) + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + auto& ann_if = index.ann_interfaces_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + + if (rank == clique.root_rank_) { // root rank + uint64_t batch_offset = clique.root_rank_ * part_size; + auto d_neighbors = raft::make_device_matrix_view( + in_neighbors.data_handle() + batch_offset, n_rows_of_current_batch, n_neighbors); + auto d_distances = raft::make_device_matrix_view( + in_distances.data_handle() + batch_offset, n_rows_of_current_batch, n_neighbors); + cuvs::neighbors::search( + dev_res, ann_if, search_params, query_partition, d_neighbors, d_distances); + + // wait for other ranks + ncclGroupStart(); + for (int from_rank = 0; from_rank < index.num_ranks_; from_rank++) { + if (from_rank == clique.root_rank_) continue; + + batch_offset = from_rank * part_size; + ncclRecv(in_neighbors.data_handle() + batch_offset, + part_size * sizeof(IdxT), + ncclUint8, + from_rank, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + 
ncclRecv(in_distances.data_handle() + batch_offset, + part_size * sizeof(float), + ncclUint8, + from_rank, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + } + ncclGroupEnd(); + resource::sync_stream(dev_res); + } else { // non-root ranks + auto d_neighbors = raft::make_device_matrix( + dev_res, n_rows_of_current_batch, n_neighbors); + auto d_distances = raft::make_device_matrix( + dev_res, n_rows_of_current_batch, n_neighbors); + cuvs::neighbors::search( + dev_res, ann_if, search_params, query_partition, d_neighbors.view(), d_distances.view()); + + // send results to root rank + ncclGroupStart(); + ncclSend(d_neighbors.data_handle(), + part_size * sizeof(IdxT), + ncclUint8, + clique.root_rank_, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + ncclSend(d_distances.data_handle(), + part_size * sizeof(float), + ncclUint8, + clique.root_rank_, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + ncclGroupEnd(); + resource::sync_stream(dev_res); + } + } + + const auto& root_handle_ = clique.set_current_device_to_root_rank(); + auto h_trans = std::vector(index.num_ranks_); + int64_t translation_offset = 0; + for (int rank = 0; rank < index.num_ranks_; rank++) { + h_trans[rank] = translation_offset; + translation_offset += index.ann_interfaces_[rank].size(); + } + auto d_trans = raft::make_device_vector(root_handle_, index.num_ranks_); + raft::copy(d_trans.data_handle(), + h_trans.data(), + index.num_ranks_, + resource::get_cuda_stream(root_handle_)); + + cuvs::neighbors::detail::knn_merge_parts(in_distances.data_handle(), + in_neighbors.data_handle(), + out_distances.data_handle(), + out_neighbors.data_handle(), + n_rows_of_current_batch, + index.num_ranks_, + n_neighbors, + resource::get_cuda_stream(root_handle_), + d_trans.data_handle()); + + raft::copy(neighbors.data_handle() + output_offset, + out_neighbors.data_handle(), + part_size, + resource::get_cuda_stream(root_handle_)); + 
raft::copy(distances.data_handle() + output_offset, + out_distances.data_handle(), + part_size, + resource::get_cuda_stream(root_handle_)); + + resource::sync_stream(root_handle_); + } +} + +template +void sharded_search_with_tree_merge(const raft::comms::nccl_clique& clique, + const index& index, + const cuvs::neighbors::search_params* search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch, + int64_t n_rows, + int64_t n_cols, + int64_t n_neighbors, + int64_t n_batches) +{ + for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) { + int64_t offset = batch_idx * n_rows_per_batch; + int64_t query_offset = offset * n_cols; + int64_t output_offset = offset * n_neighbors; + int64_t n_rows_of_current_batch = std::min((int64_t)n_rows_per_batch, n_rows - offset); + auto query_partition = raft::make_host_matrix_view( + queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols); + + const int& requirements = index.num_ranks_; + check_omp_threads(requirements); // should use at least num_ranks_ threads to avoid NCCL hang +#pragma omp parallel for num_threads(index.num_ranks_) + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + auto& ann_if = index.ann_interfaces_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + + int64_t part_size = n_rows_of_current_batch * n_neighbors; + + auto tmp_neighbors = raft::make_device_matrix( + dev_res, 2 * n_rows_of_current_batch, n_neighbors); + auto tmp_distances = raft::make_device_matrix( + dev_res, 2 * n_rows_of_current_batch, n_neighbors); + auto neighbors_view = raft::make_device_matrix_view( + tmp_neighbors.data_handle(), n_rows_of_current_batch, n_neighbors); + auto distances_view = raft::make_device_matrix_view( + tmp_distances.data_handle(), n_rows_of_current_batch, n_neighbors); + 
cuvs::neighbors::search( + dev_res, ann_if, search_params, query_partition, neighbors_view, distances_view); + + int64_t translation_offset = 0; + for (int r = 0; r < rank; r++) { + translation_offset += index.ann_interfaces_[r].size(); + } + raft::linalg::addScalar(neighbors_view.data_handle(), + neighbors_view.data_handle(), + (IdxT)translation_offset, + part_size, + resource::get_cuda_stream(dev_res)); + + auto d_trans = raft::make_device_vector(dev_res, 2); + cudaMemsetAsync( + d_trans.data_handle(), 0, 2 * sizeof(IdxT), resource::get_cuda_stream(dev_res)); + + int64_t remaining = index.num_ranks_; + int64_t radix = 2; + + while (remaining > 1) { + bool received_something = false; + int64_t offset = radix / 2; + ncclGroupStart(); + if (rank % radix == 0) // This is one of the receivers + { + int other_id = rank + offset; + if (other_id < index.num_ranks_) // Make sure someone's sending anything + { + ncclRecv(tmp_neighbors.data_handle() + part_size, + part_size * sizeof(IdxT), + ncclUint8, + other_id, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + ncclRecv(tmp_distances.data_handle() + part_size, + part_size * sizeof(float), + ncclUint8, + other_id, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + received_something = true; + } + } else if (rank % radix == offset) // This is one of the senders + { + int other_id = rank - offset; + ncclSend(tmp_neighbors.data_handle(), + part_size * sizeof(IdxT), + ncclUint8, + other_id, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + ncclSend(tmp_distances.data_handle(), + part_size * sizeof(float), + ncclUint8, + other_id, + clique.nccl_comms_[rank], + resource::get_cuda_stream(dev_res)); + } + ncclGroupEnd(); + + remaining = (remaining + 1) / 2; + radix *= 2; + + if (received_something) { + // merge inplace + cuvs::neighbors::detail::knn_merge_parts(tmp_distances.data_handle(), + tmp_neighbors.data_handle(), + tmp_distances.data_handle(), + 
tmp_neighbors.data_handle(), + n_rows_of_current_batch, + 2, + n_neighbors, + resource::get_cuda_stream(dev_res), + d_trans.data_handle()); + + // If done, copy the final result + if (remaining <= 1) { + raft::copy(neighbors.data_handle() + output_offset, + tmp_neighbors.data_handle(), + part_size, + resource::get_cuda_stream(dev_res)); + raft::copy(distances.data_handle() + output_offset, + tmp_distances.data_handle(), + part_size, + resource::get_cuda_stream(dev_res)); + + resource::sync_stream(dev_res); + } + } + } + } + } +} + +template +void run_search_batch(const raft::comms::nccl_clique& clique, + const index& index, + int rank, + const cuvs::neighbors::search_params* search_params, + raft::host_matrix_view& queries, + raft::host_matrix_view& neighbors, + raft::host_matrix_view& distances, + int64_t query_offset, + int64_t output_offset, + int64_t n_rows_of_current_batch, + int64_t n_cols, + int64_t n_neighbors) +{ + int dev_id = clique.device_ids_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + const raft::device_resources& dev_res = clique.device_resources_[rank]; + auto& ann_if = index.ann_interfaces_[rank]; + + auto query_partition = raft::make_host_matrix_view( + queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols); + auto d_neighbors = raft::make_device_matrix( + dev_res, n_rows_of_current_batch, n_neighbors); + auto d_distances = raft::make_device_matrix( + dev_res, n_rows_of_current_batch, n_neighbors); + + cuvs::neighbors::search( + dev_res, ann_if, search_params, query_partition, d_neighbors.view(), d_distances.view()); + + raft::copy(neighbors.data_handle() + output_offset, + d_neighbors.data_handle(), + n_rows_of_current_batch * n_neighbors, + resource::get_cuda_stream(dev_res)); + raft::copy(distances.data_handle() + output_offset, + d_distances.data_handle(), + n_rows_of_current_batch * n_neighbors, + resource::get_cuda_stream(dev_res)); + + resource::sync_stream(dev_res); +} + +template +void search(const 
raft::device_resources& handle, + const index& index, + const cuvs::neighbors::search_params* search_params, + raft::host_matrix_view queries, + raft::host_matrix_view neighbors, + raft::host_matrix_view distances, + int64_t n_rows_per_batch) +{ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); + + int64_t n_rows = queries.extent(0); + int64_t n_cols = queries.extent(1); + int64_t n_neighbors = neighbors.extent(1); + + if (index.mode_ == REPLICATED) { + cuvs::neighbors::mg::replicated_search_mode search_mode; + if constexpr (std::is_same>::value) { + const cuvs::neighbors::mg::search_params* mg_search_params = + static_cast*>( + search_params); + search_mode = mg_search_params->search_mode; + } else if constexpr (std::is_same>::value) { + const cuvs::neighbors::mg::search_params* mg_search_params = + static_cast*>( + search_params); + search_mode = mg_search_params->search_mode; + } else if constexpr (std::is_same>::value) { + const cuvs::neighbors::mg::search_params* mg_search_params = + static_cast*>(search_params); + search_mode = mg_search_params->search_mode; + } + + if (search_mode == LOAD_BALANCER) { + int64_t n_rows_per_rank = raft::ceildiv(n_rows, (int64_t)index.num_ranks_); + n_rows_per_batch = + std::min(n_rows_per_batch, n_rows_per_rank); // get at least num_ranks_ batches + int64_t n_batches = raft::ceildiv(n_rows, (int64_t)n_rows_per_batch); + if (n_batches <= 1) n_rows_per_batch = n_rows; + + RAFT_LOG_INFO( + "REPLICATED SEARCH IN LOAD BALANCER MODE: %d*%drows", n_batches, n_rows_per_batch); + +#pragma omp parallel for + for (int64_t batch_idx = 0; batch_idx < n_batches; batch_idx++) { + int rank = batch_idx % index.num_ranks_; // alternate GPUs + int64_t offset = batch_idx * n_rows_per_batch; + int64_t query_offset = offset * n_cols; + int64_t output_offset = offset * n_neighbors; + int64_t n_rows_of_current_batch = std::min(n_rows_per_batch, n_rows - offset); + + run_search_batch(clique, + index, + rank, + 
search_params, + queries, + neighbors, + distances, + query_offset, + output_offset, + n_rows_of_current_batch, + n_cols, + n_neighbors); + } + } else if (search_mode == ROUND_ROBIN) { + RAFT_LOG_INFO("REPLICATED SEARCH IN ROUND ROBIN MODE: %d*%drows", 1, n_rows); + + ASSERT(n_rows <= n_rows_per_batch, + "In round-robin mode, n_rows must lower or equal to n_rows_per_batch"); + + auto& rrc = *index.round_robin_counter_; + int64_t rank = rrc++; + rank %= index.num_ranks_; + + run_search_batch(clique, + index, + rank, + search_params, + queries, + neighbors, + distances, + 0, + 0, + n_rows, + n_cols, + n_neighbors); + } + } else if (index.mode_ == SHARDED) { + cuvs::neighbors::mg::sharded_merge_mode merge_mode; + if constexpr (std::is_same>::value) { + const cuvs::neighbors::mg::search_params* mg_search_params = + static_cast*>( + search_params); + merge_mode = mg_search_params->merge_mode; + } else if constexpr (std::is_same>::value) { + const cuvs::neighbors::mg::search_params* mg_search_params = + static_cast*>( + search_params); + merge_mode = mg_search_params->merge_mode; + } else if constexpr (std::is_same>::value) { + const cuvs::neighbors::mg::search_params* mg_search_params = + static_cast*>(search_params); + merge_mode = mg_search_params->merge_mode; + } + + int64_t n_batches = raft::ceildiv(n_rows, (int64_t)n_rows_per_batch); + if (n_batches <= 1) n_rows_per_batch = n_rows; + + if (merge_mode == MERGE_ON_ROOT_RANK) { + RAFT_LOG_INFO("SHARDED SEARCH WITH MERGE_ON_ROOT_RANK MERGE MODE: %d*%drows", + n_batches, + n_rows_per_batch); + sharded_search_with_direct_merge(clique, + index, + search_params, + queries, + neighbors, + distances, + n_rows_per_batch, + n_rows, + n_cols, + n_neighbors, + n_batches); + } else if (merge_mode == TREE_MERGE) { + RAFT_LOG_INFO( + "SHARDED SEARCH WITH TREE_MERGE MERGE MODE %d*%drows", n_batches, n_rows_per_batch); + sharded_search_with_tree_merge(clique, + index, + search_params, + queries, + neighbors, + distances, + 
n_rows_per_batch, + n_rows, + n_cols, + n_neighbors, + n_batches); + } + } +} + +template +void serialize(const raft::device_resources& handle, + const index& index, + const std::string& filename) +{ + std::ofstream of(filename, std::ios::out | std::ios::binary); + if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); + + serialize_scalar(handle, of, (int)index.mode_); + serialize_scalar(handle, of, index.num_ranks_); + + for (int rank = 0; rank < index.num_ranks_; rank++) { + int dev_id = clique.device_ids_[rank]; + const raft::device_resources& dev_res = clique.device_resources_[rank]; + RAFT_CUDA_TRY(cudaSetDevice(dev_id)); + auto& ann_if = index.ann_interfaces_[rank]; + cuvs::neighbors::serialize(dev_res, ann_if, of); + } + + of.close(); + if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); } +} + +} // namespace cuvs::neighbors::mg::detail + +namespace cuvs::neighbors::mg { +using namespace cuvs::neighbors; +using namespace raft; + +template +index::index(distribution_mode mode, int num_ranks_) + : mode_(mode), + num_ranks_(num_ranks_), + round_robin_counter_(std::make_shared>(0)) +{ +} + +template +index::index(const raft::device_resources& handle, + const std::string& filename) + : round_robin_counter_(std::make_shared>(0)) +{ + cuvs::neighbors::mg::detail::deserialize(handle, *this, filename); +} +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_cagra_float_uint32_t.cu b/cpp/src/neighbors/mg/mg_cagra_float_uint32_t.cu new file mode 100644 index 000000000..b11610fb4 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_cagra_float_uint32_t.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + 
\ + template <> \ + index, T, IdxT> distribute_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_CAGRA(float, uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_cagra_half_uint32_t.cu b/cpp/src/neighbors/mg/mg_cagra_half_uint32_t.cu new file mode 100644 index 000000000..8f76c69a3 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_cagra_half_uint32_t.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_CAGRA(half, uint32_t); + 
+#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu b/cpp/src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu new file mode 100644 index 000000000..67b88d742 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + 
static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_CAGRA(int8_t, uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu b/cpp/src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu new file mode 100644 index 000000000..f72174923 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_CAGRA(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_cagra( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_CAGRA(uint8_t, 
uint32_t); + +#undef CUVS_INST_MG_CAGRA + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_flat_float_int64_t.cu b/cpp/src/neighbors/mg/mg_flat_float_int64_t.cu new file mode 100644 index 000000000..4495e2527 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_flat_float_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_FLAT(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const 
mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_flat( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_flat( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_FLAT(float, int64_t); + +#undef CUVS_INST_MG_FLAT + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_flat_int8_t_int64_t.cu b/cpp/src/neighbors/mg/mg_flat_int8_t_int64_t.cu new file mode 100644 index 000000000..5494414a6 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_flat_int8_t_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_FLAT(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + 
cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_flat( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_flat( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_FLAT(int8_t, int64_t); + +#undef CUVS_INST_MG_FLAT + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_flat_uint8_t_int64_t.cu b/cpp/src/neighbors/mg/mg_flat_uint8_t_int64_t.cu new file mode 100644 index 000000000..35df2146b --- /dev/null +++ b/cpp/src/neighbors/mg/mg_flat_uint8_t_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_FLAT(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_flat( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_flat( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = 
raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_FLAT(uint8_t, int64_t); + +#undef CUVS_INST_MG_FLAT + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_pq_float_int64_t.cu b/cpp/src/neighbors/mg/mg_pq_float_int64_t.cu new file mode 100644 index 000000000..c671740e6 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_pq_float_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_pq( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_pq(const raft::device_resources& handle, \ + const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = 
raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_PQ(float, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_pq_half_int64_t.cu b/cpp/src/neighbors/mg/mg_pq_half_int64_t.cu new file mode 100644 index 000000000..b167239c6 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_pq_half_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_pq( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_pq(const raft::device_resources& handle, \ + const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = 
raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_PQ(half, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_pq_int8_t_int64_t.cu b/cpp/src/neighbors/mg/mg_pq_int8_t_int64_t.cu new file mode 100644 index 000000000..127baf8fd --- /dev/null +++ b/cpp/src/neighbors/mg/mg_pq_int8_t_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_pq( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_pq(const raft::device_resources& handle, \ + const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = 
raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_PQ(int8_t, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/mg_pq_uint8_t_int64_t.cu b/cpp/src/neighbors/mg/mg_pq_uint8_t_int64_t.cu new file mode 100644 index 000000000..869e009a5 --- /dev/null +++ b/cpp/src/neighbors/mg/mg_pq_uint8_t_int64_t.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by generate_mg.py + * + * Make changes there and run in this directory: + * + * > python generate_mg.py + * + */ + +#include "mg.cuh" + +namespace cuvs::neighbors::mg { + +#define CUVS_INST_MG_PQ(T, IdxT) \ + index, T, IdxT> build( \ + const raft::device_resources& handle, \ + const mg::index_params& index_params, \ + raft::host_matrix_view index_dataset) \ + { \ + const raft::comms::nccl_clique& clique = raft::resource::get_nccl_clique(handle); \ + index, T, IdxT> index(index_params.mode, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::build( \ + handle, \ + index, \ + static_cast(&index_params), \ + index_dataset); \ + return index; \ + } \ + \ + void extend(const raft::device_resources& handle, \ + index, T, IdxT>& index, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices) \ + { \ + cuvs::neighbors::mg::detail::extend(handle, index, new_vectors, new_indices); \ + } \ + \ + void search(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const mg::search_params& search_params, \ + raft::host_matrix_view queries, \ + raft::host_matrix_view neighbors, \ + raft::host_matrix_view distances, \ + int64_t n_rows_per_batch) \ + { \ + cuvs::neighbors::mg::detail::search( \ + handle, \ + index, \ + static_cast(&search_params), \ + queries, \ + neighbors, \ + distances, \ + n_rows_per_batch); \ + } \ + \ + void serialize(const raft::device_resources& handle, \ + const index, T, IdxT>& index, \ + const std::string& filename) \ + { \ + cuvs::neighbors::mg::detail::serialize(handle, index, filename); \ + } \ + \ + template <> \ + index, T, IdxT> deserialize_pq( \ + const raft::device_resources& handle, const std::string& filename) \ + { \ + auto idx = index, T, IdxT>(handle, filename); \ + return idx; \ + } \ + \ + template <> \ + index, T, IdxT> distribute_pq(const raft::device_resources& handle, \ + const std::string& filename) \ + { \ + const raft::comms::nccl_clique& clique = 
raft::resource::get_nccl_clique(handle); \ + auto idx = index, T, IdxT>(REPLICATED, clique.num_ranks_); \ + cuvs::neighbors::mg::detail::deserialize_and_distribute(handle, idx, filename); \ + return idx; \ + } +CUVS_INST_MG_PQ(uint8_t, int64_t); + +#undef CUVS_INST_MG_PQ + +} // namespace cuvs::neighbors::mg diff --git a/cpp/src/neighbors/mg/nccl_comm.cpp b/cpp/src/neighbors/mg/nccl_comm.cpp new file mode 100644 index 000000000..c4556957a --- /dev/null +++ b/cpp/src/neighbors/mg/nccl_comm.cpp @@ -0,0 +1,8 @@ +#include +#include + +namespace raft::comms { +void build_comms_nccl_only(raft::resources* handle, ncclComm_t nccl_comm, int num_ranks, int rank) +{ +} +} // namespace raft::comms diff --git a/cpp/src/neighbors/mg/omp_checks.cpp b/cpp/src/neighbors/mg/omp_checks.cpp new file mode 100644 index 000000000..e09182dfe --- /dev/null +++ b/cpp/src/neighbors/mg/omp_checks.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cuvs::neighbors::mg { +using raft::RAFT_NAME; + +void check_omp_threads(const int requirements) +{ + const int max_threads = omp_get_max_threads(); + if (max_threads < requirements) + RAFT_LOG_WARN( + "OpenMP is only allowed %d threads to run %d GPUs. 
Please increase the number of OpenMP " + "threads to avoid NCCL hangs by modifying the environment variable OMP_NUM_THREADS.", + max_threads, + requirements); +} + +} // namespace cuvs::neighbors::mg diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index bd07bebee..f4d35e438 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -22,7 +22,7 @@ rapids_test_init() function(ConfigureTest) set(options OPTIONAL NOCUDA C_LIB) - set(oneValueArgs NAME GPUS PERCENT) + set(oneValueArgs NAME GPUS PERCENT ADDITIONAL_DEP) set(multiValueArgs PATH TARGETS CONFIGURATIONS) cmake_parse_arguments(_CUVS_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -56,6 +56,7 @@ function(ConfigureTest) $ $ $<$:cuvs::c_api> + ${_CUVS_TEST_ADDITIONAL_DEP} ) set_target_properties( ${TEST_NAME} @@ -159,7 +160,7 @@ if(BUILD_TESTS) 100 ) - ConfigureTest( + ConfigureTest( NAME NEIGHBORS_ANN_VAMANA_TEST PATH @@ -178,6 +179,12 @@ if(BUILD_TESTS) target_compile_definitions(NEIGHBORS_HNSW_TEST PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() + if(BUILD_MG_ALGOS) + ConfigureTest( + NAME NEIGHBORS_MG_TEST PATH neighbors/mg/test_float.cu GPUS 1 PERCENT 100 ADDITIONAL_DEP nccl + ) + endif() + ConfigureTest( NAME DISTANCE_TEST diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index f02568b74..fd4e330db 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -282,6 +282,8 @@ class ivf_pq_test : public ::testing::TestWithParam { uint32_t n_take, uint32_t n_skip) { + // the original data cannot be reconstructed since the dataset was normalized + if (index.metric() == cuvs::distance::DistanceType::CosineExpanded) { return; } auto& rec_list = index.lists()[label]; auto dim = index.dim(); n_take = std::min(n_take, rec_list->size.load()); @@ -313,6 +315,7 @@ class ivf_pq_test : public ::testing::TestWithParam { auto old_list = index->lists()[label]; auto n_rows = old_list->size.load(); if (n_rows == 0) { return; } 
+ if (index->metric() == cuvs::distance::DistanceType::CosineExpanded) { return; } auto vectors_1 = raft::make_device_matrix(handle_, n_rows, index->dim()); auto indices = raft::make_device_vector(handle_, n_rows); @@ -374,7 +377,7 @@ class ivf_pq_test : public ::testing::TestWithParam { cuvs::Compare{})); // Pack a few vectors back to the list. - int row_offset = 9; + int row_offset = 5; int n_vec = 3; ASSERT_TRUE(row_offset + n_vec < n_rows); size_t offset = row_offset * index->pq_dim(); @@ -884,6 +887,25 @@ inline auto enum_variety_l2sqrt() -> test_cases_t }); } +inline auto enum_variety_cosine() -> test_cases_t +{ + return map(enum_variety(), [](const ivf_pq_inputs& x) { + ivf_pq_inputs y(x); + if (y.min_recall.has_value()) { + if (y.search_params.lut_dtype == CUDA_R_8U) { + // TODO: Increase this recall threshold for 8 bit lut + // (https://github.com/rapidsai/cuvs/issues/390) + y.min_recall = y.min_recall.value() * 0.70; + } else { + // In other cases it seems to perform a little bit better, still worse than L2 + y.min_recall = y.min_recall.value() * 0.94; + } + } + y.index_params.metric = distance::DistanceType::CosineExpanded; + return y; + }); +} + /** * Try different number of n_probes, some of which may trigger the non-fused version of the search * kernel. 
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu index cdc6c1b7e..834fdb3d0 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_float_int64_t.cu @@ -25,9 +25,13 @@ TEST_BUILD_HOST_INPUT_SEARCH(f32_f32_i64) TEST_BUILD_HOST_INPUT_OVERLAP_SEARCH(f32_f32_i64) TEST_BUILD_EXTEND_SEARCH(f32_f32_i64) TEST_BUILD_SERIALIZE_SEARCH(f32_f32_i64) -INSTANTIATE(f32_f32_i64, defaults() + small_dims() + big_dims_moderate_lut()); +INSTANTIATE(f32_f32_i64, + defaults() + small_dims() + big_dims_moderate_lut() + enum_variety_l2() + + enum_variety_l2sqrt() + enum_variety_ip() + enum_variety_cosine()); TEST_BUILD_SEARCH(f32_f32_i64_filter) -INSTANTIATE(f32_f32_i64_filter, defaults() + small_dims() + big_dims_moderate_lut()); +INSTANTIATE(f32_f32_i64_filter, + defaults() + small_dims() + big_dims_moderate_lut() + enum_variety_l2() + + enum_variety_l2sqrt() + enum_variety_ip() + enum_variety_cosine()); } // namespace cuvs::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu index 80b0e2ccb..c9e5d4f01 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu @@ -25,8 +25,9 @@ TEST_BUILD_SEARCH(f32_i08_i64) TEST_BUILD_HOST_INPUT_SEARCH(f32_i08_i64) TEST_BUILD_HOST_INPUT_OVERLAP_SEARCH(f32_i08_i64) TEST_BUILD_SERIALIZE_SEARCH(f32_i08_i64) -INSTANTIATE(f32_i08_i64, defaults() + big_dims() + var_k()); +INSTANTIATE(f32_i08_i64, defaults() + big_dims() + var_k() + enum_variety_l2() + enum_variety_ip()); TEST_BUILD_SEARCH(f32_i08_i64_filter) -INSTANTIATE(f32_i08_i64_filter, defaults() + big_dims() + var_k()); +INSTANTIATE(f32_i08_i64_filter, + defaults() + big_dims() + var_k() + enum_variety_l2() + enum_variety_ip()); } // namespace cuvs::neighbors::ivf_pq diff --git a/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu 
b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu index 0216a1e80..6e0732227 100644 --- a/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu @@ -25,8 +25,12 @@ TEST_BUILD_SEARCH(f32_u08_i64) TEST_BUILD_HOST_INPUT_SEARCH(f32_u08_i64) TEST_BUILD_HOST_INPUT_OVERLAP_SEARCH(f32_u08_i64) TEST_BUILD_EXTEND_SEARCH(f32_u08_i64) -INSTANTIATE(f32_u08_i64, small_dims_per_cluster() + enum_variety()); +INSTANTIATE(f32_u08_i64, + small_dims_per_cluster() + enum_variety() + enum_variety_l2() + enum_variety_l2sqrt() + + enum_variety_ip()); TEST_BUILD_SEARCH(f32_u08_i64_filter) -INSTANTIATE(f32_u08_i64_filter, small_dims_per_cluster() + enum_variety()); +INSTANTIATE(f32_u08_i64_filter, + small_dims_per_cluster() + enum_variety() + enum_variety_l2() + enum_variety_l2sqrt() + + enum_variety_ip()); } // namespace cuvs::neighbors::ivf_pq diff --git a/cpp/test/neighbors/mg.cuh b/cpp/test/neighbors/mg.cuh new file mode 100644 index 000000000..be30ca615 --- /dev/null +++ b/cpp/test/neighbors/mg.cuh @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "../test_utils.cuh" +#include "ann_utils.cuh" +#include "naive_knn.cuh" + +#include +#include + +namespace cuvs::neighbors::mg { + +enum class algo_t { IVF_FLAT, IVF_PQ, CAGRA }; +enum class d_mode_t { REPLICATED, SHARDED, LOCAL_THEN_DISTRIBUTED, ROUND_ROBIN }; +enum class m_mode_t { MERGE_ON_ROOT_RANK, TREE_MERGE, UNDEFINED }; + +struct AnnMGInputs { + int64_t num_queries; + int64_t num_db_vecs; + int64_t dim; + int64_t k; + d_mode_t d_mode; + m_mode_t m_mode; + algo_t algo; + int64_t nprobe; + int64_t nlist; + cuvs::distance::DistanceType metric; + bool adaptive_centers; +}; + +template +class AnnMGTest : public ::testing::TestWithParam { + public: + AnnMGTest() + : stream_(resource::get_cuda_stream(handle_)), + clique_(raft::resource::get_nccl_clique(handle_)), + ps(::testing::TestWithParam::GetParam()), + d_index_dataset(0, stream_), + d_queries(0, stream_), + h_index_dataset(0), + h_queries(0) + { + } + + void testAnnMG() + { + size_t queries_size = ps.num_queries * ps.k; + std::vector neighbors_ref(queries_size); + std::vector distances_ref(queries_size); + std::vector neighbors_snmg_ann(queries_size); + std::vector distances_snmg_ann(queries_size); + std::vector neighbors_ref_32bits(queries_size); + std::vector neighbors_snmg_ann_32bits(queries_size); + + { + rmm::device_uvector distances_ref_dev(queries_size, stream_); + rmm::device_uvector neighbors_ref_dev(queries_size, stream_); + cuvs::neighbors::naive_knn(handle_, + distances_ref_dev.data(), + neighbors_ref_dev.data(), + d_queries.data(), + d_index_dataset.data(), + ps.num_queries, + ps.num_db_vecs, + ps.dim, + ps.k, + ps.metric); + update_host(distances_ref.data(), distances_ref_dev.data(), queries_size, stream_); + update_host(neighbors_ref.data(), neighbors_ref_dev.data(), queries_size, stream_); + resource::sync_stream(handle_); + } + + int64_t n_rows_per_search_batch = 3000; // [3000, 3000, 1000] == 7000 rows + + // IVF-Flat + if (ps.algo == algo_t::IVF_FLAT && + 
(ps.d_mode == d_mode_t::REPLICATED || ps.d_mode == d_mode_t::SHARDED)) { + distribution_mode d_mode; + if (ps.d_mode == d_mode_t::REPLICATED) + d_mode = distribution_mode::REPLICATED; + else + d_mode = distribution_mode::SHARDED; + + mg::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.adaptive_centers = ps.adaptive_centers; + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + index_params.mode = d_mode; + + mg::search_params search_params; + search_params.n_probes = ps.nprobe; + search_params.search_mode = LOAD_BALANCER; + + auto index_dataset = raft::make_host_matrix_view( + h_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto queries = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + auto neighbors = raft::make_host_matrix_view( + neighbors_snmg_ann.data(), ps.num_queries, ps.k); + auto distances = raft::make_host_matrix_view( + distances_snmg_ann.data(), ps.num_queries, ps.k); + + { + auto index = cuvs::neighbors::mg::build(handle_, index_params, index_dataset); + cuvs::neighbors::mg::extend(handle_, index, index_dataset, std::nullopt); + cuvs::neighbors::mg::serialize(handle_, index, "mg_ivf_flat_index"); + } + auto new_index = + cuvs::neighbors::mg::deserialize_flat(handle_, "mg_ivf_flat_index"); + + if (ps.m_mode == m_mode_t::MERGE_ON_ROOT_RANK) + search_params.merge_mode = MERGE_ON_ROOT_RANK; + else + search_params.merge_mode = TREE_MERGE; + cuvs::neighbors::mg::search( + handle_, new_index, search_params, queries, neighbors, distances, n_rows_per_search_batch); + resource::sync_stream(handle_); + + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + ASSERT_TRUE(eval_neighbours(neighbors_ref, + neighbors_snmg_ann, + distances_ref, + distances_snmg_ann, + ps.num_queries, + ps.k, + 0.001, + min_recall)); + std::fill(neighbors_snmg_ann.begin(), neighbors_snmg_ann.end(), 0); + 
std::fill(distances_snmg_ann.begin(), distances_snmg_ann.end(), 0); + } + + // IVF-PQ + if (ps.algo == algo_t::IVF_PQ && + (ps.d_mode == d_mode_t::REPLICATED || ps.d_mode == d_mode_t::SHARDED)) { + distribution_mode d_mode; + if (ps.d_mode == d_mode_t::REPLICATED) + d_mode = distribution_mode::REPLICATED; + else + d_mode = distribution_mode::SHARDED; + + mg::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + index_params.mode = d_mode; + + mg::search_params search_params; + search_params.n_probes = ps.nprobe; + search_params.search_mode = LOAD_BALANCER; + + auto index_dataset = raft::make_host_matrix_view( + h_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto queries = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + auto neighbors = raft::make_host_matrix_view( + neighbors_snmg_ann.data(), ps.num_queries, ps.k); + auto distances = raft::make_host_matrix_view( + distances_snmg_ann.data(), ps.num_queries, ps.k); + + { + auto index = cuvs::neighbors::mg::build(handle_, index_params, index_dataset); + cuvs::neighbors::mg::extend(handle_, index, index_dataset, std::nullopt); + cuvs::neighbors::mg::serialize(handle_, index, "mg_ivf_pq_index"); + } + auto new_index = + cuvs::neighbors::mg::deserialize_pq(handle_, "mg_ivf_pq_index"); + + if (ps.m_mode == m_mode_t::MERGE_ON_ROOT_RANK) + search_params.merge_mode = MERGE_ON_ROOT_RANK; + else + search_params.merge_mode = TREE_MERGE; + cuvs::neighbors::mg::search( + handle_, new_index, search_params, queries, neighbors, distances, n_rows_per_search_batch); + resource::sync_stream(handle_); + + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + ASSERT_TRUE(eval_neighbours(neighbors_ref, + neighbors_snmg_ann, + distances_ref, + distances_snmg_ann, + ps.num_queries, + ps.k, + 0.001, + min_recall)); + 
std::fill(neighbors_snmg_ann.begin(), neighbors_snmg_ann.end(), 0); + std::fill(distances_snmg_ann.begin(), distances_snmg_ann.end(), 0); + } + + // CAGRA + if (ps.algo == algo_t::CAGRA && + (ps.d_mode == d_mode_t::REPLICATED || ps.d_mode == d_mode_t::SHARDED)) { + distribution_mode d_mode; + if (ps.d_mode == d_mode_t::REPLICATED) + d_mode = distribution_mode::REPLICATED; + else + d_mode = distribution_mode::SHARDED; + + mg::index_params index_params; + index_params.graph_build_params = cagra::graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.num_db_vecs, ps.dim)); + index_params.mode = d_mode; + + mg::search_params search_params; + + auto index_dataset = raft::make_host_matrix_view( + h_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto queries = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + auto neighbors = raft::make_host_matrix_view( + neighbors_snmg_ann_32bits.data(), ps.num_queries, ps.k); + auto distances = raft::make_host_matrix_view( + distances_snmg_ann.data(), ps.num_queries, ps.k); + + { + auto index = cuvs::neighbors::mg::build(handle_, index_params, index_dataset); + cuvs::neighbors::mg::serialize(handle_, index, "mg_cagra_index"); + } + auto new_index = + cuvs::neighbors::mg::deserialize_cagra(handle_, "mg_cagra_index"); + + if (ps.m_mode == m_mode_t::MERGE_ON_ROOT_RANK) + search_params.merge_mode = MERGE_ON_ROOT_RANK; + else + search_params.merge_mode = TREE_MERGE; + cuvs::neighbors::mg::search( + handle_, new_index, search_params, queries, neighbors, distances, n_rows_per_search_batch); + resource::sync_stream(handle_); + + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + ASSERT_TRUE(eval_neighbours(neighbors_ref_32bits, + neighbors_snmg_ann_32bits, + distances_ref, + distances_snmg_ann, + ps.num_queries, + ps.k, + 0.001, + min_recall)); + std::fill(neighbors_snmg_ann_32bits.begin(), neighbors_snmg_ann_32bits.end(), 0); + std::fill(distances_snmg_ann.begin(), distances_snmg_ann.end(), 
0); + } + + if (ps.algo == algo_t::IVF_FLAT && ps.d_mode == d_mode_t::LOCAL_THEN_DISTRIBUTED) { + ivf_flat::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.adaptive_centers = ps.adaptive_centers; + index_params.add_data_on_build = true; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + + mg::search_params search_params; + search_params.n_probes = ps.nprobe; + search_params.search_mode = LOAD_BALANCER; + + { + auto index_dataset = raft::make_device_matrix_view( + d_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto index = cuvs::neighbors::ivf_flat::build(handle_, index_params, index_dataset); + ivf_flat::serialize(handle_, "local_ivf_flat_index", index); + } + + auto queries = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + auto neighbors = raft::make_host_matrix_view( + neighbors_snmg_ann.data(), ps.num_queries, ps.k); + auto distances = raft::make_host_matrix_view( + distances_snmg_ann.data(), ps.num_queries, ps.k); + + auto distributed_index = + cuvs::neighbors::mg::distribute_flat(handle_, "local_ivf_flat_index"); + search_params.merge_mode = TREE_MERGE; + cuvs::neighbors::mg::search(handle_, + distributed_index, + search_params, + queries, + neighbors, + distances, + n_rows_per_search_batch); + + resource::sync_stream(handle_); + + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + ASSERT_TRUE(eval_neighbours(neighbors_ref, + neighbors_snmg_ann, + distances_ref, + distances_snmg_ann, + ps.num_queries, + ps.k, + 0.001, + min_recall)); + std::fill(neighbors_snmg_ann.begin(), neighbors_snmg_ann.end(), 0); + std::fill(distances_snmg_ann.begin(), distances_snmg_ann.end(), 0); + } + + if (ps.algo == algo_t::IVF_PQ && ps.d_mode == d_mode_t::LOCAL_THEN_DISTRIBUTED) { + ivf_pq::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.add_data_on_build = true; + 
index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + + mg::search_params search_params; + search_params.n_probes = ps.nprobe; + search_params.search_mode = LOAD_BALANCER; + + { + auto index_dataset = raft::make_device_matrix_view( + d_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto index = cuvs::neighbors::ivf_pq::build(handle_, index_params, index_dataset); + ivf_pq::serialize(handle_, "local_ivf_pq_index", index); + } + + auto queries = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + auto neighbors = raft::make_host_matrix_view( + neighbors_snmg_ann.data(), ps.num_queries, ps.k); + auto distances = raft::make_host_matrix_view( + distances_snmg_ann.data(), ps.num_queries, ps.k); + + auto distributed_index = + cuvs::neighbors::mg::distribute_pq(handle_, "local_ivf_pq_index"); + search_params.merge_mode = TREE_MERGE; + cuvs::neighbors::mg::search(handle_, + distributed_index, + search_params, + queries, + neighbors, + distances, + n_rows_per_search_batch); + + resource::sync_stream(handle_); + + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + ASSERT_TRUE(eval_neighbours(neighbors_ref, + neighbors_snmg_ann, + distances_ref, + distances_snmg_ann, + ps.num_queries, + ps.k, + 0.001, + min_recall)); + std::fill(neighbors_snmg_ann.begin(), neighbors_snmg_ann.end(), 0); + std::fill(distances_snmg_ann.begin(), distances_snmg_ann.end(), 0); + } + + if (ps.algo == algo_t::CAGRA && ps.d_mode == d_mode_t::LOCAL_THEN_DISTRIBUTED) { + cagra::index_params index_params; + index_params.graph_build_params = cagra::graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.num_db_vecs, ps.dim)); + + mg::search_params search_params; + + { + auto index_dataset = raft::make_device_matrix_view( + d_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto index = cuvs::neighbors::cagra::build(handle_, index_params, index_dataset); + cuvs::neighbors::cagra::serialize(handle_, "local_cagra_index", index); + } + + 
auto queries = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + auto neighbors = raft::make_host_matrix_view( + neighbors_snmg_ann_32bits.data(), ps.num_queries, ps.k); + auto distances = raft::make_host_matrix_view( + distances_snmg_ann.data(), ps.num_queries, ps.k); + + auto distributed_index = + cuvs::neighbors::mg::distribute_cagra(handle_, "local_cagra_index"); + + search_params.merge_mode = TREE_MERGE; + cuvs::neighbors::mg::search(handle_, + distributed_index, + search_params, + queries, + neighbors, + distances, + n_rows_per_search_batch); + + resource::sync_stream(handle_); + + double min_recall = static_cast(ps.nprobe) / static_cast(ps.nlist); + ASSERT_TRUE(eval_neighbours(neighbors_ref_32bits, + neighbors_snmg_ann_32bits, + distances_ref, + distances_snmg_ann, + ps.num_queries, + ps.k, + 0.001, + min_recall)); + std::fill(neighbors_snmg_ann_32bits.begin(), neighbors_snmg_ann_32bits.end(), 0); + std::fill(distances_snmg_ann.begin(), distances_snmg_ann.end(), 0); + } + + if (ps.algo == algo_t::IVF_FLAT && ps.d_mode == d_mode_t::ROUND_ROBIN) { + ASSERT_TRUE(ps.num_queries <= 4); + + mg::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.adaptive_centers = ps.adaptive_centers; + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + index_params.mode = REPLICATED; + + mg::search_params search_params; + search_params.n_probes = ps.nprobe; + search_params.search_mode = ROUND_ROBIN; + + auto index_dataset = raft::make_host_matrix_view( + h_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto small_batch_query = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + + auto index = cuvs::neighbors::mg::build(handle_, index_params, index_dataset); + cuvs::neighbors::mg::extend(handle_, index, index_dataset, std::nullopt); + + int n_parallel_searches = 16; + std::vector 
searches_correctness(n_parallel_searches); + std::vector load_balancer_neighbors_snmg_ann(n_parallel_searches * ps.num_queries * + ps.k); + std::vector load_balancer_distances_snmg_ann(n_parallel_searches * ps.num_queries * + ps.k); +#pragma omp parallel for + for (uint64_t search_idx = 0; search_idx < searches_correctness.size(); search_idx++) { + uint64_t offset = search_idx * ps.num_queries * ps.k; + auto small_batch_neighbors = raft::make_host_matrix_view( + load_balancer_neighbors_snmg_ann.data() + offset, ps.num_queries, ps.k); + auto small_batch_distances = raft::make_host_matrix_view( + load_balancer_distances_snmg_ann.data() + offset, ps.num_queries, ps.k); + cuvs::neighbors::mg::search(handle_, + index, + search_params, + small_batch_query, + small_batch_neighbors, + small_batch_distances, + n_rows_per_search_batch); + + std::vector small_batch_neighbors_vec( + small_batch_neighbors.data_handle(), + small_batch_neighbors.data_handle() + small_batch_neighbors.size()); + std::vector small_batch_distances_vec( + small_batch_distances.data_handle(), + small_batch_distances.data_handle() + small_batch_distances.size()); + searches_correctness[search_idx] = eval_neighbours(neighbors_ref, + small_batch_neighbors_vec, + distances_ref, + small_batch_distances_vec, + ps.num_queries, + ps.k, + 0.001, + 0.9); + } + ASSERT_TRUE(std::all_of(searches_correctness.begin(), + searches_correctness.end(), + [](char val) { return val != 0; })); + } + + if (ps.algo == algo_t::IVF_PQ && ps.d_mode == d_mode_t::ROUND_ROBIN) { + ASSERT_TRUE(ps.num_queries <= 4); + + mg::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + index_params.mode = REPLICATED; + + mg::search_params search_params; + search_params.n_probes = ps.nprobe; + search_params.search_mode = ROUND_ROBIN; + + auto index_dataset = 
raft::make_host_matrix_view( + h_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto small_batch_query = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + + auto index = cuvs::neighbors::mg::build(handle_, index_params, index_dataset); + cuvs::neighbors::mg::extend(handle_, index, index_dataset, std::nullopt); + + int n_parallel_searches = 16; + std::vector searches_correctness(n_parallel_searches); + std::vector load_balancer_neighbors_snmg_ann(n_parallel_searches * ps.num_queries * + ps.k); + std::vector load_balancer_distances_snmg_ann(n_parallel_searches * ps.num_queries * + ps.k); +#pragma omp parallel for + for (uint64_t search_idx = 0; search_idx < searches_correctness.size(); search_idx++) { + uint64_t offset = search_idx * ps.num_queries * ps.k; + auto small_batch_neighbors = raft::make_host_matrix_view( + load_balancer_neighbors_snmg_ann.data() + offset, ps.num_queries, ps.k); + auto small_batch_distances = raft::make_host_matrix_view( + load_balancer_distances_snmg_ann.data() + offset, ps.num_queries, ps.k); + cuvs::neighbors::mg::search(handle_, + index, + search_params, + small_batch_query, + small_batch_neighbors, + small_batch_distances, + n_rows_per_search_batch); + + std::vector small_batch_neighbors_vec( + small_batch_neighbors.data_handle(), + small_batch_neighbors.data_handle() + small_batch_neighbors.size()); + std::vector small_batch_distances_vec( + small_batch_distances.data_handle(), + small_batch_distances.data_handle() + small_batch_distances.size()); + searches_correctness[search_idx] = eval_neighbours(neighbors_ref, + small_batch_neighbors_vec, + distances_ref, + small_batch_distances_vec, + ps.num_queries, + ps.k, + 0.001, + 0.9); + } + ASSERT_TRUE(std::all_of(searches_correctness.begin(), + searches_correctness.end(), + [](char val) { return val != 0; })); + } + + if (ps.algo == algo_t::CAGRA && ps.d_mode == d_mode_t::ROUND_ROBIN) { + ASSERT_TRUE(ps.num_queries <= 4); + + mg::index_params index_params; + 
index_params.graph_build_params = cagra::graph_build_params::ivf_pq_params( + raft::matrix_extent(ps.num_db_vecs, ps.dim)); + index_params.mode = REPLICATED; + + mg::search_params search_params; + search_params.search_mode = ROUND_ROBIN; + + auto index_dataset = raft::make_host_matrix_view( + h_index_dataset.data(), ps.num_db_vecs, ps.dim); + auto small_batch_query = raft::make_host_matrix_view( + h_queries.data(), ps.num_queries, ps.dim); + + auto index = cuvs::neighbors::mg::build(handle_, index_params, index_dataset); + + int n_parallel_searches = 16; + std::vector searches_correctness(n_parallel_searches); + std::vector load_balancer_neighbors_snmg_ann(n_parallel_searches * ps.num_queries * + ps.k); + std::vector load_balancer_distances_snmg_ann(n_parallel_searches * ps.num_queries * + ps.k); +#pragma omp parallel for + for (uint64_t search_idx = 0; search_idx < searches_correctness.size(); search_idx++) { + uint64_t offset = search_idx * ps.num_queries * ps.k; + auto small_batch_neighbors = raft::make_host_matrix_view( + load_balancer_neighbors_snmg_ann.data() + offset, ps.num_queries, ps.k); + auto small_batch_distances = raft::make_host_matrix_view( + load_balancer_distances_snmg_ann.data() + offset, ps.num_queries, ps.k); + cuvs::neighbors::mg::search(handle_, + index, + search_params, + small_batch_query, + small_batch_neighbors, + small_batch_distances, + n_rows_per_search_batch); + + std::vector small_batch_neighbors_vec( + small_batch_neighbors.data_handle(), + small_batch_neighbors.data_handle() + small_batch_neighbors.size()); + std::vector small_batch_distances_vec( + small_batch_distances.data_handle(), + small_batch_distances.data_handle() + small_batch_distances.size()); + searches_correctness[search_idx] = eval_neighbours(neighbors_ref_32bits, + small_batch_neighbors_vec, + distances_ref, + small_batch_distances_vec, + ps.num_queries, + ps.k, + 0.001, + 0.9); + } + ASSERT_TRUE(std::all_of(searches_correctness.begin(), + 
searches_correctness.end(), + [](char val) { return val != 0; })); + } + } + + void SetUp() override + { + d_index_dataset.resize(ps.num_db_vecs * ps.dim, stream_); + d_queries.resize(ps.num_queries * ps.dim, stream_); + h_index_dataset.resize(ps.num_db_vecs * ps.dim); + h_queries.resize(ps.num_queries * ps.dim); + + raft::random::RngState r(1234ULL); + if constexpr (std::is_same{}) { + raft::random::uniform( + handle_, r, d_index_dataset.data(), d_index_dataset.size(), DataT(0.1), DataT(2.0)); + raft::random::uniform(handle_, r, d_queries.data(), d_queries.size(), DataT(0.1), DataT(2.0)); + } else { + raft::random::uniformInt( + handle_, r, d_index_dataset.data(), d_index_dataset.size(), DataT(1), DataT(20)); + raft::random::uniformInt(handle_, r, d_queries.data(), d_queries.size(), DataT(1), DataT(20)); + } + + raft::copy(h_index_dataset.data(), + d_index_dataset.data(), + d_index_dataset.size(), + resource::get_cuda_stream(handle_)); + raft::copy( + h_queries.data(), d_queries.data(), d_queries.size(), resource::get_cuda_stream(handle_)); + resource::sync_stream(handle_); + } + + void TearDown() override {} + + private: + raft::device_resources handle_; + rmm::cuda_stream_view stream_; + raft::comms::nccl_clique clique_; + AnnMGInputs ps; + std::vector h_index_dataset; + std::vector h_queries; + rmm::device_uvector d_index_dataset; + rmm::device_uvector d_queries; +}; + +const std::vector inputs = { + {7000, + 10000, + 8, + 16, + d_mode_t::REPLICATED, + m_mode_t::UNDEFINED, + algo_t::IVF_FLAT, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {7000, + 10000, + 8, + 16, + d_mode_t::REPLICATED, + m_mode_t::UNDEFINED, + algo_t::IVF_PQ, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + + /* + {7000, + 10000, + 8, + 16, + d_mode_t::REPLICATED, + m_mode_t::UNDEFINED, + algo_t::CAGRA, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + */ + + /* + {7000, + 10000, + 8, + 16, + d_mode_t::SHARDED, + 
m_mode_t::MERGE_ON_ROOT_RANK, + algo_t::IVF_FLAT, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {7000, + 10000, + 8, + 16, + d_mode_t::SHARDED, + m_mode_t::MERGE_ON_ROOT_RANK, + algo_t::IVF_PQ, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {7000, + 10000, + 8, + 16, + d_mode_t::SHARDED, + m_mode_t::MERGE_ON_ROOT_RANK, + algo_t::CAGRA, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + + {7000, + 10000, + 8, + 16, + d_mode_t::SHARDED, + m_mode_t::TREE_MERGE, + algo_t::IVF_FLAT, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {7000, + 10000, + 8, + 16, + d_mode_t::SHARDED, + m_mode_t::TREE_MERGE, + algo_t::IVF_PQ, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {7000, + 10000, + 8, + 16, + d_mode_t::SHARDED, + m_mode_t::TREE_MERGE, + algo_t::CAGRA, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + */ + + {7000, + 10000, + 8, + 16, + d_mode_t::LOCAL_THEN_DISTRIBUTED, + m_mode_t::UNDEFINED, + algo_t::IVF_FLAT, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {7000, + 10000, + 8, + 16, + d_mode_t::LOCAL_THEN_DISTRIBUTED, + m_mode_t::UNDEFINED, + algo_t::IVF_PQ, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + + /* + {7000, + 10000, + 8, + 16, + d_mode_t::LOCAL_THEN_DISTRIBUTED, + m_mode_t::UNDEFINED, + algo_t::CAGRA, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + */ + + {3, + 10000, + 8, + 16, + d_mode_t::ROUND_ROBIN, + m_mode_t::UNDEFINED, + algo_t::IVF_FLAT, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + {3, + 10000, + 8, + 16, + d_mode_t::ROUND_ROBIN, + m_mode_t::UNDEFINED, + algo_t::IVF_PQ, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + + /* + {3, + 10000, + 8, + 16, + d_mode_t::ROUND_ROBIN, + m_mode_t::UNDEFINED, + algo_t::CAGRA, + 40, + 1024, + cuvs::distance::DistanceType::L2Expanded, + true}, + */ +}; +} // namespace 
cuvs::neighbors::mg diff --git a/cpp/test/neighbors/mg/test_float.cu b/cpp/test/neighbors/mg/test_float.cu new file mode 100644 index 000000000..ef9c9a043 --- /dev/null +++ b/cpp/test/neighbors/mg/test_float.cu @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../mg.cuh" + +namespace cuvs::neighbors::mg { + +typedef AnnMGTest AnnMGTestF_float; +TEST_P(AnnMGTestF_float, AnnMG) { this->testAnnMG(); } + +INSTANTIATE_TEST_CASE_P(AnnMGTest, AnnMGTestF_float, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::mg diff --git a/dependencies.yaml b/dependencies.yaml index c18f53305..2b19b987f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -159,6 +159,7 @@ dependencies: packages: - c-compiler - cxx-compiler + - nccl>=2.19 specific: - output_types: conda matrices: diff --git a/docs/source/build.rst b/docs/source/build.rst index e0659ec65..a6b8ccd13 100644 --- a/docs/source/build.rst +++ b/docs/source/build.rst @@ -123,6 +123,16 @@ Once installed, the shared libraries, headers (and any dependencies downloaded a ./build.sh libcuvs --uninstall +Multi-GPU features +^^^^^^^^^^^^^^^^^^ + +To disable the multi-gpu features run : + +.. 
code-block:: bash + + ./build.sh libcuvs --no-mg + + Building the Googletests ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/c_api/neighbors.rst b/docs/source/c_api/neighbors.rst index dc55a74dc..9c3fce672 100644 --- a/docs/source/c_api/neighbors.rst +++ b/docs/source/c_api/neighbors.rst @@ -13,3 +13,4 @@ Nearest Neighbors neighbors_ivf_flat_c.rst neighbors_ivf_pq_c.rst neighbors_cagra_c.rst + neighbors_hnsw_c.rst diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index e5a9fc977..d55d58eb0 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -11,7 +11,9 @@ Nearest Neighbors neighbors_bruteforce.rst neighbors_cagra.rst + neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst neighbors_nn_descent.rst neighbors_refine.rst + neighbors_mg.rst diff --git a/docs/source/cpp_api/neighbors_mg.rst b/docs/source/cpp_api/neighbors_mg.rst new file mode 100644 index 000000000..b68defec9 --- /dev/null +++ b/docs/source/cpp_api/neighbors_mg.rst @@ -0,0 +1,76 @@ +Distributed ANN +=============== + +The SNMG (single-node multi-GPUs) ANN API provides a set of functions to deploy ANN indexes on multiple GPUs. + +.. role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors::mg* + +Index build parameters +---------------------- + +.. doxygengroup:: mg_cpp_index_params + :project: cuvs + :members: + :content-only: + +Search parameters +---------------------- + +.. doxygengroup:: mg_cpp_search_params + :project: cuvs + :members: + :content-only: + +Index build +----------- + +.. doxygengroup:: mg_cpp_index_build + :project: cuvs + :members: + :content-only: + +Index extend +------------ + +.. doxygengroup:: mg_cpp_index_extend + :project: cuvs + :members: + :content-only: + +Index search +------------ + +.. doxygengroup:: mg_cpp_index_search + :project: cuvs + :members: + :content-only: + +Index serialize +--------------- + +.. 
doxygengroup:: mg_cpp_serialize + :project: cuvs + :members: + :content-only: + +Index deserialize +----------------- + +.. doxygengroup:: mg_cpp_deserialize + :project: cuvs + :members: + :content-only: + +Distribute pre-built local index +-------------------------------- + +.. doxygengroup:: mg_cpp_distribute + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst index cf812a62e..2ebf584c3 100644 --- a/docs/source/cuvs_bench/index.rst +++ b/docs/source/cuvs_bench/index.rst @@ -630,7 +630,7 @@ The table below contains all algorithms supported by cuVS. Each unique algorithm - `cuvs_brute_force`, `cuvs_cagra`, `cuvs_ivf_flat`, `cuvs_ivf_pq`, `cuvs_cagra_hnswlib` Adding a new index algorithm -============================= +============================ Implementation and configuration -------------------------------- diff --git a/docs/source/python_api/neighbors.rst b/docs/source/python_api/neighbors.rst index 022c50de3..cd4f2609c 100644 --- a/docs/source/python_api/neighbors.rst +++ b/docs/source/python_api/neighbors.rst @@ -11,5 +11,6 @@ Nearest Neighbors neighbors_brute_force.rst neighbors_cagra.rst + neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst diff --git a/docs/source/python_api/neighbors_hnsw.rst b/docs/source/python_api/neighbors_hnsw.rst new file mode 100644 index 000000000..9922805b3 --- /dev/null +++ b/docs/source/python_api/neighbors_hnsw.rst @@ -0,0 +1,30 @@ +HNSW +==== + +This is a wrapper for hnswlib, to load a CAGRA index as an immutable HNSW index. The loaded HNSW index is only compatible in cuVS, and can be searched using wrapper functions. + +.. role:: py(code) + :language: python + :class: highlight + +Index search parameters +####################### + +.. autoclass:: cuvs.neighbors.hnsw.SearchParams + :members: + +Index +##### + +.. autoclass:: cuvs.neighbors.hnsw.Index + :members: + +Index Conversion +################ + +.. 
autofunction:: cuvs.neighbors.hnsw.from_cagra + +Index search +############ + +.. autofunction:: cuvs.neighbors.hnsw.search diff --git a/python/cuvs/cuvs/neighbors/CMakeLists.txt b/python/cuvs/cuvs/neighbors/CMakeLists.txt index 21c3db5da..f68bbea53 100644 --- a/python/cuvs/cuvs/neighbors/CMakeLists.txt +++ b/python/cuvs/cuvs/neighbors/CMakeLists.txt @@ -14,6 +14,7 @@ add_subdirectory(brute_force) add_subdirectory(cagra) +add_subdirectory(hnsw) add_subdirectory(ivf_flat) add_subdirectory(ivf_pq) add_subdirectory(filters) diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd index b23c2a4b3..bba5a91a8 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd @@ -17,6 +17,7 @@ from libc.stdint cimport ( int8_t, + int32_t, int64_t, uint8_t, uint32_t, @@ -100,6 +101,8 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: cuvsError_t cuvsCagraIndexDestroy(cuvsCagraIndex_t index) + cuvsError_t cuvsCagraIndexGetDims(cuvsCagraIndex_t index, int32_t* dim) + cuvsError_t cuvsCagraBuild(cuvsResources_t res, cuvsCagraIndexParams* params, DLManagedTensor* dataset, @@ -117,6 +120,20 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil: cuvsCagraIndex_t index, bool include_dataset) except + + cuvsError_t cuvsCagraSerializeToHnswlib(cuvsResources_t res, + const char * filename, + cuvsCagraIndex_t index) except + + cuvsError_t cuvsCagraDeserialize(cuvsResources_t res, const char * filename, cuvsCagraIndex_t index) except + + +cdef class Index: + """ + CAGRA index object. This object stores the trained CAGRA index state + which can be used to perform nearest neighbors searches. 
+ """ + + cdef cuvsCagraIndex_t index + cdef bool trained + cdef str active_index_type diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx index f940ab8bf..95209dbeb 100644 --- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx +++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx @@ -36,6 +36,7 @@ from pylibraft.neighbors.common import _check_input_array from libc.stdint cimport ( int8_t, + int32_t, int64_t, uint8_t, uint32_t, @@ -206,16 +207,9 @@ cdef class IndexParams: cdef class Index: - """ - CAGRA index object. This object stores the trained CAGRA index state - which can be used to perform nearest neighbors searches. - """ - - cdef cuvsCagraIndex_t index - cdef bool trained - def __cinit__(self): self.trained = False + self.active_index_type = None check_cuvs(cuvsCagraIndexCreate(&self.index)) def __dealloc__(self): @@ -226,6 +220,12 @@ cdef class Index: def trained(self): return self.trained + @property + def dim(self): + cdef int32_t dim + check_cuvs(cuvsCagraIndexGetDims(self.index, &dim)) + return dim + def __repr__(self): # todo(dgd): update repr as we expose data through C API attr_str = [] @@ -299,6 +299,7 @@ def build(IndexParams index_params, dataset, resources=None): idx.index )) idx.trained = True + idx.active_index_type = dataset_ai.dtype.name return idx diff --git a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt new file mode 100644 index 000000000..1f9c422ca --- /dev/null +++ b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt @@ -0,0 +1,24 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Set the list of Cython files to build +set(cython_sources hnsw.pyx) +set(linked_libraries cuvs::cuvs cuvs::c_api) + +# Build all of the Cython targets +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_hnsw_ +) diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.pxd b/python/cuvs/cuvs/neighbors/hnsw/__init__.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py new file mode 100644 index 000000000..5efcdf68b --- /dev/null +++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from .hnsw import Index, SearchParams, from_cagra, load, save, search + +__all__ = [ + "Index", + "SearchParams", + "load", + "save", + "search", + "from_cagra", +] diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd new file mode 100644 index 000000000..1cdc97406 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd @@ -0,0 +1,53 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: language_level=3 + +from libc.stdint cimport int32_t, uintptr_t + +from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t +from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor +from cuvs.distance_type cimport cuvsDistanceType + + +cdef extern from "cuvs/neighbors/hnsw.h" nogil: + ctypedef struct cuvsHnswSearchParams: + int32_t ef + int32_t numThreads + + ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + + ctypedef struct cuvsHnswIndex: + uintptr_t addr + DLDataType dtype + + ctypedef cuvsHnswIndex* cuvsHnswIndex_t + + cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index) + + cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index) + + cuvsError_t cuvsHnswSearch(cuvsResources_t res, + cuvsHnswSearchParams* params, + cuvsHnswIndex_t index, + DLManagedTensor* queries, + DLManagedTensor* neighbors, + DLManagedTensor* distances) except + + + cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + const char * filename, + int32_t dim, + cuvsDistanceType metric, + 
cuvsHnswIndex_t index) except + diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx new file mode 100644 index 000000000..018fcfef9 --- /dev/null +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -0,0 +1,380 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# cython: language_level=3 + +from libc.stdint cimport uint32_t +from libcpp cimport bool +from libcpp.string cimport string + +from cuvs.common.exceptions import check_cuvs +from cuvs.common.resources import auto_sync_resources + +from cuvs.common cimport cydlpack + +import numpy as np + +from cuvs.distance import DISTANCE_TYPES + +from cuvs.neighbors.cagra cimport cagra + +import os +import uuid + +from pylibraft.common import auto_convert_output +from pylibraft.common.cai_wrapper import wrap_array +from pylibraft.common.interruptible import cuda_interruptible +from pylibraft.neighbors.common import _check_input_array + + +cdef class SearchParams: + """ + HNSW search parameters + + Parameters + ---------- + ef: int, default = 200 + Maximum number of candidate list size used during search. + num_threads: int, default = 0 + Number of CPU threads used to increase search parallelism. + When set to 0, the number of threads is automatically determined + using OpenMP's `omp_get_max_threads()`. 
+ """ + + cdef cuvsHnswSearchParams params + + def __init__(self, *, + ef=200, + num_threads=0): + self.params.ef = ef + self.params.numThreads = num_threads + + def __repr__(self): + attr_str = [attr + "=" + str(getattr(self, attr)) + for attr in [ + "ef", "num_threads"]] + return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")" + + @property + def ef(self): + return self.params.ef + + @property + def num_threads(self): + return self.params.numThreads + + +cdef class Index: + """ + HNSW index object. This object stores the trained HNSW index state + which can be used to perform nearest neighbors searches. + """ + + cdef cuvsHnswIndex_t index + cdef bool trained + + def __cinit__(self): + self.trained = False + check_cuvs(cuvsHnswIndexCreate(&self.index)) + + def __dealloc__(self): + if self.index is not NULL: + check_cuvs(cuvsHnswIndexDestroy(self.index)) + + @property + def trained(self): + return self.trained + + def __repr__(self): + # todo(dgd): update repr as we expose data through C API + attr_str = [] + return "Index(type=HNSW, metric=L2" + (", ".join(attr_str)) + ")" + + +@auto_sync_resources +def save(filename, cagra.Index index, resources=None): + """ + Saves the CAGRA index to a file as an hnswlib index. + The saved index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original + hnswlib. + + Saving / loading the index is experimental. The serialization format is + subject to change. + + Parameters + ---------- + filename : string + Name of the file. + index : Index + Trained CAGRA index. + {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import cagra + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... 
dtype=cp.float32) + >>> # Build index + >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> # Serialize and deserialize the cagra index built + >>> hnsw.save("my_index.bin", index) + """ + cdef string c_filename = filename.encode('utf-8') + cdef cuvsResources_t res = resources.get_c_obj() + check_cuvs(cagra.cuvsCagraSerializeToHnswlib(res, + c_filename.c_str(), + index.index)) + + +@auto_sync_resources +def load(filename, dim, dtype, metric="sqeuclidean", resources=None): + """ + Loads base-layer-only hnswlib index from file, which was originally + saved as a built CAGRA index. The loaded index is immutable and can only + be searched by the hnswlib wrapper in cuVS, as the format is not + compatible with the original hnswlib. + + Saving / loading the index is experimental. The serialization format is + subject to change, therefore loading an index saved with a previous + version of cuVS is not guaranteed to work. + + Parameters + ---------- + filename : string + Name of the file. + dim : int + Dimensions of the training dataest + dtype : np.dtype of the saved index + Valid values for dtype: [np.float32, np.byte, np.ubyte] + metric : string denoting the metric type, default="sqeuclidean" + Valid values for metric: ["sqeuclidean", "inner_product"], where + - sqeuclidean is the euclidean distance without the square root + operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, + - inner_product distance is defined as + distance(a, b) = \\sum_i a_i * b_i. + {resources_docstring} + + Returns + ------- + index : HnswIndex + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import cagra + >>> from cuvs.neighbors import hnsw + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... 
dtype=cp.float32) + >>> # Build index + >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> # Serialize the CAGRA index to hnswlib base layer only index format + >>> hnsw.save("my_index.bin", index) + >>> index = hnsw.load("my_index.bin", n_features, np.float32, + ... "sqeuclidean") + """ + cdef Index idx = Index() + cdef cuvsResources_t res = resources.get_c_obj() + cdef string c_filename = filename.encode('utf-8') + cdef cydlpack.DLDataType dl_dtype + if dtype == np.float32: + dl_dtype.code = cydlpack.kDLFloat + dl_dtype.bits = 32 + dl_dtype.lanes = 1 + elif dtype == np.ubyte: + dl_dtype.code = cydlpack.kDLUInt + dl_dtype.bits = 8 + dl_dtype.lanes = 1 + elif dtype == np.byte: + dl_dtype.code = cydlpack.kDLInt + dl_dtype.bits = 8 + dl_dtype.lanes = 1 + else: + raise ValueError("Only float32 is supported for dtype") + + idx.index.dtype = dl_dtype + cdef cuvsDistanceType distance_type = DISTANCE_TYPES[metric] + + check_cuvs(cuvsHnswDeserialize( + res, + c_filename.c_str(), + dim, + distance_type, + idx.index + )) + idx.trained = True + return idx + + +@auto_sync_resources +def from_cagra(cagra.Index index, temporary_index_path=None, resources=None): + """ + Returns an hnsw base-layer-only index from a CAGRA index. + + NOTE: This method uses the filesystem to write the CAGRA index in + `/tmp/.bin` or the parameter `temporary_index_path` + if not None before reading it as an hnsw index, + then deleting the temporary file. The returned index is immutable + and can only be searched by the hnsw wrapper in cuVS, as the + format is not compatible with the original hnswlib library. + By `base_layer_only`, we mean that the hnsw index is created + without the additional layers that are used for the hierarchical + search in hnswlib. Instead, the base layer is used for the search. + + Saving / loading the index is experimental. The serialization format is + subject to change. + + Parameters + ---------- + index : Index + Trained CAGRA index. 
+ temporary_index_path : string, default = None + Path to save the temporary index file. If None, the temporary file + will be saved in `/tmp/.bin`. + {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import cagra + >>> from cuvs.neighbors import hnsw + >>> n_samples = 50000 + >>> n_features = 50 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> # Serialize the CAGRA index to hnswlib base layer only index format + >>> hnsw_index = hnsw.from_cagra(index) + """ + uuid_num = uuid.uuid4() + filename = temporary_index_path if temporary_index_path else \ + f"/tmp/{uuid_num}.bin" + save(filename, index, resources=resources) + hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type), + "sqeuclidean", resources=resources) + os.remove(filename) + return hnsw_index + + +@auto_sync_resources +@auto_convert_output +def search(SearchParams search_params, + Index index, + queries, + k, + neighbors=None, + distances=None, + resources=None): + """ + Find the k nearest neighbors for each query. + + Parameters + ---------- + search_params : SearchParams + index : Index + Trained CAGRA index. + queries : CUDA array interface compliant matrix shape (n_samples, dim) + Supported dtype [float, int] + k : int + The number of neighbors. + neighbors : Optional CUDA array interface compliant matrix shape + (n_queries, k), dtype uint64_t. If supplied, neighbor + indices will be written here in-place. (default None) + distances : Optional CUDA array interface compliant matrix shape + (n_queries, k) If supplied, the distances to the + neighbors will be written here in-place. 
(default None) + {resources_docstring} + + Examples + -------- + >>> import cupy as cp + >>> from cuvs.neighbors import cagra, hnsw + >>> n_samples = 50000 + >>> n_features = 50 + >>> n_queries = 1000 + >>> dataset = cp.random.random_sample((n_samples, n_features), + ... dtype=cp.float32) + >>> # Build index + >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> # Search using the built index + >>> queries = cp.random.random_sample((n_queries, n_features), + ... dtype=cp.float32) + >>> k = 10 + >>> search_params = hnsw.SearchParams( + ... ef=200, + ... num_threads=0 + ... ) + >>> # Convert CAGRA index to HNSW + >>> hnsw_index = hnsw.from_cagra(index) + >>> # Using a pooling allocator reduces overhead of temporary array + >>> # creation during search. This is useful if multiple searches + >>> # are performed with same query size. + >>> distances, neighbors = hnsw.search(search_params, index, queries, + ... k) + >>> neighbors = cp.asarray(neighbors) + >>> distances = cp.asarray(distances) + """ + if not index.trained: + raise ValueError("Index needs to be built before calling search.") + + # todo(dgd): we can make the check of dtype a parameter of wrap_array + # in RAFT to make this a single call + queries_ai = wrap_array(queries) + _check_input_array(queries_ai, [np.dtype('float32'), + np.dtype('uint8'), + np.dtype('int8')]) + + cdef uint32_t n_queries = queries_ai.shape[0] + + if neighbors is None: + neighbors = np.empty((n_queries, k), dtype='uint64') + + neighbors_ai = wrap_array(neighbors) + _check_input_array(neighbors_ai, [np.dtype('uint64')], + exp_rows=n_queries, exp_cols=k) + + if distances is None: + distances = np.empty((n_queries, k), dtype='float32') + + distances_ai = wrap_array(distances) + _check_input_array(distances_ai, [np.dtype('float32')], + exp_rows=n_queries, exp_cols=k) + + cdef cuvsHnswSearchParams* params = &search_params.params + cdef cydlpack.DLManagedTensor* queries_dlpack = \ + cydlpack.dlpack_c(queries_ai) + cdef 
cydlpack.DLManagedTensor* neighbors_dlpack = \ + cydlpack.dlpack_c(neighbors_ai) + cdef cydlpack.DLManagedTensor* distances_dlpack = \ + cydlpack.dlpack_c(distances_ai) + cdef cuvsResources_t res = resources.get_c_obj() + + with cuda_interruptible(): + check_cuvs(cuvsHnswSearch( + res, + params, + index.index, + queries_dlpack, + neighbors_dlpack, + distances_dlpack + )) + + return (distances, neighbors) diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py new file mode 100644 index 000000000..0ae97266b --- /dev/null +++ b/python/cuvs/cuvs/test/test_hnsw.py @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest +from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import normalize + +from cuvs.neighbors import cagra, hnsw +from cuvs.test.ann_utils import calc_recall, generate_data + + +def run_hnsw_build_search_test( + n_rows=1000, + n_cols=10, + n_queries=100, + k=10, + dtype=np.float32, + metric="sqeuclidean", + build_algo="ivf_pq", + intermediate_graph_degree=128, + graph_degree=64, + search_params={}, +): + dataset = generate_data((n_rows, n_cols), dtype) + if metric == "inner_product": + dataset = normalize(dataset, norm="l2", axis=1) + if dtype in [np.int8, np.uint8]: + pytest.skip( + "inner_product metric is not supported for int8/uint8 data" + ) + if build_algo == "nn_descent": + pytest.skip("inner_product metric is not supported for nn_descent") + + build_params = cagra.IndexParams( + metric=metric, + intermediate_graph_degree=intermediate_graph_degree, + graph_degree=graph_degree, + build_algo=build_algo, + ) + + index = cagra.build(build_params, dataset) + + assert index.trained + + hnsw_index = hnsw.from_cagra(index) + + queries = generate_data((n_queries, n_cols), dtype) + + search_params = hnsw.SearchParams(**search_params) + + out_dist, out_idx = hnsw.search(search_params, hnsw_index, queries, k) + + # Calculate reference values with sklearn + skl_metric = { + "sqeuclidean": "sqeuclidean", + "inner_product": "cosine", + "euclidean": "euclidean", + }[metric] + nn_skl = NearestNeighbors( + n_neighbors=k, algorithm="brute", metric=skl_metric + ) + nn_skl.fit(dataset) + skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True) + + recall = calc_recall(out_idx, skl_idx) + assert recall > 0.95 + + +@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8]) +@pytest.mark.parametrize("k", [10, 20]) +@pytest.mark.parametrize("ef", [30, 40]) +@pytest.mark.parametrize("num_threads", [2, 4]) +@pytest.mark.parametrize("metric", ["sqeuclidean"]) +@pytest.mark.parametrize("build_algo", 
["ivf_pq", "nn_descent"]) +def test_hnsw(dtype, k, ef, num_threads, metric, build_algo): + # Note that inner_product tests use normalized input which we cannot + # represent in int8, therefore we test only sqeuclidean metric here. + run_hnsw_build_search_test( + dtype=dtype, + k=k, + metric=metric, + build_algo=build_algo, + search_params={"ef": ef, "num_threads": num_threads}, + ) diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py new file mode 100644 index 000000000..2b4213016 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import argparse +import os +import sys + +import cupy as cp +import numpy as np +import rmm +from pylibraft.common import DeviceResources +from rmm.allocators.cupy import rmm_cupy_allocator + +from cuvs.neighbors.brute_force import knn + +from .utils import memmap_bin_file, suffix_from_dtype, write_bin + + +def generate_random_queries(n_queries, n_features, dtype=np.float32): + print("Generating random queries") + if np.issubdtype(dtype, np.integer): + queries = cp.random.randint( + 0, 255, size=(n_queries, n_features), dtype=dtype + ) + else: + queries = cp.random.uniform(size=(n_queries, n_features)).astype(dtype) + return queries + + +def choose_random_queries(dataset, n_queries): + print("Choosing random vector from dataset as query vectors") + query_idx = np.random.choice( + dataset.shape[0], size=(n_queries,), replace=False + ) + return dataset[query_idx, :] + + +def calc_truth(dataset, queries, k, metric="sqeuclidean"): + handle = DeviceResources() + n_samples = dataset.shape[0] + n = 500000 # batch size for processing neighbors + i = 0 + indices = None + distances = None + queries = cp.asarray(queries, dtype=cp.float32) + + while i < n_samples: + print("Step {0}/{1}:".format(i // n, n_samples // n)) + n_batch = n if i + n <= n_samples else n_samples - i + + X = cp.asarray(dataset[i : i + n_batch, :], cp.float32) + + D, Ind = knn(X, queries, k, metric=metric, handle=handle) + handle.sync() + + D, Ind = cp.asarray(D), cp.asarray(Ind) + Ind += i # shift neighbor index by offset i + + if distances is None: + distances = D + indices = Ind + else: + distances = cp.concatenate([distances, D], axis=1) + indices = cp.concatenate([indices, Ind], axis=1) + idx = cp.argsort(distances, axis=1)[:, :k] + distances = cp.take_along_axis(distances, idx, axis=1) + indices = cp.take_along_axis(indices, idx, axis=1) + + i += n_batch + + return distances, indices + + +def main(): + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), initial_pool_size=2**30 + ) + 
rmm.mr.set_current_device_resource(pool) + cp.cuda.set_allocator(rmm_cupy_allocator) + + parser = argparse.ArgumentParser( + prog="generate_groundtruth", + description="Generate true neighbors using exact NN search. " + "The input and output files are in big-ann-benchmark's binary format.", + epilog="""Example usage + # With existing query file + python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\ +fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin + + # With randomly generated queries + python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\ +fbin --output=groundtruth_dir --queries=random --n_queries=10000 + + # Using only a subset of the dataset. Define queries by randomly + # selecting vectors from the (subset of the) dataset. + python -m cuvs_bench.generate_groundtruth --dataset /dataset/base.\ +fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \ +--queries=random-choice --n_queries=10000 + """, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument("dataset", type=str, help="input dataset file name") + parser.add_argument( + "--queries", + type=str, + default="random", + help="Queries file name, or one of 'random-choice' or 'random' " + "(default). 'random-choice': select n_queries vectors from the input " + "dataset. 'random': generate n_queries as uniform random numbers.", + ) + parser.add_argument( + "--output", + type=str, + default="", + help="output directory name (default current dir)", + ) + + parser.add_argument( + "--n_queries", + type=int, + default=10000, + help="Number of quries to generate (if no query file is given). " + "Default: 10000.", + ) + + parser.add_argument( + "-N", + "--rows", + default=None, + type=int, + help="use only first N rows from dataset, by default the whole " + "dataset is used", + ) + parser.add_argument( + "-D", + "--cols", + default=None, + type=int, + help="number of features (dataset columns). 
" + "Default: read from dataset file.", + ) + parser.add_argument( + "--dtype", + type=str, + help="Dataset dtype. When not specified, then derived from extension." + " Supported types: 'float32', 'float16', 'uint8', 'int8'", + ) + + parser.add_argument( + "-k", + type=int, + default=100, + help="Number of neighbors (per query) to calculate", + ) + parser.add_argument( + "--metric", + type=str, + default="sqeuclidean", + help="Metric to use while calculating distances. Valid metrics are " + "those that are accepted by cuvs.neighbors.brute_force.knn. Most" + " commonly used with cuVS are 'sqeuclidean' and 'inner_product'", + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + args = parser.parse_args() + + if args.rows is not None: + print("Reading subset of the data, nrows=", args.rows) + else: + print("Reading whole dataset") + + # Load input data + dataset = memmap_bin_file( + args.dataset, args.dtype, shape=(args.rows, args.cols) + ) + n_features = dataset.shape[1] + dtype = dataset.dtype + + print( + "Dataset size {:6.1f} GB, shape {}, dtype {}".format( + dataset.size * dataset.dtype.itemsize / 1e9, + dataset.shape, + np.dtype(dtype), + ) + ) + + if len(args.output) > 0: + os.makedirs(args.output, exist_ok=True) + + if args.queries == "random" or args.queries == "random-choice": + if args.n_queries is None: + raise RuntimeError( + "n_queries must be given to generate random queries" + ) + if args.queries == "random": + queries = generate_random_queries( + args.n_queries, n_features, dtype + ) + elif args.queries == "random-choice": + queries = choose_random_queries(dataset, args.n_queries) + + queries_filename = os.path.join( + args.output, "queries" + suffix_from_dtype(dtype) + ) + print("Writing queries file", queries_filename) + write_bin(queries_filename, queries) + else: + print("Reading queries from file", args.queries) + queries = memmap_bin_file(args.queries, dtype) + + print("Calculating true nearest neighbors") + distances, indices = 
calc_truth(dataset, queries, args.k, args.metric) + + write_bin( + os.path.join(args.output, "groundtruth.neighbors.ibin"), + indices.astype(np.uint32), + ) + write_bin( + os.path.join(args.output, "groundtruth.distances.fbin"), + distances.astype(np.float32), + ) + + +if __name__ == "__main__": + main() diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py new file mode 100644 index 000000000..a969b3d89 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

"""I/O helpers for cuvs_bench groundtruth generation: dtype/extension
mapping and memory-mapped access to .fbin/.ibin style binary files
(two uint32 header values -- rows, cols -- followed by raw row-major data)."""

import os

import numpy as np


def dtype_from_filename(filename):
    """Infer the numpy dtype of a binary dataset file from its extension.

    Supported: .fbin (float32), .hbin (float16), .ibin (int32),
    .u8bin (uint8), .i8bin (int8).

    Raises
    ------
    RuntimeError
        If the extension is not one of the supported ones.
    """
    ext = os.path.splitext(filename)[1]
    if ext == ".fbin":
        return np.float32
    if ext == ".hbin":
        return np.float16
    elif ext == ".ibin":
        return np.int32
    elif ext == ".u8bin":
        return np.ubyte
    elif ext == ".i8bin":
        return np.byte
    else:
        # Fix: original message lacked a separating space before the ext.
        raise RuntimeError("Not supported file extension " + ext)


def suffix_from_dtype(dtype):
    """Return the canonical file extension for ``dtype`` (inverse of
    ``dtype_from_filename``).

    Raises
    ------
    RuntimeError
        If the dtype has no associated extension.
    """
    if dtype == np.float32:
        return ".fbin"
    if dtype == np.float16:
        return ".hbin"
    elif dtype == np.int32:
        return ".ibin"
    elif dtype == np.ubyte:
        return ".u8bin"
    elif dtype == np.byte:
        return ".i8bin"
    else:
        # Fix: the original concatenated a str with a dtype object, which
        # raised TypeError instead of the intended RuntimeError.
        raise RuntimeError(f"Not supported dtype {dtype}")


def memmap_bin_file(
    bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32
):
    """Memory-map a binary dataset file with a (rows, cols) header.

    Parameters
    ----------
    bin_file : str or None
        Path to the file. ``None`` is passed through (returns ``None``).
    dtype : numpy dtype or None
        Element type; derived from the file extension when ``None``.
    shape : tuple or None
        Desired shape. In read mode, ``None`` entries fall back to the
        header values; in write mode a full shape is required.
    mode : str
        ``"r"``/``"r+"`` to map an existing file, ``"w+"`` to create one.
    size_dtype : numpy dtype
        Integer type of the two header fields (default uint32).

    Returns
    -------
    numpy.memmap
        View of the payload (header excluded), or ``None``.
    """
    extent_itemsize = np.dtype(size_dtype).itemsize
    # The payload starts right after the two header integers.
    offset = int(extent_itemsize) * 2
    if bin_file is None:
        return None
    if dtype is None:
        dtype = dtype_from_filename(bin_file)

    if mode[0] == "r":
        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        if shape is None:
            # Plain ints avoid uint32 wrap-around in later size arithmetic.
            shape = (int(a[0]), int(a[1]))
        else:
            # ``None`` entries in the requested shape fall back to header.
            shape = tuple(
                [
                    int(aval) if sval is None else sval
                    for aval, sval in zip(a, shape)
                ]
            )

        return np.memmap(
            bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape
        )
    elif mode[0] == "w":
        if shape is None:
            raise ValueError("Need to specify shape to map file in write mode")

        print("creating file", bin_file)
        dirname = os.path.dirname(bin_file)
        if len(dirname) > 0:
            os.makedirs(dirname, exist_ok=True)
        # Write the (rows, cols) header first, then re-map the payload at
        # the proper offset in read-write mode.
        a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,))
        a[0] = shape[0]
        a[1] = shape[1]
        a.flush()
        del a
        fp = np.memmap(
            bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape
        )
        return fp
    else:
        # Fix: the original fell through and silently returned None for
        # any mode not starting with "r" or "w" (e.g. "c").
        raise ValueError(f"Unsupported mode {mode!r}")


def write_bin(fname, data):
    """Write ``data`` to ``fname``: uint32 (rows, cols) header + raw data."""
    print("writing", fname, data.shape, data.dtype, "...")
    with open(fname, "wb") as f:
        np.asarray(data.shape, dtype=np.uint32).tofile(f)
        data.tofile(f)
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Download an ann-benchmarks dataset and convert it into the flat binary
files (.fbin / .ibin) consumed by cuvs_bench."""

import argparse
import os
import subprocess
import sys
from urllib.request import urlretrieve

# Files produced per dataset by the hdf5 -> fbin conversion.
BIN_FILES = (
    "base.fbin",
    "query.fbin",
    "groundtruth.neighbors.ibin",
    "groundtruth.distances.fbin",
)


def get_dataset_path(name, ann_bench_data_path):
    """Return the target .hdf5 path for ``name``, creating the dataset
    directory (including missing parents) when needed."""
    # Fix: makedirs instead of mkdir -- works for nested paths and is not
    # racy between the exists() check and the creation.
    os.makedirs(ann_bench_data_path, exist_ok=True)
    return os.path.join(ann_bench_data_path, f"{name}.hdf5")


def download_dataset(url, path):
    """Fetch ``url`` into ``path`` unless the file already exists."""
    if not os.path.exists(path):
        print(f"downloading {url} -> {path}...")
        urlretrieve(url, path)


def convert_hdf5_to_fbin(path, normalize):
    """Run the bundled hdf5_to_fbin.py helper on ``path``; pass ``-n``
    when normalization is requested for an angular dataset."""
    scripts_path = os.path.dirname(os.path.realpath(__file__))
    ann_bench_scripts_path = os.path.join(scripts_path, "hdf5_to_fbin.py")
    print(f"calling script {ann_bench_scripts_path}")
    # Fix: use the running interpreter instead of whatever "python" is on
    # PATH (may be absent or a different environment).
    cmd = [sys.executable, ann_bench_scripts_path]
    # Normalization only applies to angular (cosine) datasets.
    if normalize and "angular" in path:
        cmd.append("-n")
    cmd.append(path)
    subprocess.run(cmd, check=True)


def move(name, ann_bench_data_path):
    """Move the generated ``<name>.<part>`` files into a per-dataset
    subdirectory, renaming 'angular' to 'inner' (inner product)."""
    new_name = name.replace("angular", "inner") if "angular" in name else name
    new_path = os.path.join(ann_bench_data_path, new_name)
    os.makedirs(new_path, exist_ok=True)
    for bin_name in BIN_FILES:
        os.rename(
            os.path.join(ann_bench_data_path, f"{name}.{bin_name}"),
            os.path.join(new_path, bin_name),
        )


def download(name, normalize, ann_bench_data_path):
    """Download dataset ``name``, convert it and lay out the bin files."""
    path = get_dataset_path(name, ann_bench_data_path)
    url = f"http://ann-benchmarks.com/{name}.hdf5"
    try:
        download_dataset(url, path)
        convert_hdf5_to_fbin(path, normalize)
        move(name, ann_bench_data_path)
    except Exception:
        print(f"Cannot download {url}")
        raise


def main():
    """CLI entry point: parse arguments and run the download pipeline."""
    call_path = os.getcwd()
    if "RAPIDS_DATASET_ROOT_DIR" in os.environ:
        default_dataset_path = os.getenv("RAPIDS_DATASET_ROOT_DIR")
    else:
        default_dataset_path = os.path.join(call_path, "datasets/")
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--dataset", help="dataset to download", default="glove-100-angular"
    )
    parser.add_argument(
        "--dataset-path",
        help="path to download dataset",
        default=default_dataset_path,
    )
    parser.add_argument(
        "--normalize",
        help="normalize cosine distance to inner product",
        action="store_true",
    )

    # Show usage instead of silently running with all defaults.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    download(args.dataset, args.normalize, args.dataset_path)


if __name__ == "__main__":
    main()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import absolute_import, division, print_function + +import sys + +import numpy as np + + +def read_fbin(fname): + shape = np.fromfile(fname, dtype=np.uint32, count=2) + if float(shape[0]) * shape[1] * 4 > 2_000_000_000: + data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape( + shape + ) + else: + data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape) + return data + + +def write_bin(fname, data): + with open(fname, "wb") as f: + np.asarray(data.shape, dtype=np.uint32).tofile(f) + data.tofile(f) + + +if len(sys.argv) != 3: + print( + "usage: %s input.fbin output.f16bin" % (sys.argv[0]), + file=sys.stderr, + ) + sys.exit(-1) + +data = read_fbin(sys.argv[1]).astype(np.float16) +write_bin(sys.argv[2], data) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py new file mode 100644 index 000000000..317051aa2 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py @@ -0,0 +1,90 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.


# Convert an ann-benchmarks .hdf5 dataset into the flat binary files used
# by cuvs_bench: base/query vectors (.fbin) and groundtruth neighbor
# indices / distances (.ibin / .fbin).
# NOTE: requires h5py, which is not part of the standard library.

import sys

import h5py
import numpy as np


def normalize(x):
    # L2-normalize each row of x; (x.T / norm).T broadcasts the per-row
    # norms across all columns.
    norm = np.linalg.norm(x, axis=1)
    return (x.T / norm).T


def write_bin(fname, data):
    # Binary layout: two uint32 values (rows, cols) followed by the raw
    # array payload in row-major order.
    with open(fname, "wb") as f:
        np.asarray(data.shape, dtype=np.uint32).tofile(f)
        data.tofile(f)


if __name__ == "__main__":
    # Accept exactly one positional argument, optionally preceded by -n.
    if len(sys.argv) != 2 and len(sys.argv) != 3:
        print(
            "usage: %s [-n] .hdf5\n" % (sys.argv[0]),
            " -n: normalize base/query set\n",
            "outputs: .base.fbin\n",
            " .query.fbin\n",
            " .groundtruth.neighbors.ibin\n",
            " .groundtruth.distances.fbin",
            file=sys.stderr,
        )
        sys.exit(-1)

    # "-n" requests L2 normalization, which turns angular (cosine)
    # distance into inner product on unit vectors.
    need_normalize = False
    if len(sys.argv) == 3:
        assert sys.argv[1] == "-n"
        need_normalize = True
    fname_prefix = sys.argv[-1]
    assert fname_prefix.endswith(".hdf5")
    # Output files reuse the input path with the ".hdf5" suffix stripped.
    fname_prefix = fname_prefix[:-5]

    hdf5 = h5py.File(sys.argv[-1], "r")
    # Only the two distance types published by ann-benchmarks are
    # supported; the dtype asserts below pin the expected file schema.
    assert (
        hdf5.attrs["distance"] == "angular"
        or hdf5.attrs["distance"] == "euclidean"
    )
    assert hdf5["train"].dtype == np.float32
    assert hdf5["test"].dtype == np.float32
    assert hdf5["neighbors"].dtype == np.int32
    assert hdf5["distances"].dtype == np.float32

    # [:] materializes the HDF5 datasets as in-memory numpy arrays.
    base = hdf5["train"][:]
    query = hdf5["test"][:]
    if need_normalize:
        base = normalize(base)
        query = normalize(query)
    elif hdf5.attrs["distance"] == "angular":
        # Angular data without -n is almost certainly a user mistake.
        print(
            "warning: input has angular distance, ",
            "specify -n to normalize base/query set!\n",
        )

    output_fname = fname_prefix + ".base.fbin"
    print("writing", output_fname, "...")
    write_bin(output_fname, base)

    output_fname = fname_prefix + ".query.fbin"
    print("writing", output_fname, "...")
    write_bin(output_fname, query)

    output_fname = fname_prefix + ".groundtruth.neighbors.ibin"
    print("writing", output_fname, "...")
    write_bin(output_fname, hdf5["neighbors"][:])

    output_fname = fname_prefix + ".groundtruth.distances.fbin"
    print("writing", output_fname, "...")
    write_bin(output_fname, hdf5["distances"][:])

"""Split a billion-scale combined groundtruth file into separate
neighbors (.ibin) and distances (.fbin) files via split_groundtruth.pl."""

import argparse
import os
import subprocess
import sys


def split_groundtruth(groundtruth_filepath):
    """Run the bundled split_groundtruth.pl on ``groundtruth_filepath``.

    The perl helper writes its outputs relative to the current working
    directory, so we temporarily chdir into the groundtruth file's
    directory and always restore the caller's cwd afterwards.
    """
    ann_bench_scripts_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "split_groundtruth.pl"
    )
    pwd = os.getcwd()
    # Fix: os.path.split keeps absolute paths absolute; the original
    # normpath(...).split(os.sep) + os.path.join dropped the leading
    # separator, turning "/data/gt" into the relative path "data".
    dirname, groundtruth_filename = os.path.split(
        os.path.normpath(groundtruth_filepath)
    )
    try:
        if dirname:
            os.chdir(dirname)
        subprocess.run(
            [ann_bench_scripts_path, groundtruth_filename, "groundtruth"],
            check=True,
        )
    finally:
        # Fix: restore the working directory even when the helper fails
        # (check=True raises); the original leaked the chdir on error.
        os.chdir(pwd)


def main():
    """CLI entry point: parse --groundtruth and split the file."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--groundtruth",
        help="Path to billion-scale dataset groundtruth file",
        required=True,
    )

    # Show usage instead of an argparse error when called with no args.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    split_groundtruth(args.groundtruth)


if __name__ == "__main__":
    main()

use strict;
use warnings;
use autodie qw(open close);

# Split a combined groundtruth file -- 8-byte (nrows, dim) header followed
# by an int32 neighbors matrix and a float32 distances matrix -- into
# <prefix>.neighbors.ibin and <prefix>.distances.fbin, each carrying a
# copy of the original header.

@ARGV == 2
    or die "usage: $0 input output_prefix\n";

open my $fh_in, '<:raw', $ARGV[0];

my $header;
# Check the byte count: autodie does not cover short reads.
read($fh_in, $header, 8) == 8
    or die "error: cannot read 8-byte header from $ARGV[0]\n";

# 'V' is explicitly little-endian uint32 (the on-disk format); identical
# to the native 'L' on x86_64/aarch64 but portable to big-endian hosts.
my ($nrows, $dim) = unpack 'V V', $header;

my $matrix_bytes  = $nrows * $dim * 4;
my $expected_size = 8 + 2 * $matrix_bytes;
my $size          = (stat $fh_in)[7];
$size == $expected_size
    or die("error: expected size is $expected_size, but actual size is $size\n");

open my $fh_neighbors, '>:raw', "$ARGV[1].neighbors.ibin";
open my $fh_distances, '>:raw', "$ARGV[1].distances.fbin";

# Both outputs start with a copy of the (nrows, dim) header.
print {$fh_neighbors} $header;
print {$fh_distances} $header;

# NOTE: each matrix is slurped whole, like the original -- this needs
# nrows*dim*4 bytes of memory per matrix.
my $buf;
read($fh_in, $buf, $matrix_bytes) == $matrix_bytes
    or die "error: short read of neighbors matrix\n";
print {$fh_neighbors} $buf;

read($fh_in, $buf, $matrix_bytes) == $matrix_bytes
    or die "error: short read of distances matrix\n";
print {$fh_distances} $buf;

# Explicit close on write handles surfaces buffered-write errors
# (autodie-wrapped close dies on failure); the original never closed.
close $fh_neighbors;
close $fh_distances;
close $fh_in;